From e657dd342d711fdab7689761533829bb30018bad Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 15 May 2023 19:19:55 -0400 Subject: [PATCH] Add in-memory cache support for llama.cpp (#1936) --- README.md | 1 + modules/llamacpp_model.py | 47 ++++++++++++++++++++++++++------------- modules/shared.py | 1 + 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 6027333..241f494 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,7 @@ Optionally, you can use the following command-line flags: | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | | `--no-mmap` | Prevent mmap from being used. | | `--mlock` | Force the system to keep the model in RAM. | +| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. 
| #### GPTQ diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 65577ee..0ed3354 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -6,6 +6,9 @@ Documentation: https://abetlen.github.io/llama-cpp-python/ ''' +import logging +import re + from llama_cpp import Llama, LlamaCache from modules import shared @@ -23,6 +26,17 @@ class LlamaCppModel: def from_pretrained(self, path): result = self() + cache_capacity = 0 + if shared.args.cache_capacity is not None: + if 'GiB' in shared.args.cache_capacity: + cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1024 * 1024 * 1024 + elif 'MiB' in shared.args.cache_capacity: + cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1024 * 1024 + else: + cache_capacity = int(shared.args.cache_capacity) + + logging.info("Cache capacity is " + str(cache_capacity) + " bytes") + params = { 'model_path': str(path), 'n_ctx': 2048, @@ -34,7 +48,8 @@ class LlamaCppModel: 'n_gpu_layers': shared.args.n_gpu_layers } self.model = Llama(**params) - self.model.set_cache(LlamaCache) + if cache_capacity > 0: + self.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) # This is ugly, but the model and the tokenizer are the same object in this library. 
return result, result @@ -45,23 +60,23 @@ class LlamaCppModel: return self.model.tokenize(string) def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None): - if type(context) is str: - context = context.encode() - tokens = self.model.tokenize(context) - - output = b"" - count = 0 - for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty): - text = self.model.detokenize([token]) + context = context if type(context) is str else context.decode() + completion_chunks = self.model.create_completion( + prompt=context, + max_tokens=token_count, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repetition_penalty, + stream=True + ) + output = "" + for completion_chunk in completion_chunks: + text = completion_chunk['choices'][0]['text'] output += text if callback: - callback(text.decode()) - - count += 1 - if count >= token_count or (token == self.model.token_eos()): - break - - return output.decode() + callback(text) + return output def generate_with_streaming(self, **kwargs): with Iteratorize(self.generate, kwargs, callback=None) as generator: diff --git a/modules/shared.py b/modules/shared.py index 07a3c89..ce12875 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -123,6 +123,7 @@ parser.add_argument('--threads', type=int, default=0, help='Number of threads to parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.') parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') +parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity. Examples: 2000MiB, 2GiB. 
When provided without units, bytes will be assumed.') parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') # GPTQ