Add the --cpu option for llama.cpp to prevent CUDA from being used (#3432)

This commit is contained in:
oobabooga 2023-08-03 11:00:36 -03:00 committed by GitHub
parent 3e70bce576
commit 87dab03dc0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 10 deletions

View file

@@ -7,13 +7,22 @@ from modules import shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger
import llama_cpp
if torch.cuda.is_available() and not torch.version.hip:
try:
from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList
import llama_cpp_cuda
except:
from llama_cpp import Llama, LlamaCache, LogitsProcessorList
llama_cpp_cuda = None
else:
from llama_cpp import Llama, LlamaCache, LogitsProcessorList
llama_cpp_cuda = None
def llama_cpp_lib():
if shared.args.cpu or llama_cpp_cuda is None:
return llama_cpp
else:
return llama_cpp_cuda
def ban_eos_logits_processor(eos_token, input_ids, logits):
@@ -30,6 +39,10 @@ class LlamaCppModel:
@classmethod
def from_pretrained(self, path):
Llama = llama_cpp_lib().Llama
LlamaCache = llama_cpp_lib().LlamaCache
result = self()
cache_capacity = 0
if shared.args.cache_capacity is not None:
@@ -74,6 +87,9 @@ class LlamaCppModel:
return self.model.detokenize(tokens)
def generate(self, prompt, state, callback=None):
LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
prompt = prompt if type(prompt) is str else prompt.decode()
completion_chunks = self.model.create_completion(
prompt=prompt,