Add the --cpu option for llama.cpp to prevent CUDA from being used (#3432)

This commit is contained in:
oobabooga 2023-08-03 11:00:36 -03:00 committed by GitHub
parent 3e70bce576
commit 87dab03dc0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 10 deletions

View file

@@ -7,13 +7,22 @@ from modules import shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger
import llama_cpp
if torch.cuda.is_available() and not torch.version.hip:
try:
from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList
import llama_cpp_cuda
except:
from llama_cpp import Llama, LlamaCache, LogitsProcessorList
llama_cpp_cuda = None
else:
from llama_cpp import Llama, LlamaCache, LogitsProcessorList
llama_cpp_cuda = None
def llama_cpp_lib():
if shared.args.cpu or llama_cpp_cuda is None:
return llama_cpp
else:
return llama_cpp_cuda
def ban_eos_logits_processor(eos_token, input_ids, logits):
@@ -30,6 +39,10 @@ class LlamaCppModel:
@classmethod
def from_pretrained(self, path):
Llama = llama_cpp_lib().Llama
LlamaCache = llama_cpp_lib().LlamaCache
result = self()
cache_capacity = 0
if shared.args.cache_capacity is not None:
@@ -74,6 +87,9 @@ class LlamaCppModel:
return self.model.detokenize(tokens)
def generate(self, prompt, state, callback=None):
LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
prompt = prompt if type(prompt) is str else prompt.decode()
completion_chunks = self.model.create_completion(
prompt=prompt,