Add the --cpu option for llama.cpp to prevent CUDA from being used (#3432)
This commit is contained in:
parent
3e70bce576
commit
87dab03dc0
5 changed files with 40 additions and 10 deletions
|
@ -7,13 +7,22 @@ from modules import shared
|
|||
from modules.callbacks import Iteratorize
|
||||
from modules.logging_colors import logger
|
||||
|
||||
import llama_cpp
|
||||
|
||||
# Prefer the CUDA build of llama-cpp-python when an NVIDIA GPU is present.
# ROCm/HIP builds of torch set torch.version.hip, so they take the CPU wheel.
if torch.cuda.is_available() and not torch.version.hip:
    try:
        from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList
        import llama_cpp_cuda
    # Was a bare `except:`, which also swallows SystemExit/KeyboardInterrupt;
    # Exception still covers ImportError plus any loader errors (e.g. missing
    # CUDA shared libraries) while letting process-level signals propagate.
    except Exception:
        from llama_cpp import Llama, LlamaCache, LogitsProcessorList
        llama_cpp_cuda = None
else:
    from llama_cpp import Llama, LlamaCache, LogitsProcessorList
    llama_cpp_cuda = None
|
||||
|
||||
|
||||
def llama_cpp_lib():
    """Pick the llama-cpp-python module to use for this run.

    Returns the plain ``llama_cpp`` (CPU) module when ``--cpu`` was passed
    or when the CUDA build could not be imported; otherwise returns
    ``llama_cpp_cuda``.
    """
    want_cpu = shared.args.cpu or llama_cpp_cuda is None
    return llama_cpp if want_cpu else llama_cpp_cuda
|
||||
|
||||
|
||||
def ban_eos_logits_processor(eos_token, input_ids, logits):
|
||||
|
@ -30,6 +39,10 @@ class LlamaCppModel:
|
|||
|
||||
@classmethod
|
||||
def from_pretrained(self, path):
|
||||
|
||||
Llama = llama_cpp_lib().Llama
|
||||
LlamaCache = llama_cpp_lib().LlamaCache
|
||||
|
||||
result = self()
|
||||
cache_capacity = 0
|
||||
if shared.args.cache_capacity is not None:
|
||||
|
@ -74,6 +87,9 @@ class LlamaCppModel:
|
|||
return self.model.detokenize(tokens)
|
||||
|
||||
def generate(self, prompt, state, callback=None):
|
||||
|
||||
LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
|
||||
|
||||
prompt = prompt if type(prompt) is str else prompt.decode()
|
||||
completion_chunks = self.model.create_completion(
|
||||
prompt=prompt,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue