Add llama-cpp-python wheels with tensor cores support (#5003)
parent 0a299d5959
commit de138b8ba6
9 changed files with 69 additions and 21 deletions
modules/llamacpp_hf.py

@@ -20,12 +20,21 @@ try:
 except:
     llama_cpp_cuda = None
 
+try:
+    import llama_cpp_cuda_tensorcores
+except:
+    llama_cpp_cuda_tensorcores = None
+
 
 def llama_cpp_lib():
-    if (shared.args.cpu and llama_cpp is not None) or llama_cpp_cuda is None:
+    if shared.args.cpu and llama_cpp is not None:
         return llama_cpp
-    else:
+    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
+        return llama_cpp_cuda_tensorcores
+    elif llama_cpp_cuda is not None:
         return llama_cpp_cuda
+    else:
+        return llama_cpp
 
 
 class LlamacppHF(PreTrainedModel):
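The hunk above is the core of the commit: a third optional backend is imported behind the same guard pattern, and llama_cpp_lib() now picks the best available build. As a rough standalone sketch of that fallback-import pattern (the load_optional helper and the keyword arguments are illustrative additions, not part of the commit; the real function reads shared.args instead):

import importlib

def load_optional(name):
    # Return the module if it can be imported, else None -- the same
    # effect as the try/except import guards in the diff above.
    try:
        return importlib.import_module(name)
    except ImportError:
        return None

llama_cpp = load_optional('llama_cpp')
llama_cpp_cuda = load_optional('llama_cpp_cuda')
llama_cpp_cuda_tensorcores = load_optional('llama_cpp_cuda_tensorcores')

def llama_cpp_lib(cpu=False, tensorcores=False):
    # Preference order: forced CPU build, tensor-cores CUDA build,
    # plain CUDA build, then whatever remains.
    if cpu and llama_cpp is not None:
        return llama_cpp
    if tensorcores and llama_cpp_cuda_tensorcores is not None:
        return llama_cpp_cuda_tensorcores
    if llama_cpp_cuda is not None:
        return llama_cpp_cuda
    return llama_cpp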
modules/llamacpp_model.py

@@ -19,12 +19,21 @@ try:
 except:
     llama_cpp_cuda = None
 
+try:
+    import llama_cpp_cuda_tensorcores
+except:
+    llama_cpp_cuda_tensorcores = None
+
 
 def llama_cpp_lib():
-    if (shared.args.cpu and llama_cpp is not None) or llama_cpp_cuda is None:
+    if shared.args.cpu and llama_cpp is not None:
         return llama_cpp
-    else:
+    elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None:
+        return llama_cpp_cuda_tensorcores
+    elif llama_cpp_cuda is not None:
         return llama_cpp_cuda
+    else:
+        return llama_cpp
 
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
modules/loaders.py

@@ -43,7 +43,8 @@ loaders_and_params = OrderedDict({
         'compress_pos_emb',
         'cpu',
         'numa',
-        'no_offload_kqv'
+        'no_offload_kqv',
+        'tensorcores',
     ],
     'llamacpp_HF': [
         'n_ctx',

@@ -65,6 +66,7 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
         'logits_all',
         'no_offload_kqv',
+        'tensorcores',
         'llamacpp_HF_info',
     ],
     'ExLlamav2_HF': [
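These two lists control which options are shown for the llama.cpp and llamacpp_HF loaders, so registering 'tensorcores' here is what surfaces the new setting. A trimmed sketch of that registration pattern (the visible_params helper is hypothetical, and the real mapping holds many more loaders and entries):

from collections import OrderedDict

# Trimmed stand-in for the real loaders_and_params mapping.
loaders_and_params = OrderedDict({
    'llama.cpp': ['n_ctx', 'cpu', 'no_offload_kqv', 'tensorcores'],
    'llamacpp_HF': ['n_ctx', 'logits_all', 'no_offload_kqv', 'tensorcores'],
})

def visible_params(loader):
    # Hypothetical helper: which options the UI would show for a loader.
    return loaders_and_params.get(loader, [])

print(visible_params('llama.cpp'))  # list ends with 'tensorcores'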
modules/shared.py

@@ -106,6 +106,7 @@ parser.add_argument('--compute_dtype', type=str, default='float16', help='comput
 parser.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')
 
 # llama.cpp
+parser.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
 parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
 parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
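Because the flag is declared with action='store_true', it defaults to False and flips to True only when passed on the command line. A minimal standalone sketch of that behavior (the parser and prog name here are illustrative, not the real shared.py parser):

import argparse

parser = argparse.ArgumentParser(prog='server.py')
parser.add_argument('--tensorcores', action='store_true',
                    help='Use the tensor-cores llama-cpp-python build.')

assert parser.parse_args(['--tensorcores']).tensorcores is True   # flag present
assert parser.parse_args([]).tensorcores is False                 # flag absent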
modules/ui.py

@@ -92,6 +92,7 @@ def list_model_elements():
         'numa',
         'logits_all',
         'no_offload_kqv',
+        'tensorcores',
         'hqq_backend',
     ]
     if is_torch_xpu_available():
modules/ui_model_menu.py

@@ -105,6 +105,7 @@ def create_ui():
             shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# only works on Linux.')
 
         with gr.Column():
+            shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
             shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
             shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
             shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
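The checkbox mirrors the CLI flag in the web UI: its initial value comes from shared.args.tensorcores, so launching with --tensorcores pre-ticks it. A minimal Gradio sketch of that binding (the Blocks layout and the default variable are illustrative; the real UI stores the component in shared.gradio and wires it into the model loader):

import gradio as gr

tensorcores_default = False  # stands in for shared.args.tensorcores

with gr.Blocks() as demo:
    tensorcores = gr.Checkbox(
        label="tensorcores",
        value=tensorcores_default,
        info='Use llama-cpp-python compiled with tensor cores support.',
    )

# demo.launch()  # uncomment to render the checkbox in a browser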