Add llama-cpp-python wheels with tensor cores support (#5003)

oobabooga 2023-12-19 17:30:53 -03:00 committed by GitHub
parent 0a299d5959
commit de138b8ba6
9 changed files with 69 additions and 21 deletions


@@ -105,6 +105,7 @@ def create_ui():
         shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# only works on Linux.')
     with gr.Column():
+        shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
         shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
         shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
         shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
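
For context, a minimal sketch of the loader-side logic a checkbox like this typically drives: import the tensor-cores build of llama-cpp-python when the flag is set, falling back to the standard CUDA wheel and then the CPU wheel. The module names llama_cpp_cuda_tensorcores and llama_cpp_cuda are assumptions for illustration; this hunk only adds the UI checkbox, not the loader logic.

    # A minimal sketch, not the actual loader from this commit.
    # Assumed wheel module names: llama_cpp_cuda_tensorcores (tensor cores),
    # llama_cpp_cuda (standard CUDA), llama_cpp (CPU).
    import importlib

    def llama_cpp_lib(tensorcores: bool = False, cpu: bool = False):
        """Return the first importable llama-cpp-python variant."""
        if cpu:
            candidates = ['llama_cpp']
        elif tensorcores:
            candidates = ['llama_cpp_cuda_tensorcores', 'llama_cpp_cuda', 'llama_cpp']
        else:
            candidates = ['llama_cpp_cuda', 'llama_cpp']
        for name in candidates:
            try:
                return importlib.import_module(name)
            except ImportError:
                continue
        raise ImportError('No llama-cpp-python build is installed')

Shipping the tensor-cores build as a separately named module would let both wheels coexist in one environment, with the checkbox choosing between them at load time.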