Add llama-cpp-python wheels with tensor cores support (#5003)
parent 0a299d5959
commit de138b8ba6
9 changed files with 69 additions and 21 deletions
@@ -105,6 +105,7 @@ def create_ui():
                         shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# only works on Linux.')
 
                 with gr.Column():
+                    shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')
                     shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
                     shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                     shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
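
The checkbox above only surfaces a shared.args.tensorcores flag in the UI; the commit's wheel-selection logic lives in the other changed files, which are not shown here. As a minimal sketch of how such a flag could pick between two llama-cpp-python wheel variants at load time (the package name llama_cpp_cuda_tensorcores is an assumption for illustration, not taken from this diff):

import importlib

def load_llama_cpp(tensorcores: bool = False):
    """Return the llama_cpp module, preferring a tensor-cores build if requested."""
    if tensorcores:
        try:
            # Hypothetical package name for the tensor-cores wheel.
            return importlib.import_module('llama_cpp_cuda_tensorcores')
        except ImportError:
            pass  # Wheel not installed; fall back to the standard build below.
    return importlib.import_module('llama_cpp')

llama_cpp = load_llama_cpp(tensorcores=shared.args.tensorcores if 'shared' in dir() else False)

Trying the specialized wheel first and falling back keeps the flag safe to enable on systems where only the standard wheel is installed.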