Add ExLlama support (#2444)

oobabooga 2023-06-16 20:35:38 -03:00 committed by GitHub
parent dea43685b0
commit 9f40032d32
12 changed files with 156 additions and 47 deletions


@@ -77,7 +77,10 @@ def load_model_wrapper(selected_model, loader, autoload=False):
         else:
             yield f"Failed to load {selected_model}."
     except:
-        yield traceback.format_exc()
+        exc = traceback.format_exc()
+        logger.error('Failed to load the model.')
+        print(exc)
+        yield exc


 def load_lora_wrapper(selected_loras):
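
The new error path formats the traceback once and reuses the same text for the log, the console, and the UI, instead of only yielding it. A minimal self-contained sketch of the pattern, using the standard logging module in place of the webui's own logger:

import logging
import traceback

logger = logging.getLogger(__name__)

def load_model_wrapper(selected_model):
    try:
        raise RuntimeError('simulated loader failure')  # stand-in for the real model-loading call
    except Exception:
        exc = traceback.format_exc()               # capture the full traceback once
        logger.error('Failed to load the model.')  # short, searchable log entry
        print(exc)                                 # full detail on the console
        yield exc                                  # same text surfaced in the Gradio UI

# the wrapper is a generator, so the UI consumes it by iterating
for message in load_model_wrapper('my-model'):
    print(message)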
@@ -193,7 +196,7 @@ def create_model_menus():
     with gr.Row():
         with gr.Column():
-            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp"], value=None)
+            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "llama.cpp"], value=None)

             with gr.Box():
                 with gr.Row():
                     with gr.Column():
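
The dropdown only records the loader's name; elsewhere in the commit (in files not shown in this excerpt) that name is mapped to a backend-specific load function. A hypothetical sketch of that kind of name-to-loader dispatch; the function names below are illustrative, not the commit's actual API:

from typing import Callable, Dict

def load_transformers(model_name: str) -> str:
    return f'{model_name} loaded with Transformers'

def load_exllama(model_name: str) -> str:
    return f'{model_name} loaded with ExLlama'

# one entry per choice in the "Model loader" dropdown
LOADERS: Dict[str, Callable[[str], str]] = {
    'Transformers': load_transformers,
    'ExLlama': load_exllama,
}

def load_model(model_name: str, loader: str) -> str:
    try:
        return LOADERS[loader](model_name)
    except KeyError:
        raise ValueError(f'Unknown model loader: {loader}')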
@@ -213,6 +216,7 @@ def create_model_menus():
                         shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
                         shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                         shared.gradio['autogptq_info'] = gr.Markdown('On some systems, AutoGPTQ can be 2x slower than GPTQ-for-LLaMa. You can manually select the GPTQ-for-LLaMa loader above.')
+                        shared.gradio['exllama_info'] = gr.Markdown('ExLlama has to be installed manually. See the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama)')

                     with gr.Column():
                         shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
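
Loader-specific widgets such as exllama_info only make sense for one dropdown choice, so a UI like this typically toggles their visibility when the selection changes. A hedged Gradio sketch of that wiring (the event handler below is an assumption for illustration, not code from this commit):

import gradio as gr

with gr.Blocks() as demo:
    loader = gr.Dropdown(label='Model loader',
                         choices=['Transformers', 'AutoGPTQ', 'GPTQ-for-LLaMa', 'ExLlama', 'llama.cpp'],
                         value='Transformers')
    exllama_info = gr.Markdown('ExLlama has to be installed manually.', visible=False)

    def toggle_exllama_note(choice):
        # only show the manual-install note when ExLlama is selected
        return gr.update(visible=(choice == 'ExLlama'))

    loader.change(toggle_exllama_note, loader, exllama_info)

demo.launch()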