Set use_fast=True by default, create --no_use_fast flag

This increases tokens/second for HF loaders.
2023-11-16 19:45:05 -08:00 · 2023-11-16 19:45:05 -08:00 · 8b66d83aa9
commit 8b66d83aa9
parent b2ce8dc7ee
6 changed files with 20 additions and 19 deletions
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({
        'quant_type',
        'compute_dtype',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
        'use_flash_attention_2',
        'alpha_value',
        'rope_freq_base',
@@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({
        'rope_freq_base',
        'compress_pos_emb',
        'cfg_cache',
-        'use_fast',
+        'no_use_fast',
        'exllama_HF_info',
    ],
    'ExLlamav2_HF': [
@@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({
        'cache_8bit',
        'alpha_value',
        'compress_pos_emb',
-        'use_fast',
+        'no_use_fast',
    ],
    'ExLlama': [
        'gpu_split',
@@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({
        'disk',
        'auto_devices',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
        'autogptq_info',
    ],
    'GPTQ-for-LLaMa': [
@@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({
        'groupsize',
        'model_type',
        'pre_layer',
-        'use_fast',
+        'no_use_fast',
        'gptq_for_llama_info',
    ],
    'llama.cpp': [
@@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({
        'compress_pos_emb',
        'numa',
        'cfg_cache',
-        'use_fast',
+        'no_use_fast',
        'logits_all',
        'llamacpp_HF_info',
    ],
@@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({
        'max_seq_len',
        'no_inject_fused_attention',
        'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
    ]
 })