Reorganize model loading UI completely (#2720)

Author: oobabooga, 2023-06-16 19:00:37 -03:00 (committed via GitHub)
Parent: 57be2eecdf
Commit: 7ef6a50e84
16 changed files with 365 additions and 243 deletions

View file

@ -77,8 +77,7 @@ def add_lora_to_model(lora_names):
elif shared.args.load_in_8bit:
params['device_map'] = {'': 0}
shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"),adapter_name=lora_names[0], **params)
shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params)
for lora in lora_names[1:]:
shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
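The two adjacent from_pretrained lines above are the removed and added versions of the same call (a space added before adapter_name); the surrounding logic loads the first LoRA via PeftModel.from_pretrained and attaches any further ones with load_adapter. A minimal sketch of that pattern, assuming peft is installed and a transformers model is already loaded (function name, directory and params are illustrative):

from pathlib import Path
from peft import PeftModel

def attach_loras(model, lora_dir, lora_names, **params):
    # The first adapter wraps the base model; the rest are registered by name.
    model = PeftModel.from_pretrained(model, Path(f"{lora_dir}/{lora_names[0]}"),
                                      adapter_name=lora_names[0], **params)
    for lora in lora_names[1:]:
        model.load_adapter(Path(f"{lora_dir}/{lora}"), lora)
    return model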

View file

@ -88,8 +88,8 @@ class RWKVModel:
out, state = self.model.forward(tokens[:args.chunk_len], state)
tokens = tokens[args.chunk_len:]
if i == 0:
begin_token= len(all_tokens)
last_token_posi=begin_token
begin_token = len(all_tokens)
last_token_posi = begin_token
# cache the model state after scanning the context
# we don't cache the state after processing our own generated tokens because
# the output string might be post-processed arbitrarily. Therefore, what's fed into the model
@ -122,7 +122,7 @@ class RWKVModel:
if '\ufffd' not in tmp: # is valid utf-8 string?
if callback:
callback(tmp)
out_str += tmp
last_token_posi = begin_token + i + 1
return out_str
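For context, the chunked scan in the first hunk feeds the prompt to the model chunk_len tokens at a time and keeps the returned state, so only newly generated tokens need to be processed later. A minimal sketch of that loop, assuming the rwkv package's forward(tokens, state) API; the helper name and default chunk_len are illustrative:

def scan_context(model, tokens, chunk_len=256, state=None):
    # Feed the prompt in chunks, carrying the recurrent state forward.
    out = None
    while len(tokens) > 0:
        out, state = model.forward(tokens[:chunk_len], state)
        tokens = tokens[chunk_len:]
    # The (out, state) pair after the prompt can be cached and reused,
    # as the comments in the hunk above describe.
    return out, state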

View file

@ -8,8 +8,9 @@ from tqdm import tqdm
from modules import shared
from modules.models import load_model, unload_model
from modules.models_settings import (get_model_settings_from_yamls,
update_model_parameters)
from modules.text_generation import encode
from server import get_model_specific_settings, update_model_parameters
def load_past_evaluations():
@ -66,7 +67,7 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
if model != 'current model':
try:
yield cumulative_log + f"Loading {model}...\n\n"
model_settings = get_model_specific_settings(model)
model_settings = get_model_settings_from_yamls(model)
shared.settings.update(model_settings) # hijacking the interface defaults
update_model_parameters(model_settings) # hijacking the command-line arguments
shared.model_name = model
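This hunk swaps the old server-side helper for modules.models_settings.get_model_settings_from_yamls when switching models inside the perplexity loop. A hedged sketch of that switching sequence, assuming the webui runtime; the unload/reload step at the end is an assumption, and error handling and progress messages are omitted:

from modules import shared
from modules.models import load_model, unload_model
from modules.models_settings import (get_model_settings_from_yamls,
                                     update_model_parameters)

def switch_to(model):
    model_settings = get_model_settings_from_yamls(model)
    shared.settings.update(model_settings)   # hijack the interface defaults
    update_model_parameters(model_settings)  # hijack the command-line arguments
    shared.model_name = model
    unload_model()                                                  # assumed reload step
    shared.model, shared.tokenizer = load_model(shared.model_name)  # assumed reload step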

View file

@ -1,6 +1,7 @@
import os
import subprocess
def clone_or_pull_repository(github_url):
repository_folder = "extensions"
repo_name = github_url.split("/")[-1].split(".")[0]
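Only the signature and first lines of clone_or_pull_repository are visible in this hunk; its body is not shown. A hedged sketch of what a helper like this typically does (clone if the folder is missing, otherwise pull), purely illustrative:

import subprocess
from pathlib import Path

def clone_or_pull_repository(github_url, repository_folder="extensions"):
    repo_name = github_url.split("/")[-1].split(".")[0]
    repo_path = Path(repository_folder) / repo_name
    if repo_path.exists():
        # Update an existing checkout in place.
        cmd = ["git", "-C", str(repo_path), "pull", "--ff-only"]
    else:
        # Fresh clone into extensions/<repo_name>.
        cmd = ["git", "clone", github_url, str(repo_path)]
    return subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()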

modules/loaders.py (new file, 86 additions)
View file

@ -0,0 +1,86 @@
import functools
import gradio as gr
from modules import shared
loaders_and_params = {
'AutoGPTQ': [
'triton',
'no_inject_fused_attention',
'no_inject_fused_mlp',
'wbits',
'groupsize',
'desc_act',
'gpu_memory',
'cpu_memory',
'cpu',
'disk',
'auto_devices',
'trust_remote_code',
'autogptq_info',
],
'GPTQ-for-LLaMa': [
'wbits',
'groupsize',
'model_type',
'pre_layer',
'gptq_for_llama_info',
],
'llama.cpp': [
'n_ctx',
'n_gpu_layers',
'n_batch',
'threads',
'no_mmap',
'mlock',
'llama_cpp_seed',
],
'Transformers': [
'cpu_memory',
'gpu_memory',
'trust_remote_code',
'load_in_8bit',
'bf16',
'cpu',
'disk',
'auto_devices',
'load_in_4bit',
'use_double_quant',
'quant_type',
'compute_dtype',
'trust_remote_code',
],
}
def get_gpu_memory_keys():
return [k for k in shared.gradio if k.startswith('gpu_memory')]
@functools.cache
def get_all_params():
all_params = set()
for k in loaders_and_params:
for el in loaders_and_params[k]:
all_params.add(el)
if 'gpu_memory' in all_params:
all_params.remove('gpu_memory')
for k in get_gpu_memory_keys():
all_params.add(k)
return sorted(all_params)
def make_loader_params_visible(loader):
params = []
all_params = get_all_params()
if loader in loaders_and_params:
params = loaders_and_params[loader]
if 'gpu_memory' in params:
params.remove('gpu_memory')
params += get_gpu_memory_keys()
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]
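loaders.py is the core of the reorganized loading UI: loaders_and_params maps each loader to the parameters it exposes, and make_loader_params_visible() returns one gr.update() per known parameter. A hedged sketch of how it is meant to be wired to the loader dropdown (component names follow ui.list_model_elements(); the exact wiring in server.py may differ):

from modules import loaders, shared

# When the loader selection changes, show only that loader's parameters.
shared.gradio['loader'].change(
    loaders.make_loader_params_visible,                    # one gr.update(visible=...) per param
    shared.gradio['loader'],                               # input: the selected loader name
    [shared.gradio[k] for k in loaders.get_all_params()])  # outputs, in get_all_params() order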

View file

@ -14,6 +14,7 @@ from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM,
import modules.shared as shared
from modules import llama_attn_hijack, sampler_hijack
from modules.logging_colors import logger
from modules.models_settings import infer_loader
transformers.logging.set_verbosity_error()
@ -36,62 +37,31 @@ if shared.args.deepspeed:
sampler_hijack.hijack_samplers()
# Some models require special treatment in various parts of the code.
# This function detects those models
def find_model_type(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
if not path_to_model.exists():
return 'None'
model_name_lower = model_name.lower()
if re.match('.*rwkv.*\.pth', model_name_lower):
return 'rwkv'
elif len(list(path_to_model.glob('*ggml*.bin'))) > 0:
return 'llamacpp'
elif re.match('.*ggml.*\.bin', model_name_lower):
return 'llamacpp'
elif 'chatglm' in model_name_lower:
return 'chatglm'
elif 'galactica' in model_name_lower:
return 'galactica'
elif 'llava' in model_name_lower:
return 'llava'
elif 'oasst' in model_name_lower:
return 'oasst'
elif any((k in model_name_lower for k in ['gpt4chan', 'gpt-4chan'])):
return 'gpt4chan'
else:
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
# Not a "catch all", but fairly accurate
if config.to_dict().get("is_encoder_decoder", False):
return 'HF_seq2seq'
else:
return 'HF_generic'
def load_model(model_name):
def load_model(model_name, loader=None):
logger.info(f"Loading {model_name}...")
t0 = time.time()
shared.model_type = find_model_type(model_name)
if shared.model_type == 'None':
logger.error('The path to the model does not exist. Exiting.')
return None, None
shared.is_seq2seq = False
load_func_map = {
'Transformers': huggingface_loader,
'AutoGPTQ': AutoGPTQ_loader,
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'FlexGen': flexgen_loader,
'RWKV': RWKV_loader
}
if shared.args.gptq_for_llama:
load_func = GPTQ_loader
elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or shared.args.wbits > 0:
load_func = AutoGPTQ_loader
elif shared.model_type == 'llamacpp':
load_func = llamacpp_loader
elif shared.model_type == 'rwkv':
load_func = RWKV_loader
elif shared.args.flexgen:
load_func = flexgen_loader
else:
load_func = huggingface_loader
if loader is None:
if shared.args.loader is not None:
loader = shared.args.loader
else:
loader = infer_loader(model_name)
if loader is None:
logger.error('The path to the model does not exist. Exiting.')
return None, None
output = load_func(model_name)
shared.args.loader = loader
output = load_func_map[loader](model_name)
if type(output) is tuple:
model, tokenizer = output
else:
@ -111,11 +81,11 @@ def load_model(model_name):
def load_tokenizer(model_name, model):
tokenizer = None
if shared.model_type == 'gpt4chan' and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
elif type(model) is transformers.LlamaForCausalLM or "LlamaGPTQForCausalLM" in str(type(model)):
# Try to load an universal LLaMA tokenizer
if shared.model_type not in ['llava', 'oasst']:
if any(s in shared.model_name.lower() for s in ['llava', 'oasst']):
for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]:
if p.exists():
logger.info(f"Loading the universal LLaMA tokenizer from {p}...")
@ -140,12 +110,16 @@ def load_tokenizer(model_name, model):
def huggingface_loader(model_name):
if shared.model_type == 'chatglm':
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
elif shared.model_type == 'HF_seq2seq':
LoaderClass = AutoModelForSeq2SeqLM
else:
LoaderClass = AutoModelForCausalLM
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if config.to_dict().get("is_encoder_decoder", False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Load the model in simple 16-bit mode by default
if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
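With the hunks above, load_model() takes an optional loader argument and dispatches through load_func_map instead of the removed find_model_type(). A hedged usage sketch (the model folder name is illustrative):

import modules.shared as shared
from modules.models import load_model

# Force a specific backend...
shared.model, shared.tokenizer = load_model('wizard-13b-ggml', loader='llama.cpp')
# ...or let --loader / infer_loader() decide.
shared.model, shared.tokenizer = load_model('wizard-13b-ggml')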

modules/models_settings.py (new file, 134 additions)
View file

@ -0,0 +1,134 @@
import re
from pathlib import Path
import yaml
from modules import shared, ui
def get_model_settings_from_yamls(model):
settings = shared.model_config
model_settings = {}
for pat in settings:
if re.match(pat.lower(), model.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
return model_settings
def infer_loader(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
model_settings = get_model_settings_from_yamls(model_name)
if not path_to_model.exists():
loader = None
elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
loader = 'AutoGPTQ'
elif len(list(path_to_model.glob('*ggml*.bin'))) > 0:
loader = 'llama.cpp'
elif re.match('.*ggml.*\.bin', model_name.lower()):
loader = 'llama.cpp'
elif re.match('.*rwkv.*\.pth', model_name.lower()):
loader = 'RWKV'
elif shared.args.flexgen:
loader = 'FlexGen'
else:
loader = 'Transformers'
return loader
# UI: update the command-line arguments based on the interface values
def update_model_parameters(state, initial=False):
elements = ui.list_model_elements() # the names of the parameters
gpu_memories = []
for i, element in enumerate(elements):
if element not in state:
continue
value = state[element]
if element.startswith('gpu_memory'):
gpu_memories.append(value)
continue
if initial and vars(shared.args)[element] != vars(shared.args_defaults)[element]:
continue
# Setting null defaults
if element in ['wbits', 'groupsize', 'model_type'] and value == 'None':
value = vars(shared.args_defaults)[element]
elif element in ['cpu_memory'] and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element in ['wbits', 'groupsize', 'pre_layer']:
value = int(value)
elif element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
if element in ['pre_layer']:
value = [value] if value > 0 else None
setattr(shared.args, element, value)
found_positive = False
for i in gpu_memories:
if i > 0:
found_positive = True
break
if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
if found_positive:
shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
else:
shared.args.gpu_memory = None
# UI: update the state variable with the model settings
def apply_model_settings_to_state(model, state):
model_settings = get_model_settings_from_yamls(model)
if 'loader' not in model_settings:
loader = infer_loader(model)
if 'wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0:
loader = 'AutoGPTQ'
# If the user is using an alternative GPTQ loader, let them keep using it
if not (loader == 'AutoGPTQ' and state['loader'] in ['GPTQ-for-LLaMa', 'exllama']):
state['loader'] = loader
for k in model_settings:
if k in state:
state[k] = model_settings[k]
return state
# Save the settings for this model to models/config-user.yaml
def save_model_settings(model, state):
if model == 'None':
yield ("Not saving the settings because no model is loaded.")
return
with Path(f'{shared.args.model_dir}/config-user.yaml') as p:
if p.exists():
user_config = yaml.safe_load(open(p, 'r').read())
else:
user_config = {}
model_regex = model + '$' # For exact matches
for _dict in [user_config, shared.model_config]:
if model_regex not in _dict:
_dict[model_regex] = {}
if model_regex not in user_config:
user_config[model_regex] = {}
for k in ui.list_model_elements():
user_config[model_regex][k] = state[k]
shared.model_config[model_regex][k] = state[k]
with open(p, 'w') as f:
f.write(yaml.dump(user_config, sort_keys=False))
yield (f"Settings for {model} saved to {p}")

View file

@ -52,4 +52,3 @@ def load_preset_for_ui(name, state):
def generate_preset_yaml(state):
data = {k: state[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']}
return yaml.dump(data, sort_keys=False)

View file

@ -10,7 +10,6 @@ generation_lock = None
model = None
tokenizer = None
model_name = "None"
model_type = None
lora_names = []
# Chat variables
@ -97,6 +96,9 @@ parser.add_argument('--settings', type=str, help='Load the default interface set
parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
# Model loader
parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: autogptq, gptq-for-llama, transformers, llamacpp, rwkv, flexgen')
# Accelerate/transformers
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
@ -139,7 +141,7 @@ parser.add_argument('--warmup_autotune', action='store_true', help='(triton) Ena
parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fused mlp.')
# AutoGPTQ
parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
parser.add_argument('--gptq-for-llama', action='store_true', help='DEPRECATED')
parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
parser.add_argument('--triton', action='store_true', help='Use triton.')
parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
@ -147,7 +149,7 @@ parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton m
parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
# FlexGen
parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.')
parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).")
@ -184,7 +186,14 @@ args_defaults = parser.parse_args([])
# Deprecation warnings
if args.autogptq:
logger.warning('--autogptq has been deprecated and will be removed soon. AutoGPTQ is now used by default for GPTQ models.')
logger.warning('--autogptq has been deprecated and will be removed soon. Use --loader autogptq instead.')
args.loader = 'autogptq'
if args.gptq_for_llama:
logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.')
args.loader = 'gptq-for-llama'
if args.flexgen:
logger.warning('--flexgen has been deprecated and will be removed soon. Use --loader flexgen instead.')
args.loader = 'FlexGen'
# Security warnings
if args.trust_remote_code:
@ -193,6 +202,22 @@ if args.share:
logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.")
def fix_loader_name(name):
name = name.lower()
if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']:
return 'llama.cpp'
elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
return 'Transformers'
elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']:
return 'AutoGPTQ'
elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
return 'GPTQ-for-LLaMa'
if args.loader is not None:
args.loader = fix_loader_name(args.loader)
def add_extension(name):
if args.extensions is None:
args.extensions = [name]
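fix_loader_name() above normalizes the many spellings users may pass to --loader into the canonical names used by load_func_map. A quick illustrative check, using only aliases listed in the code (unlisted names fall through and return None):

from modules.shared import fix_loader_name

assert fix_loader_name('llamacpp') == 'llama.cpp'
assert fix_loader_name('HF') == 'Transformers'
assert fix_loader_name('auto_gptq') == 'AutoGPTQ'
assert fix_loader_name('gptq for llama') == 'GPTQ-for-LLaMa'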

View file

@ -31,7 +31,7 @@ def get_max_prompt_length(state):
def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
if shared.model_type in ['rwkv', 'llamacpp']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
input_ids = shared.tokenizer.encode(str(prompt))
input_ids = np.array(input_ids).reshape(1, len(input_ids))
return input_ids
@ -51,7 +51,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel'] or shared.args.cpu:
return input_ids
elif shared.args.flexgen:
return input_ids.numpy()
@ -99,7 +99,7 @@ def fix_galactica(s):
def get_reply_from_output_ids(output_ids, input_ids, original_question, state, is_chat=False):
if shared.model_type == 'HF_seq2seq':
if shared.is_seq2seq:
reply = decode(output_ids, state['skip_special_tokens'])
else:
new_tokens = len(output_ids) - len(input_ids[0])
@ -117,7 +117,7 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i
def formatted_outputs(reply, model_name):
if shared.model_type == 'gpt4chan':
if any(s in model_name for s in ['gpt-4chan', 'gpt4chan']):
reply = fix_gpt4chan(reply)
return reply, generate_4chan_html(reply)
else:
@ -142,7 +142,7 @@ def stop_everything_event():
def generate_reply_wrapper(question, state, eos_token=None, stopping_strings=None):
for reply in generate_reply(question, state, eos_token, stopping_strings, is_chat=False):
if shared.model_type not in ['HF_seq2seq']:
if not shared.is_seq2seq:
reply = question + reply
yield formatted_outputs(reply, shared.model_name)
@ -157,7 +157,7 @@ def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_c
yield ''
return
if shared.model_type in ['rwkv', 'llamacpp']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
generate_func = generate_reply_custom
elif shared.args.flexgen:
generate_func = generate_reply_flexgen
@ -240,7 +240,7 @@ def generate_reply_HF(question, original_question, seed, state, eos_token=None,
t0 = time.time()
try:
if not is_chat and shared.model_type != 'HF_seq2seq':
if not is_chat and not shared.is_seq2seq:
yield ''
# Generate the entire reply at once.
@ -276,7 +276,7 @@ def generate_reply_HF(question, original_question, seed, state, eos_token=None,
finally:
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if shared.model_type != 'HF_seq2seq' else 0)
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
@ -287,7 +287,7 @@ def generate_reply_custom(question, original_question, seed, state, eos_token=No
for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
generate_params[k] = state[k]
if shared.model_type == 'llamacpp':
if shared.model.__class__.__name__ in ['LlamaCppModel']:
for k in ['mirostat_mode', 'mirostat_tau', 'mirostat_eta']:
generate_params[k] = state[k]
@ -381,6 +381,6 @@ def generate_reply_flexgen(question, original_question, seed, state, eos_token=N
finally:
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if shared.model_type != 'HF_seq2seq' else 0)
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
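Throughout this file, checks on the removed shared.model_type are replaced by the model's class name or the new shared.is_seq2seq flag. A minimal sketch of the resulting dispatch rule (function names as in the hunks above):

def pick_generate_func(model, args):
    # llama.cpp and RWKV models use the custom loop; FlexGen has its own path;
    # everything else goes through the Hugging Face generate() wrapper.
    if model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
        return generate_reply_custom
    elif args.flexgen:
        return generate_reply_flexgen
    return generate_reply_HF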

View file

@ -30,7 +30,7 @@ theme = gr.themes.Default(
def list_model_elements():
elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
elements = ['loader', 'cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
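The per-device names appended here are presumably what loaders.get_gpu_memory_keys() later finds in shared.gradio when toggling visibility. A small illustrative check (a two-GPU machine is assumed for the example output):

from modules.ui import list_model_elements

elements = list_model_elements()
print(elements[-2:])  # e.g. ['gpu_memory_0', 'gpu_memory_1'] on a two-GPU machine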