Remove GGML support
parent cc7b7ba153
commit ed86878f02
15 changed files with 24 additions and 123 deletions
@@ -1,7 +1,5 @@
 import re
 from functools import partial
-from pathlib import Path
-from typing import Union
 
 import torch
 
@@ -9,39 +7,23 @@ from modules import RoPE, shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
-from modules.utils import is_gguf
 
 import llama_cpp
 
-try:
-    import llama_cpp_ggml
-except:
-    llama_cpp_ggml = llama_cpp
-
 if torch.cuda.is_available() and not torch.version.hip:
     try:
         import llama_cpp_cuda
     except:
         llama_cpp_cuda = None
-    try:
-        import llama_cpp_ggml_cuda
-    except:
-        llama_cpp_ggml_cuda = llama_cpp_cuda
 else:
     llama_cpp_cuda = None
-    llama_cpp_ggml_cuda = None
 
 
-def llama_cpp_lib(model_file: Union[str, Path] = None):
-    if model_file is not None:
-        gguf_model = is_gguf(model_file)
-    else:
-        gguf_model = True
-
+def llama_cpp_lib():
     if shared.args.cpu or llama_cpp_cuda is None:
-        return llama_cpp if gguf_model else llama_cpp_ggml
+        return llama_cpp
     else:
-        return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda
+        return llama_cpp_cuda
 
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
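The net effect of the hunk above is that llama_cpp_lib() no longer inspects the model file at all. A minimal standalone sketch of the backend selection that remains, assuming only torch and the llama-cpp-python wheels are installed; the real module reads shared.args.cpu from the web UI's argument parser, replaced here by a plain cpu_only flag so the snippet stands alone:

import torch

import llama_cpp  # GGUF-only wheel; the llama_cpp_ggml fallbacks are gone

# Optional CUDA build, mirroring the retained import block above
if torch.cuda.is_available() and not torch.version.hip:
    try:
        import llama_cpp_cuda
    except ImportError:
        llama_cpp_cuda = None
else:
    llama_cpp_cuda = None


def llama_cpp_lib(cpu_only: bool = False):
    # Pick the CPU or CUDA backend; there is no GGML branch anymore
    if cpu_only or llama_cpp_cuda is None:
        return llama_cpp
    return llama_cpp_cuda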
@@ -59,8 +41,8 @@ class LlamaCppModel:
     @classmethod
     def from_pretrained(self, path):
 
-        Llama = llama_cpp_lib(path).Llama
-        LlamaCache = llama_cpp_lib(path).LlamaCache
+        Llama = llama_cpp_lib().Llama
+        LlamaCache = llama_cpp_lib().LlamaCache
 
         result = self()
         cache_capacity = 0
@@ -95,13 +77,6 @@ class LlamaCppModel:
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
         }
 
-        if not is_gguf(path):
-            ggml_params = {
-                'n_gqa': shared.args.n_gqa or None,
-                'rms_norm_eps': shared.args.rms_norm_eps or None,
-            }
-            params = params | ggml_params
-
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
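For reference, a hypothetical usage sketch of the simplified loading path in the last two hunks: with the GGML branch removed, n_gqa and rms_norm_eps are never merged into params, and a GGUF file goes straight to Llama(**params). The file name, n_ctx, and cache size below are illustrative placeholders rather than values from the repo, and llama_cpp_lib() is the sketch shown earlier:

lib = llama_cpp_lib()
Llama = lib.Llama
LlamaCache = lib.LlamaCache

params = {
    'model_path': 'models/llama-2-7b.Q4_K_M.gguf',  # hypothetical GGUF file
    'n_ctx': 4096,                                  # illustrative; the repo fills this dict from shared.args
    'rope_freq_scale': 1.0,                         # 1.0 / compress_pos_emb in the hunk above
}

model = Llama(**params)
model.set_cache(LlamaCache(capacity_bytes=2 * 1024**3))  # optional ~2 GiB prompt cache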