Add ExLlama support (#2444)

2023-06-16 20:35:38 -03:00 · 2023-06-16 20:35:38 -03:00 · 9f40032d32
commit 9f40032d32
parent dea43685b0
12 changed files with 156 additions and 47 deletions
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@@ -38,31 +38,31 @@ class RWKVModel:
        result.cached_output_logits = None
        return result

-    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=None, token_stop=None, callback=None):
+    def generate(self, prompt, state, callback=None):
        args = PIPELINE_ARGS(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            alpha_frequency=alpha_frequency,  # Frequency Penalty (as in GPT-3)
-            alpha_presence=alpha_presence,  # Presence Penalty (as in GPT-3)
-            token_ban=token_ban or [0],  # ban the generation of some tokens
-            token_stop=token_stop or []
+            temperature=state['temperature'],
+            top_p=state['top_p'],
+            top_k=state['top_k'],
+            alpha_frequency=0.1,  # Frequency Penalty (as in GPT-3)
+            alpha_presence=0.1,  # Presence Penalty (as in GPT-3)
+            token_ban=[0],  # ban the generation of some tokens
+            token_stop=[]
        )

        if self.cached_context != "":
-            if context.startswith(self.cached_context):
-                context = context[len(self.cached_context):]
+            if prompt.startswith(self.cached_context):
+                prompt = prompt[len(self.cached_context):]
            else:
                self.cached_context = ""
                self.cached_model_state = None
                self.cached_output_logits = None

-        # out = self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
-        out = self.generate_from_cached_state(context, token_count=token_count, args=args, callback=callback)
+        # out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
+        out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
        return out

-    def generate_with_streaming(self, **kwargs):
-        with Iteratorize(self.generate, kwargs, callback=None) as generator:
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
@@ -81,6 +81,7 @@ class RWKVModel:
        if ctx == "":
            out = self.cached_output_logits

+        token = None
        for i in range(token_count):
            # forward
            tokens = self.pipeline.encode(ctx) if i == 0 else [token]