From 1d03387f74fa7c3f7ee4d2f837011cbd1015c1ee Mon Sep 17 00:00:00 2001
From: Honkware <119620994+Honkware@users.noreply.github.com>
Date: Thu, 29 Jun 2023 01:31:33 -0500
Subject: [PATCH 1/5] Xgen instruction template

---
 characters/instruction-following/Xgen | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 characters/instruction-following/Xgen

diff --git a/characters/instruction-following/Xgen b/characters/instruction-following/Xgen
new file mode 100644
index 0000000..c7b76d3
--- /dev/null
+++ b/characters/instruction-following/Xgen
@@ -0,0 +1,4 @@
+user: "### Human:"
+bot: "### Assistant:"
+turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
+context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"

From 0a6a4983830cc9681e3cdd7bdaacc49499ab0f5d Mon Sep 17 00:00:00 2001
From: Honkware <119620994+Honkware@users.noreply.github.com>
Date: Thu, 29 Jun 2023 01:32:44 -0500
Subject: [PATCH 2/5] Load xgen tokenizer

---
 modules/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/models.py b/modules/models.py
index f12e700..d6a3585 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -94,6 +94,8 @@ def load_tokenizer(model_name, model):
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']:
+        if any(s in model_name.lower() for s in ['xgen']) and Path(f"{shared.args.model_dir}/{model_name}/").exists():
+            tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), trust_remote_code=shared.args.trust_remote_code)
         # Try to load an universal LLaMA tokenizer
         if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']):
             for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]:

From 3147f0b8f8a17add29d4a19eb83996b6c5b9d5c0 Mon Sep 17 00:00:00 2001
From: Honkware <119620994+Honkware@users.noreply.github.com>
Date: Thu, 29 Jun 2023 01:32:53 -0500
Subject: [PATCH 3/5] xgen config

---
 models/config.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/models/config.yaml b/models/config.yaml
index 08b35e2..9e81def 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -240,3 +240,6 @@ TheBloke_WizardLM-30B-GPTQ:
   truncation_length: 8192
 .*superhot-8k:
   truncation_length: 8192
+.*xgen:
+  truncation_length: 8192
+  instruction_template: 'XGen'

From 31c297d7e04e02f1aeb92215fea69dbd06101404 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 4 Jul 2023 18:50:01 -0700
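Subject: [PATCH 4/5] Various changes

Delete the dedicated Xgen instruction template, revert the xgen-specific
tokenizer branch in load_tokenizer, and narrow the config.yaml entry to
`.*xgen.*-inst` models, which now use the 'Vicuna-v0' template.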

---
 characters/instruction-following/Xgen | 4 ----
 models/config.yaml                    | 4 ++--
 modules/models.py                     | 2 --
 3 files changed, 2 insertions(+), 8 deletions(-)
 delete mode 100644 characters/instruction-following/Xgen

diff --git a/characters/instruction-following/Xgen b/characters/instruction-following/Xgen
deleted file mode 100644
index c7b76d3..0000000
--- a/characters/instruction-following/Xgen
+++ /dev/null
@@ -1,4 +0,0 @@
-user: "### Human:"
-bot: "### Assistant:"
-turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
-context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"
diff --git a/models/config.yaml b/models/config.yaml
index 9e81def..d81eac9 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -240,6 +240,6 @@ TheBloke_WizardLM-30B-GPTQ:
   truncation_length: 8192
 .*superhot-8k:
   truncation_length: 8192
-.*xgen:
+.*xgen.*-inst:
   truncation_length: 8192
-  instruction_template: 'XGen'
+  instruction_template: 'Vicuna-v0'
diff --git a/modules/models.py b/modules/models.py
index d6a3585..f12e700 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -94,8 +94,6 @@ def load_tokenizer(model_name, model):
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']:
-        if any(s in model_name.lower() for s in ['xgen']) and Path(f"{shared.args.model_dir}/{model_name}/").exists():
-            tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), trust_remote_code=shared.args.trust_remote_code)
         # Try to load an universal LLaMA tokenizer
         if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']):
             for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]:

From 8705eba830ea04fc248ed66b71463f22a407ae90 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 4 Jul 2023 19:43:19 -0700
Subject: [PATCH 5/5] Remove universal llama tokenizer support

Instead, log a warning when a LlamaTokenizer model's `tokenizer_config.json`
or `special_tokens_map.json` does not match the reference file's SHA-256 hash.
---
 docs/LLaMA-model.md |  8 +-------
 modules/models.py   | 45 +++++++++++++++++++++++----------------------
 2 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md
index 36e9c30..cd65526 100644
--- a/docs/LLaMA-model.md
+++ b/docs/LLaMA-model.md
@@ -12,13 +12,7 @@ This guide will cover usage through the official `transformers` implementation.
 * Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
 * Direct download: https://huggingface.co/Neko-Institute-of-Science
 
-⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, so I recommend downloading the following universal LLaMA tokenizer: 
-
-```
-python download-model.py oobabooga/llama-tokenizer
-```
-
-Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM` model that you try to load.
+⚠️ The tokenizers for the Torrent source above, and for many LLaMA fine-tunes available on Hugging Face, may be outdated, in particular the files `tokenizer_config.json` and `special_tokens_map.json`. Reference versions of those files are available at https://huggingface.co/oobabooga/llama-tokenizer
 
 ### Option 2: convert the weights yourself
 
diff --git a/modules/models.py b/modules/models.py
index 4b47e64..160ca9e 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -3,6 +3,7 @@ import os
 import re
 import time
 from pathlib import Path
+import hashlib
 
 import torch
 import transformers
@@ -14,7 +15,6 @@ from transformers import (
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
     BitsAndBytesConfig,
-    LlamaTokenizer
 )
 
 import modules.shared as shared
@@ -91,30 +91,31 @@ def load_model(model_name, loader=None):
 
 def load_tokenizer(model_name, model):
     tokenizer = None
+    path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
-    elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']:
-        # Try to load an universal LLaMA tokenizer
-        if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']):
-            for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]:
-                if p.exists():
-                    logger.info(f"Loading the universal LLaMA tokenizer from {p}...")
-                    tokenizer = LlamaTokenizer.from_pretrained(p, clean_up_tokenization_spaces=True)
-                    return tokenizer
+    elif path_to_model.exists():
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=False
+        )
 
-        # Otherwise, load it from the model folder and hope that these
-        # are not outdated tokenizer files.
-        tokenizer = LlamaTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), clean_up_tokenization_spaces=True)
-        try:
-            tokenizer.eos_token_id = 2
-            tokenizer.bos_token_id = 1
-            tokenizer.pad_token_id = 0
-        except:
-            pass
-    else:
-        path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
-        if path_to_model.exists():
-            tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+    if tokenizer.__class__.__name__ == 'LlamaTokenizer':
+        pairs = [
+            ['tokenizer_config.json', '516c6167c884793a738c440e29ccb80c15e1493ffc965affc69a1a8ddef4572a'],
+            ['special_tokens_map.json', 'ff3b4a612c4e447acb02d40071bddd989fe0da87eb5b7fe0dbadfc4f74de7531']
+        ]
+
+        for pair in pairs:
+            p = path_to_model / pair[0]
+            if p.exists():
+                with open(p, "rb") as f:
+                    bytes = f.read()
+
+                file_hash = hashlib.sha256(bytes).hexdigest()
+                if file_hash != pair[1]:
+                    logger.warning(f"{p} is different from the original LlamaTokenizer file. It is either customized or outdated.")
 
     return tokenizer