From 1d03387f74fa7c3f7ee4d2f837011cbd1015c1ee Mon Sep 17 00:00:00 2001 From: Honkware <119620994+Honkware@users.noreply.github.com> Date: Thu, 29 Jun 2023 01:31:33 -0500 Subject: [PATCH 1/5] Xgen instruction template --- characters/instruction-following/Xgen | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 characters/instruction-following/Xgen diff --git a/characters/instruction-following/Xgen b/characters/instruction-following/Xgen new file mode 100644 index 0000000..c7b76d3 --- /dev/null +++ b/characters/instruction-following/Xgen @@ -0,0 +1,4 @@ +user: "### Human:" +bot: "### Assistant:" +turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n" +context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" From 0a6a4983830cc9681e3cdd7bdaacc49499ab0f5d Mon Sep 17 00:00:00 2001 From: Honkware <119620994+Honkware@users.noreply.github.com> Date: Thu, 29 Jun 2023 01:32:44 -0500 Subject: [PATCH 2/5] Load xgen tokenizer --- modules/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/models.py b/modules/models.py index f12e700..d6a3585 100644 --- a/modules/models.py +++ b/modules/models.py @@ -94,6 +94,8 @@ def load_tokenizer(model_name, model): if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']: + if any(s in model_name.lower() for s in ['xgen']) and Path(f"{shared.args.model_dir}/{model_name}/").exists(): + tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), trust_remote_code=shared.args.trust_remote_code) # Try to load an universal LLaMA tokenizer if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']): for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]: From 3147f0b8f8a17add29d4a19eb83996b6c5b9d5c0 Mon Sep 17 00:00:00 2001 From: Honkware <119620994+Honkware@users.noreply.github.com> Date: Thu, 29 Jun 2023 01:32:53 -0500 Subject: [PATCH 3/5] xgen config --- models/config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/models/config.yaml b/models/config.yaml index 08b35e2..9e81def 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -240,3 +240,6 @@ TheBloke_WizardLM-30B-GPTQ: truncation_length: 8192 .*superhot-8k: truncation_length: 8192 +.*xgen: + truncation_length: 8192 + instruction_template: 'XGen' From 31c297d7e04e02f1aeb92215fea69dbd06101404 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 4 Jul 2023 18:50:01 -0700 Subject: [PATCH 4/5] Various changes --- characters/instruction-following/Xgen | 4 ---- models/config.yaml | 4 ++-- modules/models.py | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) delete mode 100644 characters/instruction-following/Xgen diff --git a/characters/instruction-following/Xgen b/characters/instruction-following/Xgen deleted file mode 100644 index c7b76d3..0000000 --- a/characters/instruction-following/Xgen +++ /dev/null @@ -1,4 +0,0 @@ -user: "### Human:" -bot: "### Assistant:" -turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n" -context: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n" diff --git a/models/config.yaml b/models/config.yaml index 9e81def..d81eac9 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -240,6 +240,6 @@ TheBloke_WizardLM-30B-GPTQ: truncation_length: 8192 .*superhot-8k: truncation_length: 8192 -.*xgen: +.*xgen.*-inst: truncation_length: 8192 - instruction_template: 'XGen' + instruction_template: 'Vicuna-v0' diff --git a/modules/models.py b/modules/models.py index d6a3585..f12e700 100644 --- a/modules/models.py +++ b/modules/models.py @@ -94,8 +94,6 @@ def load_tokenizer(model_name, model): if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']: - if any(s in model_name.lower() for s in ['xgen']) and Path(f"{shared.args.model_dir}/{model_name}/").exists(): - tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), trust_remote_code=shared.args.trust_remote_code) # Try to load an universal LLaMA tokenizer if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']): for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]: From 8705eba830ea04fc248ed66b71463f22a407ae90 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 4 Jul 2023 19:43:19 -0700 Subject: [PATCH 5/5] Remove universal llama tokenizer support Instead replace it with a warning if the tokenizer files look off --- docs/LLaMA-model.md | 8 +------- modules/models.py | 45 +++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 29 deletions(-) diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md index 36e9c30..cd65526 100644 --- a/docs/LLaMA-model.md +++ b/docs/LLaMA-model.md @@ -12,13 +12,7 @@ This guide will cover usage through the official `transformers` implementation. * Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789 * Direct download: https://huggingface.co/Neko-Institute-of-Science -⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, so I recommend downloading the following universal LLaMA tokenizer: - -``` -python download-model.py oobabooga/llama-tokenizer -``` - -Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM` model that you try to load. +⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer ### Option 2: convert the weights yourself diff --git a/modules/models.py b/modules/models.py index 4b47e64..160ca9e 100644 --- a/modules/models.py +++ b/modules/models.py @@ -3,6 +3,7 @@ import os import re import time from pathlib import Path +import hashlib import torch import transformers @@ -14,7 +15,6 @@ from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig, - LlamaTokenizer ) import modules.shared as shared @@ -91,30 +91,31 @@ def load_model(model_name, loader=None): def load_tokenizer(model_name, model): tokenizer = None + path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) - elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']: - # Try to load an universal LLaMA tokenizer - if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']): - for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]: - if p.exists(): - logger.info(f"Loading the universal LLaMA tokenizer from {p}...") - tokenizer = LlamaTokenizer.from_pretrained(p, clean_up_tokenization_spaces=True) - return tokenizer + elif path_to_model.exists(): + tokenizer = AutoTokenizer.from_pretrained( + path_to_model, + trust_remote_code=shared.args.trust_remote_code, + use_fast=False + ) - # Otherwise, load it from the model folder and hope that these - # are not outdated tokenizer files. - tokenizer = LlamaTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), clean_up_tokenization_spaces=True) - try: - tokenizer.eos_token_id = 2 - tokenizer.bos_token_id = 1 - tokenizer.pad_token_id = 0 - except: - pass - else: - path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") - if path_to_model.exists(): - tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) + if tokenizer.__class__.__name__ == 'LlamaTokenizer': + pairs = [ + ['tokenizer_config.json', '516c6167c884793a738c440e29ccb80c15e1493ffc965affc69a1a8ddef4572a'], + ['special_tokens_map.json', 'ff3b4a612c4e447acb02d40071bddd989fe0da87eb5b7fe0dbadfc4f74de7531'] + ] + + for pair in pairs: + p = path_to_model / pair[0] + if p.exists(): + with open(p, "rb") as f: + bytes = f.read() + + file_hash = hashlib.sha256(bytes).hexdigest() + if file_hash != pair[1]: + logger.warning(f"{p} is different from the original LlamaTokenizer file. It is either customized or outdated.") return tokenizer