From eb2601a8c350ae5ba62ad1cfce0ab83397f6a95a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 Jun 2023 14:51:02 -0300 Subject: [PATCH 1/9] Reorganize Parameters tab --- server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server.py b/server.py index 5e770d1..25dbae8 100644 --- a/server.py +++ b/server.py @@ -474,21 +474,21 @@ def create_settings_menus(default_preset): gr.Markdown('Main parameters') with gr.Row(): with gr.Column(): + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature', info='Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness.') shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p', info='If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results.') shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k', info='Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results.') shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p', info='If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text.') shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0.') shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0.') - shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') - shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') with gr.Column(): shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty', info='Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition.') shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty', info='Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge.') shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size', info='If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases.') shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length', info='Minimum generation length in tokens.') - shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') with gr.Column(): create_chat_settings_menus() From 084b006cfe7c10b743830b5126e1caca9fcfab2f Mon Sep 17 00:00:00 2001 From: zaypen Date: Thu, 8 Jun 2023 02:34:50 +0800 Subject: [PATCH 2/9] Update LLaMA-model.md (#2460) Better approach of converting LLaMA model --- docs/LLaMA-model.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md index 338d458..6706b16 100644 --- a/docs/LLaMA-model.md +++ b/docs/LLaMA-model.md @@ -30,7 +30,15 @@ pip install protobuf==3.20.1 2. Use the script below to convert the model in `.pth` format that you, a fellow academic, downloaded using Meta's official link: -### [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) +### Convert LLaMA to HuggingFace format + +If you have `transformers` installed in place + +``` +python -m transformers.models.llama.convert_llama_weights_to_hf --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b +``` + +Otherwise download script [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) ``` python convert_llama_weights_to_hf.py --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b From 240752617de97ecf5b65a949c454fdb2d50d35e4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 Jun 2023 11:16:38 -0300 Subject: [PATCH 3/9] Increase download timeout to 20s --- download-model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/download-model.py b/download-model.py index f5c2732..540f94c 100644 --- a/download-model.py +++ b/download-model.py @@ -108,7 +108,7 @@ class ModelDownloader: is_lora = False while True: url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "") - r = self.s.get(url, timeout=10) + r = self.s.get(url, timeout=20) r.raise_for_status() content = r.content @@ -180,7 +180,7 @@ class ModelDownloader: output_path = output_folder / filename if output_path.exists() and not start_from_scratch: # Check if the file has already been downloaded completely - r = self.s.get(url, stream=True, timeout=10) + r = self.s.get(url, stream=True, timeout=20) total_size = int(r.headers.get('content-length', 0)) if output_path.stat().st_size >= total_size: return @@ -191,7 +191,7 @@ class ModelDownloader: headers = {} mode = 'wb' - r = self.s.get(url, stream=True, headers=headers, timeout=10) + r = self.s.get(url, stream=True, headers=headers, timeout=20) with open(output_path, mode) as f: total_size = int(r.headers.get('content-length', 0)) block_size = 1024 From 7be6fe126b9feec78e77c22a8364fa049587b776 Mon Sep 17 00:00:00 2001 From: matatonic <73265741+matatonic@users.noreply.github.com> Date: Thu, 8 Jun 2023 10:34:36 -0400 Subject: [PATCH 4/9] extensions/api: models api for blocking_api (updated) (#2539) --- api-examples/api-example-chat.py | 2 +- api-examples/api-example-model.py | 176 ++++++++++++++++++ extensions/api/blocking_api.py | 73 ++++++++ .../multimodal/pipelines/llava/llava.py | 7 +- 4 files changed, 256 insertions(+), 2 deletions(-) create mode 100755 api-examples/api-example-model.py diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py index 905fbca..8ea6ed1 100644 --- a/api-examples/api-example-chat.py +++ b/api-examples/api-example-chat.py @@ -7,7 +7,7 @@ HOST = 'localhost:5000' URI = f'http://{HOST}/api/v1/chat' # For reverse-proxied streaming, the remote will likely host with ssl - https:// -# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate' +# URI = 'https://your-uri-here.trycloudflare.com/api/v1/chat' def run(user_input, history): diff --git a/api-examples/api-example-model.py b/api-examples/api-example-model.py new file mode 100755 index 0000000..8e1e300 --- /dev/null +++ b/api-examples/api-example-model.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +import requests + +HOST = '0.0.0.0:5000' + +def generate(prompt, tokens = 200): + request = { 'prompt': prompt, 'max_new_tokens': tokens } + response = requests.post(f'http://{HOST}/api/v1/generate', json=request) + + if response.status_code == 200: + return response.json()['results'][0]['text'] + + +def model_api(request): + response = requests.post(f'http://{HOST}/api/v1/model', json=request) + return response.json() + + +# print some common settings +def print_basic_model_info(response): + basic_settings = ['truncation_length', 'instruction_template'] + print("Model: ", response['result']['model_name']) + print("Lora(s): ", response['result']['lora_names']) + for setting in basic_settings: + print(setting, "=", response['result']['shared.settings'][setting]) + + +# model info +def model_info(): + response = model_api({'action': 'info'}) + print_basic_model_info(response) + + +# simple loader +def model_load(model_name): + return model_api({'action': 'load', 'model_name': model_name}) + + +# complex loader +def complex_model_load(model): + + def guess_groupsize(model_name): + if '1024g' in model_name: + return 1024 + elif '128g' in model_name: + return 128 + elif '32g' in model_name: + return 32 + else: + return -1 + + req = { + 'action': 'load', + 'model_name': model, + 'args': { + 'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama + + 'bf16': False, + 'load_in_8bit': False, + 'groupsize': 0, + 'wbits': 0, + + # llama.cpp + 'threads': 0, + 'n_batch': 512, + 'no_mmap': False, + 'mlock': False, + 'cache_capacity': None, + 'n_gpu_layers': 0, + 'n_ctx': 2048, + + # RWKV + 'rwkv_strategy': None, + 'rwkv_cuda_on': False, + + # b&b 4-bit + #'load_in_4bit': False, + #'compute_dtype': 'float16', + #'quant_type': 'nf4', + #'use_double_quant': False, + + #"cpu": false, + #"auto_devices": false, + #"gpu_memory": null, + #"cpu_memory": null, + #"disk": false, + #"disk_cache_dir": "cache", + }, + } + + model = model.lower() + + if '4bit' in model or 'gptq' in model or 'int4' in model: + req['args']['wbits'] = 4 + req['args']['groupsize'] = guess_groupsize(model) + elif '3bit' in model: + req['args']['wbits'] = 3 + req['args']['groupsize'] = guess_groupsize(model) + else: + req['args']['gptq_for_llama'] = False + + if '8bit' in model: + req['args']['load_in_8bit'] = True + elif '-hf' in model or 'fp16' in model: + if '7b' in model: + req['args']['bf16'] = True # for 24GB + elif '13b' in model: + req['args']['load_in_8bit'] = True # for 24GB + elif 'ggml' in model: + #req['args']['threads'] = 16 + if '7b' in model: + req['args']['n_gpu_layers'] = 100 + elif '13b' in model: + req['args']['n_gpu_layers'] = 100 + elif '30b' in model or '33b' in model: + req['args']['n_gpu_layers'] = 59 # 24GB + elif '65b' in model: + req['args']['n_gpu_layers'] = 42 # 24GB + elif 'rwkv' in model: + req['args']['rwkv_cuda_on'] = True + if '14b' in model: + req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB + else: + req['args']['rwkv_strategy'] = 'cuda f16' # 24GB + + + return model_api(req) + + +if __name__ == '__main__': + for model in model_api({'action': 'list'})['result']: + try: + resp = complex_model_load(model) + + if 'error' in resp: + print (f"❌ {model} FAIL Error: {resp['error']['message']}") + continue + else: + print_basic_model_info(resp) + + ans = generate("0,1,1,2,3,5,8,13,", tokens=2) + + if '21' in ans: + print (f"✅ {model} PASS ({ans})") + else: + print (f"❌ {model} FAIL ({ans})") + + except Exception as e: + print (f"❌ {model} FAIL Exception: {repr(e)}") + + +# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21. +# Some results below. +""" $ ./model-api-example.py +Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda +Lora(s): [] +truncation_length = 2048 +instruction_template = Alpaca +✅ 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21) +Model: 4bit_WizardLM-13B-Uncensored-4bit-128g +Lora(s): [] +truncation_length = 2048 +instruction_template = WizardLM +✅ 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21) +Model: Aeala_VicUnlocked-alpaca-30b-4bit +Lora(s): [] +truncation_length = 2048 +instruction_template = Alpaca +✅ Aeala_VicUnlocked-alpaca-30b-4bit PASS (21) +Model: alpaca-30b-4bit +Lora(s): [] +truncation_length = 2048 +instruction_template = Alpaca +✅ alpaca-30b-4bit PASS (21) +""" diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index 6bcd840..c787cd0 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -6,7 +6,19 @@ from extensions.api.util import build_parameters, try_start_cloudflared from modules import shared from modules.chat import generate_chat_reply from modules.text_generation import encode, generate_reply, stop_everything_event +from modules.models import load_model, unload_model +from modules.LoRA import add_lora_to_model +from modules.utils import get_available_models +from server import get_model_specific_settings, update_model_parameters +def get_model_info(): + return { + 'model_name': shared.model_name, + 'lora_names': shared.lora_names, + # dump + 'shared.settings': shared.settings, + 'shared.args': vars(shared.args), + } class Handler(BaseHTTPRequestHandler): def do_GET(self): @@ -91,6 +103,67 @@ class Handler(BaseHTTPRequestHandler): self.wfile.write(response.encode('utf-8')) + elif self.path == '/api/v1/model': + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + + # by default return the same as the GET interface + result = shared.model_name + + # Actions: info, load, list, unload + action = body.get('action', '') + + if action == 'load': + model_name = body['model_name'] + args = body.get('args', {}) + print('args', args) + for k in args: + setattr(shared.args, k, args[k]) + + shared.model_name = model_name + unload_model() + + model_settings = get_model_specific_settings(shared.model_name) + shared.settings.update(model_settings) + update_model_parameters(model_settings, initial=True) + + if shared.settings['mode'] != 'instruct': + shared.settings['instruction_template'] = None + + try: + shared.model, shared.tokenizer = load_model(shared.model_name) + if shared.args.lora: + add_lora_to_model(shared.args.lora) # list + + except Exception as e: + response = json.dumps({'error': { 'message': repr(e) } }) + + self.wfile.write(response.encode('utf-8')) + raise e + + shared.args.model = shared.model_name + + result = get_model_info() + + elif action == 'unload': + unload_model() + shared.model_name = None + shared.args.model = None + result = get_model_info() + + elif action == 'list': + result = get_available_models() + + elif action == 'info': + result = get_model_info() + + response = json.dumps({ + 'result': result, + }) + + self.wfile.write(response.encode('utf-8')) + elif self.path == '/api/v1/token-count': self.send_response(200) self.send_header('Content-Type', 'application/json') diff --git a/extensions/multimodal/pipelines/llava/llava.py b/extensions/multimodal/pipelines/llava/llava.py index 16f0e06..eca2be5 100644 --- a/extensions/multimodal/pipelines/llava/llava.py +++ b/extensions/multimodal/pipelines/llava/llava.py @@ -56,7 +56,12 @@ class LLaVA_v0_Pipeline(AbstractMultimodalPipeline): @staticmethod def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor: - return shared.model.model.embed_tokens(input_ids).to(shared.model.device, dtype=shared.model.dtype) + if hasattr(shared.model.model, 'embed_tokens'): + func = shared.model.model.embed_tokens + else: + func = shared.model.model.model.embed_tokens # AutoGPTQ case + + return func(input_ids).to(shared.model.device, dtype=shared.model.dtype) @staticmethod def placeholder_embeddings() -> torch.Tensor: From db2cbe7b5aa54a365ac25cd9da65f8b8053239e6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 Jun 2023 11:41:06 -0300 Subject: [PATCH 5/9] Detect WizardLM-30B-V1.0 instruction format --- models/config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/models/config.yaml b/models/config.yaml index 2c09267..36d8696 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -188,3 +188,6 @@ llama-65b-gptq-3bit: mode: 'instruct' instruction_template: 'Vicuna-v1.1' truncation_length: 4096 +.*WizardLM-30B-V1.0: + mode: 'instruct' + instruction_template: 'Vicuna-v1.1' From ac40c59ac38e8d2b20111904b17fb3fee2868eaf Mon Sep 17 00:00:00 2001 From: FartyPants Date: Thu, 8 Jun 2023 11:24:32 -0400 Subject: [PATCH 6/9] Added Guanaco-QLoRA to Instruct character (#2574) --- characters/instruction-following/Guanaco-QLoRA.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 characters/instruction-following/Guanaco-QLoRA.yaml diff --git a/characters/instruction-following/Guanaco-QLoRA.yaml b/characters/instruction-following/Guanaco-QLoRA.yaml new file mode 100644 index 0000000..4c321cb --- /dev/null +++ b/characters/instruction-following/Guanaco-QLoRA.yaml @@ -0,0 +1,4 @@ +user: "### Human:" +bot: "### Assistant:" +turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n" +context: "" \ No newline at end of file From 0f8140e99d8ea311b90b88d188752972b064a07c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 Jun 2023 00:25:13 -0300 Subject: [PATCH 7/9] Bump transformers/accelerate/peft/autogptq --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9fdab32..0a5adce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +accelerate==0.20.3 colorama datasets einops @@ -14,12 +15,11 @@ safetensors==0.3.1 sentencepiece tqdm scipy -git+https://github.com/huggingface/peft@3714aa2fff158fdfa637b2b65952580801d890b2 -git+https://github.com/huggingface/transformers@e45e756d22206ca8fa9fb057c8c3d8fa79bf81c6 -git+https://github.com/huggingface/accelerate@0226f750257b3bf2cadc4f189f9eef0c764a0467 +transformers==4.30.0 +git+https://github.com/huggingface/peft@e45529b149c7f91ec1d4d82a5a152ef56c56cb94 bitsandbytes==0.39.0; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows" llama-cpp-python==0.1.57; platform_system != "Windows" https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.57/llama_cpp_python-0.1.57-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" -https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" From 8a7a8343beec75a4b1877d0af2be182c60d78d59 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 Jun 2023 00:26:34 -0300 Subject: [PATCH 8/9] Detect TheBloke_WizardLM-30B-GPTQ --- models/config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/models/config.yaml b/models/config.yaml index 36d8696..ebb7a5a 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -191,3 +191,6 @@ llama-65b-gptq-3bit: .*WizardLM-30B-V1.0: mode: 'instruct' instruction_template: 'Vicuna-v1.1' +TheBloke_WizardLM-30B-GPTQ: + mode: 'instruct' + instruction_template: 'Vicuna-v1.1' From c6552785af09d0e527ee169a22ed11b251e7a0d5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 Jun 2023 00:30:22 -0300 Subject: [PATCH 9/9] Minor cleanup --- docs/LLaMA-model.md | 8 +++----- extensions/api/blocking_api.py | 6 ++++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md index 6706b16..36e9c30 100644 --- a/docs/LLaMA-model.md +++ b/docs/LLaMA-model.md @@ -28,17 +28,15 @@ Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM pip install protobuf==3.20.1 ``` -2. Use the script below to convert the model in `.pth` format that you, a fellow academic, downloaded using Meta's official link: +2. Use the script below to convert the model in `.pth` format that you, a fellow academic, downloaded using Meta's official link. -### Convert LLaMA to HuggingFace format - -If you have `transformers` installed in place +If you have `transformers` installed in place: ``` python -m transformers.models.llama.convert_llama_weights_to_hf --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b ``` -Otherwise download script [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) +Otherwise download [convert_llama_weights_to_hf.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) first and run: ``` python convert_llama_weights_to_hf.py --input_dir /path/to/LLaMA --model_size 7B --output_dir /tmp/outputs/llama-7b diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index c787cd0..0d22888 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -5,12 +5,14 @@ from threading import Thread from extensions.api.util import build_parameters, try_start_cloudflared from modules import shared from modules.chat import generate_chat_reply -from modules.text_generation import encode, generate_reply, stop_everything_event -from modules.models import load_model, unload_model from modules.LoRA import add_lora_to_model +from modules.models import load_model, unload_model +from modules.text_generation import (encode, generate_reply, + stop_everything_event) from modules.utils import get_available_models from server import get_model_specific_settings, update_model_parameters + def get_model_info(): return { 'model_name': shared.model_name,