diff --git a/README.md b/README.md
index 22f2d9f..c3e0474 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat
-* Multiple model backends: [transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlama](https://github.com/turboderp/exllama), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ)
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlama](https://github.com/turboderp/exllama), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers)
 * Dropdown menu for quickly switching between different models
 * LoRA: load and unload LoRAs on the fly, train a new LoRA using QLoRA
 * Precise instruction templates for chat mode, including Llama-2-chat, Alpaca, Vicuna, WizardLM, StableLM, and many others
@@ -283,7 +283,7 @@ Optionally, you can use the following command-line flags:
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, exllama_hf, exllamav2_hf, exllama, exllamav2, autogptq, gptq-for-llama, llama.cpp, llamacpp_hf, ctransformers, autoawq. |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers. |
 
 #### Accelerate/transformers
diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index cb142af..b92e884 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -131,7 +131,7 @@ class Exllamav2Model:
             token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
             ids = torch.cat([ids, token], dim=1)
 
-            if i == 0 and self.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+            if i == 0 and self.tokenizer.tokenizer.id_to_piece(int(token)).startswith('▁'):
                 has_leading_space = True
 
             decoded_text = self.tokenizer.decode(ids[:, initial_len:], decode_special_tokens=not state['skip_special_tokens'])[0]
diff --git a/modules/loaders.py b/modules/loaders.py
index 42a5cfd..062f353 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -71,7 +71,6 @@ loaders_and_params = OrderedDict({
         'compress_pos_emb',
         'cfg_cache',
         'no_use_fast',
-        'exllama_HF_info',
     ],
     'ExLlamav2_HF': [
         'gpu_split',
@@ -133,6 +132,7 @@ loaders_and_params = OrderedDict({
         'cache_8bit',
         'alpha_value',
         'compress_pos_emb',
+        'exllamav2_info',
     ],
     'ctransformers': [
         'n_ctx',
diff --git a/modules/shared.py b/modules/shared.py
index 66703a5..c0899a9 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -43,7 +43,7 @@ settings = {
     'seed': -1,
     'truncation_length': 2048,
     'truncation_length_min': 0,
-    'truncation_length_max': 32768,
+    'truncation_length_max': 200000,
     'max_tokens_second': 0,
     'custom_stopping_strings': '',
     'custom_token_bans': '',
@@ -79,7 +79,7 @@ parser.add_argument('--verbose', action='store_true', help='Print the prompts to
 parser.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.')
 
 # Model loader
-parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, exllama_hf, exllamav2_hf, exllama, exllamav2, autogptq, gptq-for-llama, llama.cpp, llamacpp_hf, ctransformers, autoawq.')
+parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers.')
 
 # Accelerate/transformers
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
diff --git a/modules/training.py b/modules/training.py
index c01f27d..ca1fffb 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -165,7 +165,7 @@ def create_ui():
                 stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
 
             with gr.Column():
-                max_length = gr.Slider(label='max_length', minimum=0, maximum=32768, value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
+                max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
 
             with gr.Row():
                 start_current_evaluation = gr.Button("Evaluate loaded model", interactive=not mu)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 12edeed..8f4d685 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -95,7 +95,7 @@ def create_ui():
             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
             shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
             shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
-            shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
+            shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from Llama.')
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
             shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
@@ -128,9 +128,9 @@ def create_ui():
             shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
             shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
             shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
-            shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
-            shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
-            shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')
+            shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlama_HF or AutoGPTQ are preferred for GPTQ models when supported.')
+            shared.gradio['exllama_info'] = gr.Markdown("ExLlama_HF is recommended over ExLlama for better integration with extensions and more consistent sampling behavior across loaders.")
+            shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
             shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer.\n\nOption 2: place your .gguf in a subfolder of models/ along with these 3 files: tokenizer.model, tokenizer_config.json, and special_tokens_map.json. This takes precedence over Option 1.')
 
         with gr.Column():
diff --git a/requirements.txt b/requirements.txt
index 3d25bfd..f4cbaea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.10; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
diff --git a/requirements_amd.txt b/requirements_amd.txt
index e27f301..892cae7 100644
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt
index f78832e..228c517 100644
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt
index febb060..377373f 100644
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt
index 2997c98..e280108 100644
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index ec0e84f..7e83c66 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 02e5184..b31eec1 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index eeff8eb..c00d9bd 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8; platform_system != "Darwin" and platform_machine != "x86_64"
+exllamav2==0.0.10; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
 markdown
 numpy==1.24.*
@@ -53,14 +53,14 @@ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
 https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.8/exllamav2-0.0.8+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9"
+https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index d08204f..b285e4c 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,8 +1,8 @@
-accelerate==0.24.*
+accelerate==0.25.*
 colorama
 datasets
 einops
-exllamav2==0.0.8
+exllamav2==0.0.10
 gradio==3.50.*
 markdown
 numpy==1.24.*
diff --git a/settings-template.yaml b/settings-template.yaml
index 4ca8422..cb16844 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -13,7 +13,7 @@ seed: -1
 negative_prompt: ''
 truncation_length: 2048
 truncation_length_min: 0
-truncation_length_max: 32768
+truncation_length_max: 200000
 custom_stopping_strings: ''
 auto_max_new_tokens: false
 max_tokens_second: 0
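
Note on the modules/exllamav2.py hunk: the streaming loop now calls the snake_case id_to_piece instead of IdToPiece when checking whether the first sampled token carries SentencePiece's leading-space marker. Below is a minimal, self-contained sketch of that check against the sentencepiece Python API, assuming sentencepiece is installed; the "tokenizer.model" path is a hypothetical placeholder and the self.tokenizer.tokenizer wrapper from the patch is not reproduced here.

    # Sketch only: illustrates the leading-space check, not the webui code path.
    import sentencepiece as spm

    # Hypothetical SentencePiece model file; not part of this patch.
    sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

    ids = sp.encode(" Hello world")        # encode() returns a list of token ids
    first_piece = sp.id_to_piece(ids[0])   # snake_case accessor, as in the patched line

    # SentencePiece marks a leading space with the U+2581 '▁' character, which is
    # how the generation loop decides whether to re-insert a space while streaming.
    has_leading_space = first_piece.startswith("▁")
    print(first_piece, has_leading_space)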