diff --git a/README.md b/README.md index bd356a1..1326835 100644 --- a/README.md +++ b/README.md @@ -337,8 +337,9 @@ Optionally, you can use the following command-line flags: | Flag | Description | |------------------|-------------| -|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. | -|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. | +| `--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both. +| `--rope_freq_base ROPE_FREQ_BASE` | If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63) +| `--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale. #### Gradio diff --git a/modules/exllama.py b/modules/exllama.py index 25bf0e5..7df1d32 100644 --- a/modules/exllama.py +++ b/modules/exllama.py @@ -3,7 +3,7 @@ from pathlib import Path import torch.nn.functional as F from torch import version as torch_version -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger from modules.models import clear_torch_cache from modules.text_generation import get_max_prompt_length @@ -56,8 +56,8 @@ class ExllamaModel: config.set_auto_map(shared.args.gpu_split) config.gpu_peer_fix = True - if shared.args.alpha_value: - config.alpha_value = shared.args.alpha_value + if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0: + config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base) config.calculate_rotary_embedding_base() if torch_version.hip: diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py index 129ee52..eab9264 100644 --- a/modules/exllama_hf.py +++ 
b/modules/exllama_hf.py @@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger try: @@ -134,8 +134,8 @@ class ExllamaHF(PreTrainedModel): config.set_auto_map(shared.args.gpu_split) config.gpu_peer_fix = True - if shared.args.alpha_value: - config.alpha_value = shared.args.alpha_value + if shared.args.alpha_value > 1 or shared.args.rope_freq_base > 0: + config.alpha_value = RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base) config.calculate_rotary_embedding_base() if torch.version.hip: diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index f37d710..4d42394 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -7,7 +7,7 @@ from torch.nn import CrossEntropyLoss from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithPast -from modules import shared +from modules import RoPE, shared from modules.logging_colors import logger import llama_cpp @@ -185,7 +185,7 @@ class LlamacppHF(PreTrainedModel): 'mul_mat_q': shared.args.mul_mat_q, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.), + 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'n_gqa': shared.args.n_gqa or None, diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 69cbd23..d2893b0 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -3,7 +3,7 @@ from functools import partial import torch -from modules import shared +from modules import RoPE, shared from modules.callbacks import 
Iteratorize from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length @@ -72,7 +72,7 @@ class LlamaCppModel: 'mul_mat_q': shared.args.mul_mat_q, 'low_vram': shared.args.low_vram, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': 10000 * shared.args.alpha_value ** (64 / 63.), + 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'n_gqa': shared.args.n_gqa or None, diff --git a/modules/loaders.py b/modules/loaders.py index dde40c9..b949c32 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -21,6 +21,7 @@ loaders_and_params = OrderedDict({ 'compute_dtype', 'trust_remote_code', 'alpha_value', + 'rope_freq_base', 'compress_pos_emb', 'transformers_info' ], @@ -28,6 +29,7 @@ loaders_and_params = OrderedDict({ 'gpu_split', 'max_seq_len', 'alpha_value', + 'rope_freq_base', 'compress_pos_emb', 'cfg_cache', 'exllama_HF_info', @@ -36,6 +38,7 @@ loaders_and_params = OrderedDict({ 'gpu_split', 'max_seq_len', 'alpha_value', + 'rope_freq_base', 'compress_pos_emb', 'exllama_info', ], @@ -77,6 +80,7 @@ loaders_and_params = OrderedDict({ 'mul_mat_q', 'llama_cpp_seed', 'alpha_value', + 'rope_freq_base', 'compress_pos_emb', 'cpu', ], @@ -93,6 +97,7 @@ loaders_and_params = OrderedDict({ 'mlock', 'mul_mat_q', 'alpha_value', + 'rope_freq_base', 'compress_pos_emb', 'cpu', 'cfg_cache', diff --git a/modules/models.py b/modules/models.py index d60aecd..48a384c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -18,7 +18,7 @@ from transformers import ( ) import modules.shared as shared -from modules import llama_attn_hijack, sampler_hijack +from modules import llama_attn_hijack, RoPE, sampler_hijack from modules.logging_colors import logger from modules.models_settings import infer_loader @@ -219,7 +219,7 @@ def huggingface_loader(model_name): if shared.args.compress_pos_emb > 1: 
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: - params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} + params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)} model = LoaderClass.from_pretrained(checkpoint, **params) diff --git a/modules/shared.py b/modules/shared.py index c89c906..49557a8 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -159,8 +159,9 @@ parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The s parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') # RoPE -parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.") parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.") +parser.add_argument('--rope_freq_base', type=int, default=0, help="If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)") +parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.") # Gradio parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') diff --git a/modules/ui.py b/modules/ui.py index f6e9ac1..aa72f28 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -79,7 +79,8 @@ def list_model_elements(): 'gpu_split', 'max_seq_len', 'compress_pos_emb', - 'alpha_value' + 'alpha_value', + 'rope_freq_base' ] for i in range(torch.cuda.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 05fe3af..b1eb6ad 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -91,7 +91,8 @@ def create_ui(): shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len) shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.1, info='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length).', value=shared.args.compress_pos_emb) + shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=100000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) + shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)