diff --git a/README.md b/README.md
index fb63994..6cba1b3 100644
--- a/README.md
+++ b/README.md
@@ -269,6 +269,7 @@ Optionally, you can use the following command-line flags:
 |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
 |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
 |`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor. Should typically be set to max_seq_len / 2048. |
+|`--alpha_value ALPHA_VALUE` | Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both. |
 
 #### GPTQ-for-LLaMa
 
diff --git a/modules/exllama.py b/modules/exllama.py
index 0d16f4d..8060757 100644
--- a/modules/exllama.py
+++ b/modules/exllama.py
@@ -53,13 +53,17 @@ class ExllamaModel:
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
+
+        if shared.args.alpha_value:
+            config.alpha_value = shared.args.alpha_value
+            config.calculate_rotary_embedding_base()
+
         if torch.version.hip:
             config.rmsnorm_no_half2 = True
             config.rope_no_half2 = True
             config.matmul_no_half2 = True
             config.silu_no_half2 = True
-
         model = ExLlama(config)
         tokenizer = ExLlamaTokenizer(str(tokenizer_model_path))
         cache = ExLlamaCache(model)
 
diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py
index 181a77a..a25c3f4 100644
--- a/modules/exllama_hf.py
+++ b/modules/exllama_hf.py
@@ -97,6 +97,11 @@ class ExllamaHF(PreTrainedModel):
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
+
+        if shared.args.alpha_value:
+            config.alpha_value = shared.args.alpha_value
+            config.calculate_rotary_embedding_base()
+
         if torch.version.hip:
             config.rmsnorm_no_half2 = True
             config.rope_no_half2 = True
diff --git a/modules/loaders.py b/modules/loaders.py
index 44e893f..8ec575a 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -57,12 +57,14 @@ loaders_and_params = {
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
+        'alpha_value',
         'exllama_info',
     ],
     'ExLlama_HF' : [
         'gpu_split',
         'max_seq_len',
         'compress_pos_emb',
+        'alpha_value',
         'exllama_HF_info',
     ]
 }
diff --git a/modules/shared.py b/modules/shared.py
index 68d1088..2b2fa06 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -150,6 +150,7 @@ parser.add_argument('--desc_act', action='store_true', help='For models that don
 parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
 parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
 parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
+parser.add_argument('--alpha_value', type=int, default=1, help="Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.")
 
 # FlexGen
 parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
diff --git a/modules/ui.py b/modules/ui.py
index b13b6a3..c9f0018 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -63,9 +63,11 @@ def list_model_elements():
         'llama_cpp_seed',
         'gpu_split',
         'max_seq_len',
-        'compress_pos_emb'
+        'compress_pos_emb',
+        'alpha_value'
     ]
+
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')
 
     return elements
diff --git a/server.py b/server.py
index 0dae7ff..8de895b 100644
--- a/server.py
+++ b/server.py
@@ -226,6 +226,7 @@ def create_model_menus():
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
             shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=2048, maximum=16384, step=256, info='Maximum sequence length.', value=shared.args.max_seq_len)
             shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.', value=shared.args.compress_pos_emb)
+            shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=1, info='Positional embeddings alpha factor for NTK RoPE scaling. Same as above. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value)
 
         with gr.Column():
             shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
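A note for reviewers on what the new `config.calculate_rotary_embedding_base()` call does: it converts `alpha_value` into a larger RoPE base on the ExLlama side. Below is a minimal sketch of the idea, assuming the commonly cited NTK-aware formula base' = base * alpha^(d / (d - 2)); the helper names here are illustrative, not exllama's API:

```python
# Sketch of NTK RoPE scaling, assuming base' = base * alpha ** (d / (d - 2)).
# Not the exllama source; function names are illustrative only.
import torch


def ntk_scaled_base(alpha: float, base: float = 10000.0, head_dim: int = 128) -> float:
    """Scale the RoPE base by the NTK-aware factor. alpha=1 is a no-op."""
    return base * alpha ** (head_dim / (head_dim - 2))


def rope_inv_freq(base: float, head_dim: int = 128) -> torch.Tensor:
    """Standard RoPE inverse frequencies, derived from the (scaled) base."""
    return 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))


# alpha_value=2 on a 128-dim head lifts the base from 10000 to ~20221,
# stretching the low-frequency rotations so longer contexts stay coherent.
print(ntk_scaled_base(alpha=2.0))
print(rope_inv_freq(ntk_scaled_base(alpha=2.0))[:4])
```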
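The "not both" caveat in the help text exists because the two flags extend context in different ways: `compress_pos_emb` linearly interpolates position indices (positions are divided by the factor), while `alpha_value` leaves positions untouched and raises the base instead. A hedged side-by-side sketch, with `rope_angles` as an illustrative helper:

```python
# Hedged comparison of the two RoPE extension strategies behind the flags.
import torch


def rope_angles(positions: torch.Tensor, base: float, head_dim: int = 128) -> torch.Tensor:
    """Rotation angle for each (position, frequency) pair."""
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    return torch.outer(positions, inv_freq)


positions = torch.arange(4096, dtype=torch.float)

# compress_pos_emb=2: linear interpolation, position indices are halved.
linear = rope_angles(positions / 2, base=10000.0)

# alpha_value=2: NTK scaling, positions untouched, base grows instead.
ntk = rope_angles(positions, base=10000.0 * 2 ** (128 / 126))
```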