AutoGPTQ: Add UI and command line support for disabling fused attention and fused MLP (#2648)
This commit is contained in:
parent 909d8c6ae3
commit 646b0c889f
5 changed files with 11 additions and 3 deletions
@@ -43,6 +43,8 @@ def load_quantized(model_name):
         'model_basename': pt_path.stem,
         'device': "cuda:0" if not shared.args.cpu else "cpu",
         'use_triton': shared.args.triton,
+        'inject_fused_attention': not shared.args.no_inject_fused_attention,
+        'inject_fused_mlp': not shared.args.no_inject_fused_mlp,
         'use_safetensors': use_safetensors,
         'trust_remote_code': shared.args.trust_remote_code,
         'max_memory': get_max_memory_dict(),
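This hunk appears to be the params dict that the loader forwards to AutoGPTQ, with the two new entries translating the negative command-line flags into AutoGPTQ's positively-named injection options. A minimal standalone sketch of that call, assuming the standard auto_gptq API and a hypothetical local model path (not the project's actual loader):

# Sketch only: shows where params of this shape end up, assuming auto_gptq is installed.
from auto_gptq import AutoGPTQForCausalLM

params = {
    'model_basename': 'model',            # hypothetical basename
    'device': 'cuda:0',
    'use_triton': False,
    'inject_fused_attention': False,      # result of passing --no_inject_fused_attention
    'inject_fused_mlp': True,             # fused MLP left enabled
    'use_safetensors': True,
    'trust_remote_code': False,
}

# from_quantized accepts these options as keyword arguments.
model = AutoGPTQForCausalLM.from_quantized('models/my-gptq-model', **params)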
@@ -142,6 +142,8 @@ parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fu
 parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
 parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
 parser.add_argument('--triton', action='store_true', help='Use triton.')
+parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
+parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
 parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')

 # FlexGen
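Both new flags default to off, so fused attention and fused MLP stay enabled unless explicitly disabled. A minimal standalone sketch (not the project's shared.py) of how the "no_*" flags invert into the positive inject_* options used above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--no_inject_fused_attention', action='store_true',
                    help='Do not use fused attention (lowers VRAM requirements).')
parser.add_argument('--no_inject_fused_mlp', action='store_true',
                    help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')

# Passing only the first flag here, purely for illustration.
args = parser.parse_args(['--no_inject_fused_attention'])

# The loader negates each "no_*" flag to obtain the inject_* option it hands to AutoGPTQ.
print(not args.no_inject_fused_attention)  # False -> fused attention disabled
print(not args.no_inject_fused_mlp)        # True  -> fused MLP still enabled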
@@ -30,7 +30,7 @@ theme = gr.themes.Default(


 def list_model_elements():
-    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
+    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'gptq_for_llama', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed']
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')

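The elements list appears to name the settings the UI gathers when loading a model, so adding the two new keys exposes them as model-loader options alongside the existing ones. A small standalone sketch of the per-GPU expansion at the end of that function, assuming torch is installed and with the element list abbreviated for illustration:

import torch

# Abbreviated element list; the real one carries every model-loading setting.
elements = ['triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp']

# One 'gpu_memory_<i>' entry is appended per visible CUDA device.
for i in range(torch.cuda.device_count()):
    elements.append(f'gpu_memory_{i}')

print(elements)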