AutoGPTQ: Add UI and command line support for disabling fused attention and fused MLP (#2648)

2023-06-16 03:59:54 +01:00 · 2023-06-16 03:59:54 +01:00 · 646b0c889f
commit 646b0c889f
parent 909d8c6ae3
5 changed files with 11 additions and 3 deletions
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -142,6 +142,8 @@ parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fu
 parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
 parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
 parser.add_argument('--triton', action='store_true', help='Use triton.')
+parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
+parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
 parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')

 # FlexGen