AutoGPTQ: Add UI and command line support for disabling fused attention and fused MLP (#2648)

Tom Jobbins 2023-06-16 03:59:54 +01:00 committed by GitHub
parent 909d8c6ae3
commit 646b0c889f
5 changed files with 11 additions and 3 deletions

@@ -142,6 +142,8 @@ parser.add_argument('--fused_mlp', action='store_true', help='(triton) Enable fu
 parser.add_argument('--gptq-for-llama', action='store_true', help='Use GPTQ-for-LLaMa to load the GPTQ model instead of AutoGPTQ.')
 parser.add_argument('--autogptq', action='store_true', help='DEPRECATED')
 parser.add_argument('--triton', action='store_true', help='Use triton.')
+parser.add_argument('--no_inject_fused_attention', action='store_true', help='Do not use fused attention (lowers VRAM requirements).')
+parser.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: Do not use fused MLP (lowers VRAM requirements).')
 parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.')
 # FlexGen
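
For context: AutoGPTQ's AutoGPTQForCausalLM.from_quantized() accepts positive inject_fused_attention and inject_fused_mlp keyword arguments, so the new negative CLI flags would be inverted when loading the model. A minimal sketch of that plumbing follows; the model path, device choice, and the load_quantized_sketch wrapper are illustrative assumptions, not code from this commit.

from auto_gptq import AutoGPTQForCausalLM

def load_quantized_sketch(args):
    # The negative CLI flags invert into positive 'inject_*' kwargs:
    # passing --no_inject_fused_attention disables fused-attention
    # injection, trading some speed for lower VRAM usage.
    return AutoGPTQForCausalLM.from_quantized(
        'models/llama-7b-gptq',  # hypothetical model directory
        device='cuda:0',
        use_triton=args.triton,  # fused MLP only applies in Triton mode
        inject_fused_attention=not args.no_inject_fused_attention,
        inject_fused_mlp=not args.no_inject_fused_mlp,
    )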