Add --load-in-4bit parameter (#2320)
parent 63ce5f9c28
commit 361451ba60
6 changed files with 61 additions and 22 deletions
@@ -149,7 +149,7 @@ def huggingface_loader(model_name):
         LoaderClass = AutoModelForCausalLM

     # Load the model in simple 16-bit mode by default
-    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
+    if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
         model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code)
         if torch.has_mps:
             device = torch.device('mps')
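The condition above is what routes loading away from the plain 16-bit fast path; with this commit, --load-in-4bit joins the list of flags that fall through to the params-based loader further down. A minimal sketch of the same predicate as a standalone helper (the helper name and the Namespace construction are illustrative, not part of the commit):

from argparse import Namespace

def uses_simple_16bit_path(args: Namespace) -> bool:
    # True only when no CPU/offload/quantization flag is set,
    # i.e. the model can be loaded directly in 16-bit.
    return not any([
        args.cpu,
        args.load_in_8bit,
        args.load_in_4bit,  # newly taken into account by this commit
        args.auto_devices,
        args.disk,
        args.deepspeed,
        args.gpu_memory is not None,
        args.cpu_memory is not None,
    ])

# With only --load-in-4bit set, the simple path is skipped:
example = Namespace(cpu=False, load_in_8bit=False, load_in_4bit=True, auto_devices=False,
                    disk=False, deepspeed=False, gpu_memory=None, cpu_memory=None)
assert uses_simple_16bit_path(example) is False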
@@ -179,7 +179,21 @@ def huggingface_loader(model_name):
             params["torch_dtype"] = torch.float32
         else:
             params["device_map"] = 'auto'
-            if shared.args.load_in_8bit and any((shared.args.auto_devices, shared.args.gpu_memory)):
+            if shared.args.load_in_4bit:
+
+                # See https://github.com/huggingface/transformers/pull/23479/files
+                # and https://huggingface.co/blog/4bit-transformers-bitsandbytes
+                quantization_config_params = {
+                    'load_in_4bit': True,
+                    'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
+                    'bnb_4bit_quant_type': shared.args.quant_type,
+                    'bnb_4bit_use_double_quant': shared.args.use_double_quant,
+                }
+
+                logger.warning("Using the following 4-bit params: " + str(quantization_config_params))
+                params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
+
+            elif shared.args.load_in_8bit and any((shared.args.auto_devices, shared.args.gpu_memory)):
                 params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
             elif shared.args.load_in_8bit:
                 params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
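The new branch builds a BitsAndBytesConfig from the four CLI options and passes it to from_pretrained. The same path can be exercised outside the webui; below is a minimal sketch using only transformers and bitsandbytes, where the model name and dtype choice are placeholders rather than anything taken from this commit, and getattr(torch, ...) stands in for the eval("torch.{}") lookup above:

# Standalone 4-bit load with transformers + bitsandbytes (assumes a CUDA GPU).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

compute_dtype = getattr(torch, "bfloat16")  # same result as eval("torch.bfloat16"), without eval

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for the actual matmuls
    bnb_4bit_quant_type="nf4",             # "nf4" or "fp4"
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)

model_name = "facebook/opt-350m"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0], skip_special_tokens=True))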
@@ -114,13 +114,19 @@ parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maxmimum GPU memo
 parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
 parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
 parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".')
-parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
+parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
 parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
 parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
 parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
 parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
 parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM.")

+# Accelerate 4-bit
+parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
+parser.add_argument('--compute_dtype', type=str, default="bfloat16", help="compute dtype for 4-bit. Valid options: bfloat16, float16, float32.")
+parser.add_argument('--quant_type', type=str, default="nf4", help='quant_type for 4-bit. Valid options: nf4, fp4.')
+parser.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')
+
 # llama.cpp
 parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
 parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
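The four new options map directly onto the BitsAndBytesConfig fields used in the loader above. Assuming the project's usual server.py entry point and a placeholder model name, a typical invocation might look like:

python server.py --model <model-name> --load-in-4bit --compute_dtype bfloat16 --quant_type nf4 --use_double_quant

Since --compute_dtype defaults to bfloat16 and --quant_type defaults to nf4, those two flags can be omitted when the defaults are acceptable; --use_double_quant is off unless passed explicitly.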
@@ -30,9 +30,10 @@ theme = gr.themes.Default(


 def list_model_elements():
-    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers']
+    elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers']
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')

     return elements