diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py index 662182e..7045a09 100644 --- a/modules/GPTQ_loader.py +++ b/modules/GPTQ_loader.py @@ -1,3 +1,4 @@ +import re import sys from pathlib import Path @@ -56,16 +57,20 @@ def load_quantized(model_name): # Multiple GPUs or GPU+CPU if shared.args.gpu_memory: + memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory)) + max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' max_memory = {} - for i in range(len(shared.args.gpu_memory)): - max_memory[i] = f"{shared.args.gpu_memory[i]}GiB" - max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" + for i in range(len(memory_map)): + max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i] + max_memory['cpu'] = max_cpu_memory device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"]) - model = accelerate.dispatch_model(model, device_map=device_map) + print("Using the following device map for the 4-bit model:", device_map) + # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model + model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True) # Single GPU - else: + elif not shared.args.cpu: model = model.to(torch.device('cuda:0')) return model