From 9a3bed50c3f51c505b7ea57433c8018c7375d535 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 20 Mar 2023 15:11:56 -0300
Subject: [PATCH] Attempt at fixing 4-bit with CPU offload

---
 modules/GPTQ_loader.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index 662182e..7045a09 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -1,3 +1,4 @@
+import re
 import sys
 from pathlib import Path
 
@@ -56,16 +57,20 @@ def load_quantized(model_name):
 
     # Multiple GPUs or GPU+CPU
     if shared.args.gpu_memory:
+        memory_map = list(map(lambda x : x.strip(), shared.args.gpu_memory))
+        max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
         max_memory = {}
-        for i in range(len(shared.args.gpu_memory)):
-            max_memory[i] = f"{shared.args.gpu_memory[i]}GiB"
-        max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB"
+        for i in range(len(memory_map)):
+            max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
+        max_memory['cpu'] = max_cpu_memory
 
         device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
-        model = accelerate.dispatch_model(model, device_map=device_map)
+        print("Using the following device map for the 4-bit model:", device_map)
+        # https://huggingface.co/docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model
+        model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True)
 
     # Single GPU
-    else:
+    elif not shared.args.cpu:
         model = model.to(torch.device('cuda:0'))
 
     return model