Intel Gpu support initialization (#4340)

2023-10-27 08:09:51 +05:30 · 2023-10-27 08:09:51 +05:30 · 778a010df8
commit 778a010df8
parent 317e2c857e
14 changed files with 106 additions and 42 deletions
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@ -5,15 +5,15 @@ from pathlib import Path
 import accelerate
 import torch
 import transformers
+from accelerate import is_xpu_available
+from gptq_for_llama import llama_inference_offload
+from gptq_for_llama.modelutils import find_layers
+from gptq_for_llama.quant import make_quant
 from transformers import AutoConfig, AutoModelForCausalLM

 import modules.shared as shared
 from modules.logging_colors import logger

-from gptq_for_llama import llama_inference_offload
-from gptq_for_llama.modelutils import find_layers
-from gptq_for_llama.quant import make_quant
-

 # This function is a replacement for the load_quant function in the
 # GPTQ-for_LLaMa repository. It supports more models and branches.
@ -144,7 +144,7 @@ def load_quantized(model_name):
        model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold)

        # accelerate offload (doesn't work properly)
-        if shared.args.gpu_memory or torch.cuda.device_count() > 1:
+        if shared.args.gpu_memory or torch.cuda.device_count() > 1 or (is_xpu_available() and torch.xpu.device_count() > 1):
            if shared.args.gpu_memory:
                memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
                max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
@ -163,6 +163,9 @@ def load_quantized(model_name):

        # No offload
        elif not shared.args.cpu:
-            model = model.to(torch.device('cuda:0'))
+            if is_xpu_available():
+                model = model.to(torch.device("xpu:0"))
+            else:
+                model = model.to(torch.device('cuda:0'))

    return model