Further refactor

This commit is contained in:
oobabooga 2023-02-23 13:28:30 -03:00
parent ce7feb3641
commit 1dacd34165
8 changed files with 185 additions and 173 deletions

View file

@ -1,6 +1,4 @@
import requests
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration
from transformers import BlipProcessor

View file

@ -1,7 +1,10 @@
import base64
import copy
import io
import json
import re
from datetime import datetime
from io import BytesIO
from pathlib import Path
import modules.shared as shared
@ -10,6 +13,7 @@ from modules.html_generator import generate_chat_html
from modules.text_generation import encode
from modules.text_generation import generate_reply
from modules.text_generation import get_max_prompt_length
from PIL import Image
if shared.args.picture and (shared.args.cai_chat or shared.args.chat):
import modules.bot_picture as bot_picture
@ -328,8 +332,8 @@ def load_character(_character, name1, name2):
history['visible'] += [['', "Hello there!"]]
else:
character = None
context = settings['context_pygmalion']
name2 = settings['name2_pygmalion']
context = shared.settings['context_pygmalion']
name2 = shared.settings['name2_pygmalion']
if Path(f'logs/{character}_persistent.json').exists():
load_history(open(Path(f'logs/{character}_persistent.json'), 'rb').read(), name1, name2)

143
modules/models.py Normal file
View file

@ -0,0 +1,143 @@
import json
import os
import time
import zipfile
from pathlib import Path
import modules.shared as shared
import numpy as np
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
local_rank = None
if shared.args.flexgen:
from flexgen.flex_opt import (Policy, OptLM, TorchDevice, TorchDisk, TorchMixedDevice, CompressionConfig, Env, get_opt_config)
if shared.args.deepspeed:
import deepspeed
from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_zero3_enabled
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
def load_model(model_name):
print(f"Loading {model_name}...")
t0 = time.time()
# Default settings
if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16).cuda()
# FlexGen
elif shared.args.flexgen:
gpu = TorchDevice("cuda:0")
cpu = TorchDevice("cpu")
disk = TorchDisk(shared.args.disk_cache_dir)
env = Env(gpu=gpu, cpu=cpu, disk=disk, mixed=TorchMixedDevice([gpu, cpu, disk]))
# Offloading policy
policy = Policy(1, 1,
shared.args.percent[0], shared.args.percent[1],
shared.args.percent[2], shared.args.percent[3],
shared.args.percent[4], shared.args.percent[5],
overlap=True, sep_layer=True, pin_weight=True,
cpu_cache_compute=False, attn_sparsity=1.0,
compress_weight=shared.args.compress_weight,
comp_weight_config=CompressionConfig(
num_bits=4, group_size=64,
group_dim=0, symmetric=False),
compress_cache=False,
comp_cache_config=CompressionConfig(
num_bits=4, group_size=64,
group_dim=2, symmetric=False))
opt_config = get_opt_config(f"facebook/{shared.model_name}")
model = OptLM(opt_config, env, "models", policy)
model.init_all_weights()
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16)
model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0]
model.module.eval() # Inference
print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}")
# Custom
else:
command = "AutoModelForCausalLM.from_pretrained"
params = ["low_cpu_mem_usage=True"]
if not shared.args.cpu and not torch.cuda.is_available():
print("Warning: no GPU has been detected.\nFalling back to CPU mode.\n")
shared.args.cpu = True
if shared.args.cpu:
params.append("low_cpu_mem_usage=True")
params.append("torch_dtype=torch.float32")
else:
params.append("device_map='auto'")
params.append("load_in_8bit=True" if shared.args.load_in_8bit else "torch_dtype=torch.bfloat16" if shared.args.bf16 else "torch_dtype=torch.float16")
if shared.args.gpu_memory:
params.append(f"max_memory={{0: '{shared.args.gpu_memory or '99'}GiB', 'cpu': '{shared.args.cpu_memory or '99'}GiB'}}")
elif not shared.args.load_in_8bit:
total_mem = (torch.cuda.get_device_properties(0).total_memory/(1024*1024))
suggestion = round((total_mem-1000)/1000)*1000
if total_mem-suggestion < 800:
suggestion -= 1000
suggestion = int(round(suggestion/1000))
print(f"\033[1;32;1mAuto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors.\nYou can manually set other values.\033[0;37;0m")
params.append(f"max_memory={{0: '{suggestion}GiB', 'cpu': '{shared.args.cpu_memory or '99'}GiB'}}")
if shared.args.disk:
params.append(f"offload_folder='{shared.args.disk_cache_dir}'")
command = f"{command}(Path(f'models/{shared.model_name}'), {', '.join(set(params))})"
model = eval(command)
# Loading the tokenizer
if shared.model_name.lower().startswith(('gpt4chan', 'gpt-4chan', '4chan')) and Path(f"models/gpt-j-6B/").exists():
tokenizer = AutoTokenizer.from_pretrained(Path("models/gpt-j-6B/"))
else:
tokenizer = AutoTokenizer.from_pretrained(Path(f"models/{shared.model_name}/"))
tokenizer.truncation_side = 'left'
print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
return model, tokenizer
def load_soft_prompt(name):
if name == 'None':
shared.soft_prompt = False
shared.soft_prompt_tensor = None
else:
with zipfile.ZipFile(Path(f'softprompts/{name}.zip')) as zf:
zf.extract('tensor.npy')
zf.extract('meta.json')
j = json.loads(open('meta.json', 'r').read())
print(f"\nLoading the softprompt \"{name}\".")
for field in j:
if field != 'name':
if type(j[field]) is list:
print(f"{field}: {', '.join(j[field])}")
else:
print(f"{field}: {j[field]}")
print()
tensor = np.load('tensor.npy')
Path('tensor.npy').unlink()
Path('meta.json').unlink()
tensor = torch.Tensor(tensor).to(device=shared.model.device, dtype=shared.model.dtype)
tensor = torch.reshape(tensor, (1, tensor.shape[0], tensor.shape[1]))
shared.soft_prompt = True
shared.soft_prompt_tensor = tensor
return name

View file

@ -6,6 +6,7 @@ model_name = ""
soft_prompt_tensor = None
soft_prompt = False
stop_everything = False
settings = {}
parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog,max_help_position=54))
parser.add_argument('--model', type=str, help='Name of the model to load by default.')

View file

@ -1,15 +1,17 @@
import re
import time
import modules.shared as shared
import numpy as np
import torch
import transformers
from modules.extensions import apply_extensions
from modules.html_generator import generate_4chan_html
from modules.html_generator import generate_basic_html
from modules.models import local_rank
from modules.stopping_criteria import _SentinelTokenStoppingCriteria
from tqdm import tqdm
def get_max_prompt_length(tokens):
max_length = 2048-tokens
if shared.soft_prompt: