This commit is contained in:
Xan 2023-03-08 22:08:54 +11:00
commit 5648a41a27
16 changed files with 352 additions and 166 deletions

View file

@ -1,96 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
import json
import os
import sys
import time
from pathlib import Path
from typing import Tuple
import fire
import torch
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from llama import LLaMA, ModelArgs, Tokenizer, Transformer
# Single-process defaults for torch.distributed: one rank, one worker,
# rendezvous on the loopback interface. Applied at import time, before any
# process group is created.
os.environ.update({
    'RANK': '0',
    'WORLD_SIZE': '1',
    'MP': '1',
    'MASTER_ADDR': '127.0.0.1',
    'MASTER_PORT': '2223',
})
def setup_model_parallel() -> Tuple[int, int]:
    """Initialise the distributed process group and model parallelism.

    ``LOCAL_RANK`` and ``WORLD_SIZE`` are read from the environment; each
    falls back to ``-1`` when the variable is not set.

    Returns:
        The ``(local_rank, world_size)`` pair that was read.
    """
    local_rank, world_size = (
        int(os.environ.get(variable, -1))
        for variable in ("LOCAL_RANK", "WORLD_SIZE")
    )

    torch.distributed.init_process_group("gloo")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # Every process has to start from the same seed.
    torch.manual_seed(1)

    return local_rank, world_size
def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    """Build a ``LLaMA`` generator from a checkpoint directory.

    Args:
        ckpt_dir: Directory holding the ``*.pth`` shards and ``params.json``.
        tokenizer_path: Path handed to ``Tokenizer(model_path=...)``.
        local_rank: Index into the sorted list of ``*.pth`` files selecting
            the shard this process loads.
        world_size: Must equal the number of ``*.pth`` files in ``ckpt_dir``.
        max_seq_len: Forwarded to ``ModelArgs``.
        max_batch_size: Forwarded to ``ModelArgs``.

    Returns:
        A ``LLaMA`` wrapping the loaded transformer and its tokenizer.

    Raises:
        AssertionError: If the shard count does not match ``world_size``.
    """
    start_time = time.time()
    ckpt_root = Path(ckpt_dir)
    checkpoints = sorted(ckpt_root.glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(ckpt_root / "params.json", "r") as f:
        params = json.load(f)

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words

    # The model is constructed with CUDA half tensors as the global default.
    # Restore the float default even when construction fails, otherwise the
    # process-wide default would stay switched for all later tensor creation.
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    try:
        model = Transformer(model_args)
    finally:
        torch.set_default_tensor_type(torch.FloatTensor)

    # Loaded with strict=False, as in the original code.
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator
class LLaMAModel:
    """Thin wrapper exposing a LLaMA generator through ``generate``."""

    def __init__(self):
        # Set by from_pretrained(); None until a model has been loaded.
        self.pipeline = None

    @classmethod
    def from_pretrained(cls, path, max_seq_len=2048, max_batch_size=1):
        """Load the model stored in directory ``path``.

        Args:
            path: Model directory (``str`` or ``Path``). The tokenizer is
                expected at ``<path>/tokenizer.model``.
            max_seq_len: Forwarded to ``load``.
            max_batch_size: Forwarded to ``load``.

        Returns:
            A new instance whose ``pipeline`` is the loaded generator.
        """
        # Accept plain strings as well as Path objects.
        model_dir = Path(path)
        tokenizer_path = os.path.abspath(model_dir / "tokenizer.model")
        ckpt_dir = os.path.abspath(model_dir)

        local_rank, world_size = setup_model_parallel()
        if local_rank > 0:
            # Only rank 0 prints; the other ranks are silenced for the rest
            # of the process lifetime.
            sys.stdout = open(os.devnull, "w")

        generator = load(
            ckpt_dir, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
        )

        result = cls()
        result.pipeline = generator
        return result

    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
        """Generate a completion for a single ``prompt``.

        Args:
            prompt: Text to continue.
            token_count: Passed to the pipeline as ``max_gen_len``.
            temperature: Sampling temperature passed to the pipeline.
            top_p: Nucleus sampling threshold passed to the pipeline.

        Returns:
            The first (and only) element of the pipeline's result list.

        Raises:
            RuntimeError: If no model has been loaded yet.
        """
        if self.pipeline is None:
            raise RuntimeError("No model loaded; call LLaMAModel.from_pretrained() first.")
        results = self.pipeline.generate(
            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
        )
        return results[0]

View file

@ -1,14 +1,17 @@
import os
from pathlib import Path
from queue import Queue
from threading import Thread
import numpy as np
from tokenizers import Tokenizer
import modules.shared as shared
np.set_printoptions(precision=4, suppress=True, linewidth=200)
os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster)
os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS
@ -32,10 +35,11 @@ class RWKVModel:
result.pipeline = pipeline
return result
def generate(self, context, token_count=20, temperature=1, top_p=1, alpha_frequency=0.25, alpha_presence=0.25, token_ban=[0], token_stop=[], callback=None):
def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None):
args = PIPELINE_ARGS(
temperature = temperature,
top_p = top_p,
top_k = top_k,
alpha_frequency = alpha_frequency, # Frequency Penalty (as in GPT-3)
alpha_presence = alpha_presence, # Presence Penalty (as in GPT-3)
token_ban = token_ban, # ban the generation of some tokens
@ -43,3 +47,64 @@ class RWKVModel:
)
return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
def generate_with_streaming(self, **kwargs):
    """Stream a generation, yielding the cumulative reply after each token.

    Accepts the same keyword arguments as ``generate``. Each yielded value
    is the context followed by everything produced so far.
    """
    # generate() defaults context to "", so a missing key must not raise here.
    reply = kwargs.get('context', '')
    for token in Iteratorize(self.generate, kwargs, callback=None):
        reply += token
        yield reply
class RWKVTokenizer:
    """Wrapper around a ``tokenizers.Tokenizer`` loaded from ``20B_tokenizer.json``."""

    def __init__(self):
        # Set by from_pretrained(); None until a tokenizer file is loaded.
        self.tokenizer = None

    @classmethod
    def from_pretrained(cls, path):
        """Load ``20B_tokenizer.json`` from directory ``path``.

        Args:
            path: Directory containing the tokenizer file (``str`` or ``Path``).

        Returns:
            A new instance wrapping the loaded tokenizer.
        """
        # os.path.join handles both str and Path inputs.
        tokenizer_path = os.path.join(path, "20B_tokenizer.json")
        tokenizer = Tokenizer.from_file(os.path.abspath(tokenizer_path))

        result = cls()
        result.tokenizer = tokenizer
        return result

    def encode(self, prompt):
        """Return the token ids of ``prompt``."""
        return self.tokenizer.encode(prompt).ids

    def decode(self, ids):
        """Return the text decoded from the token ids ``ids``."""
        return self.tokenizer.decode(ids)
class Iteratorize:
    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).

    ``func`` is run on a background thread as
    ``func(callback=<internal>, **kwargs)``. Every value it passes to the
    callback becomes one item of the iteration. If ``func`` raises, the
    exception is re-raised to the consumer once the items produced before the
    failure have been consumed, instead of leaving the consumer blocked.

    Args:
        func: Callable accepting a ``callback`` keyword argument.
        kwargs: Extra keyword arguments for ``func`` (defaults to none).
        callback: Optional callable invoked with ``func``'s return value after
            it completes successfully.
    """

    def __init__(self, func, kwargs=None, callback=None):
        self.mfunc = func
        self.c_callback = callback
        # One slot only: the producer blocks until the consumer takes the item.
        self.q = Queue(maxsize=1)
        self.sentinel = object()
        # Avoid a shared mutable default argument.
        self.kwargs = {} if kwargs is None else kwargs
        self._error = None
        self._finished = False

        def _callback(val):
            self.q.put(val)

        def gentask():
            try:
                ret = self.mfunc(callback=_callback, **self.kwargs)
            except Exception as exc:
                # Hand the failure over to the consuming thread.
                self._error = exc
                return
            finally:
                # Always signal the end, otherwise __next__ would wait forever.
                self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        Thread(target=gentask).start()

    def __iter__(self):
        return self

    def __next__(self):
        # Once exhausted, keep raising StopIteration rather than blocking on
        # an empty queue.
        if self._finished:
            raise StopIteration
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            self._finished = True
            if self._error is not None:
                error, self._error = self._error, None
                raise error
            raise StopIteration
        return obj

View file

@ -51,23 +51,29 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
prompt = ''.join(rows)
return prompt
def extract_message_from_reply(question, reply, current, other, check, extensions=False):
def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
next_character_found = False
substring_found = False
previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(current)}:", question)]
idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(current)}:", reply)]
idx = idx[len(previous_idx)-1]
asker = name1 if not impersonate else name2
replier = name2 if not impersonate else name1
if extensions:
reply = reply[idx + 1 + len(apply_extensions(f"{current}:", "bot_prefix")):]
previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
idx = idx[max(len(previous_idx)-1, 0)]
if not impersonate:
reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
else:
reply = reply[idx + 1 + len(f"{current}:"):]
reply = reply[idx + 1 + len(f"{replier}:"):]
if check:
reply = reply.split('\n')[0].strip()
lines = reply.split('\n')
reply = lines[0].strip()
if len(lines) > 1:
next_character_found = True
else:
idx = reply.find(f"\n{other}:")
idx = reply.find(f"\n{asker}:")
if idx != -1:
reply = reply[:idx]
next_character_found = True
@ -75,7 +81,7 @@ def extract_message_from_reply(question, reply, current, other, check, extension
# Detect if something like "\nYo" is generated just before
# "\nYou:" is completed
tmp = f"\n{other}:"
tmp = f"\n{asker}:"
for j in range(1, len(tmp)):
if reply[-j:] == tmp[:j]:
substring_found = True
@ -89,6 +95,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
shared.stop_everything = False
just_started = True
eos_token = '\n' if check else None
name1_original = name1
if 'pygmalion' in shared.model_name.lower():
name1 = "You"
@ -119,8 +126,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
# Extracting the reply
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name2, name1, check, extensions=True)
visible_reply = apply_extensions(reply, "output")
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check)
visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
visible_reply = apply_extensions(visible_reply, "output")
if shared.args.chat:
visible_reply = visible_reply.replace('\n', '<br>')
@ -139,6 +147,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
yield shared.history['visible']
if next_character_found:
break
yield shared.history['visible']
def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
@ -152,7 +161,7 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
reply = ''
for i in range(chat_generation_attempts):
for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, extensions=False)
reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
if not substring_found:
yield reply
if next_character_found:

View file

@ -39,10 +39,9 @@ def load_model(model_name):
t0 = time.time()
shared.is_RWKV = model_name.lower().startswith('rwkv-')
shared.is_LLaMA = model_name.lower().startswith('llama-')
# Default settings
if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV or shared.is_LLaMA):
if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
else:
@ -80,20 +79,12 @@ def load_model(model_name):
# RWKV model (not on HuggingFace)
elif shared.is_RWKV:
from modules.RWKV import RWKVModel
from modules.RWKV import RWKVModel, RWKVTokenizer
model = RWKVModel.from_pretrained(Path(f'models/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
tokenizer = RWKVTokenizer.from_pretrained(Path('models'))
return model, None
# LLaMA model (not on HuggingFace)
elif shared.is_LLaMA:
import modules.LLaMA
from modules.LLaMA import LLaMAModel
model = LLaMAModel.from_pretrained(Path(f'models/{model_name}'))
return model, None
return model, tokenizer
# Custom
else:

View file

@ -6,7 +6,6 @@ model_name = ""
soft_prompt_tensor = None
soft_prompt = False
is_RWKV = False
is_LLaMA = False
# Chat variables
history = {'internal': [], 'visible': []}
@ -44,7 +43,6 @@ settings = {
'default': 'NovelAI-Sphinx Moth',
'pygmalion-*': 'Pygmalion',
'RWKV-*': 'Naive',
'llama-*': 'Naive',
'(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)'
},
'prompts': {
@ -84,9 +82,10 @@ parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, defaul
parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
parser.add_argument('--rwkv-strategy', type=str, default=None, help='The strategy to use while loading RWKV models. Examples: "cpu fp32", "cuda fp16", "cuda fp16 *30 -> cpu fp32".')
parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time. This improves the text generation performance.')
parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example.')
parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example. If you create a file called settings.json, this file will be loaded by default without the need to use the --settings flag.')
parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')

View file

@ -21,21 +21,20 @@ def get_max_prompt_length(tokens):
return max_length
def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
# These models do not have explicit tokenizers for now, so
# we return an estimate for the number of tokens
if shared.is_RWKV or shared.is_LLaMA:
return np.zeros((1, len(prompt)//4))
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens)
if shared.args.cpu:
if shared.is_RWKV:
input_ids = shared.tokenizer.encode(str(prompt))
input_ids = np.array(input_ids).reshape(1, len(input_ids))
return input_ids
elif shared.args.flexgen:
return input_ids.numpy()
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
else:
return input_ids.cuda()
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens)
if shared.args.cpu:
return input_ids
elif shared.args.flexgen:
return input_ids.numpy()
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
else:
return input_ids.cuda()
def decode(output_ids):
reply = shared.tokenizer.decode(output_ids, skip_special_tokens=True)
@ -81,26 +80,30 @@ def formatted_outputs(reply, model_name):
else:
return reply
def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
def clear_torch_cache():
    """Run the garbage collector and, unless in CPU mode, empty the CUDA cache."""
    gc.collect()
    if shared.args.cpu:
        # No CUDA cache to release when running on CPU.
        return
    torch.cuda.empty_cache()
def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
clear_torch_cache()
t0 = time.time()
# These models are not part of Hugging Face, so we handle them
# separately and terminate the function call earlier
if shared.is_RWKV or shared.is_LLaMA:
if shared.is_RWKV:
if shared.args.no_stream:
reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature, top_p=top_p)
t1 = time.time()
print(f"Output generated in {(t1-t0):.2f} seconds.")
reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
yield formatted_outputs(reply, shared.model_name)
else:
for i in tqdm(range(max_new_tokens//8+1)):
reply = shared.model.generate(question, token_count=8, temperature=temperature, top_p=top_p)
yield formatted_outputs(question, shared.model_name)
# RWKV has proper streaming, which is very nice.
# No need to generate 8 tokens at a time.
for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
yield formatted_outputs(reply, shared.model_name)
question = reply
t1 = time.time()
print(f"Output generated in {(t1-t0):.2f} seconds.")
return
original_question = question
@ -111,8 +114,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
input_ids = encode(question, max_new_tokens)
cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
n = shared.tokenizer.eos_token_id if eos_token is None else encode(eos_token)[0][-1]
n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
if stopping_string is not None:
# The stopping_criteria code below was copied from
# https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
@ -149,14 +151,12 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
f"temperature={temperature}",
f"stop={n}",
]
if shared.args.deepspeed:
generate_params.append("synced_gpus=True")
if shared.args.no_stream:
generate_params.append("max_new_tokens=max_new_tokens")
else:
generate_params.append("max_new_tokens=8")
if shared.soft_prompt:
inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
generate_params.insert(0, "inputs_embeds=inputs_embeds")
@ -184,6 +184,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
yield formatted_outputs(original_question, shared.model_name)
shared.still_streaming = True
for i in tqdm(range(max_new_tokens//8+1)):
clear_torch_cache()
with torch.no_grad():
output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
if shared.soft_prompt: