Remove flexgen support

This commit is contained in:
oobabooga 2023-07-25 15:15:29 -07:00
parent 5134d5b1c6
commit 75c2dd38cf
8 changed files with 3 additions and 233 deletions

View file

@@ -56,7 +56,6 @@ def load_model(model_name, loader=None):
        'GPTQ-for-LLaMa': GPTQ_loader,
        'llama.cpp': llamacpp_loader,
        'llamacpp_HF': llamacpp_HF_loader,
        'FlexGen': flexgen_loader,
        'RWKV': RWKV_loader,
        'ExLlama': ExLlama_loader,
        'ExLlama_HF': ExLlama_HF_loader
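
Note: the hunk above removes the 'FlexGen' entry from the dispatch table that load_model() uses to pick a loader function by name. A minimal, self-contained sketch of that dict-dispatch pattern; the two stub loaders are placeholders, not the project's implementations:

# Minimal sketch of dict-based loader dispatch; the loader names are real,
# but these stub functions are placeholders, not the project's code.
def transformers_loader(model_name):
    raise NotImplementedError('placeholder')

def llamacpp_loader(model_name):
    raise NotImplementedError('placeholder')

load_func_map = {
    'Transformers': transformers_loader,
    'llama.cpp': llamacpp_loader,
}

def load_model(model_name, loader='Transformers'):
    if loader not in load_func_map:
        raise ValueError(f'Unknown loader: {loader}')
    return load_func_map[loader](model_name)

With this shape, dropping a backend is a one-line deletion from the map, which is exactly what this commit does for FlexGen.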
@@ -221,32 +220,6 @@ def huggingface_loader(model_name):
    return model

def flexgen_loader(model_name):
    from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy

    # Initialize environment
    env = ExecutionEnv.create(shared.args.disk_cache_dir)

    # Offloading policy
    policy = Policy(1, 1,
                    shared.args.percent[0], shared.args.percent[1],
                    shared.args.percent[2], shared.args.percent[3],
                    shared.args.percent[4], shared.args.percent[5],
                    overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight,
                    cpu_cache_compute=False, attn_sparsity=1.0,
                    compress_weight=shared.args.compress_weight,
                    comp_weight_config=CompressionConfig(
                        num_bits=4, group_size=64,
                        group_dim=0, symmetric=False),
                    compress_cache=False,
                    comp_cache_config=CompressionConfig(
                        num_bits=4, group_size=64,
                        group_dim=2, symmetric=False))

    model = OptLM(f"facebook/{model_name}", env, shared.args.model_dir, policy)
    return model

def RWKV_loader(model_name):
    from modules.RWKV import RWKVModel, RWKVTokenizer
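
Note: for context on what disappears here, the six --percent values fed FlexGen's Policy as placement percentages for weights, KV cache, and activations across GPU and CPU, with the remainder offloaded to disk. A hedged validator for that 6-number convention, independent of the flexgen package; the field names follow FlexGen's documented Policy layout and are an assumption, not project code:

# Hedged sketch: validates the 6-number allocation convention used by the
# removed --percent flag (w_gpu, w_cpu, cache_gpu, cache_cpu, act_gpu, act_cpu).
def check_percent(percent):
    if len(percent) != 6:
        raise ValueError('--percent expects exactly 6 numbers')
    labels = ['w_gpu', 'w_cpu', 'cache_gpu', 'cache_cpu', 'act_gpu', 'act_cpu']
    for label, p in zip(labels, percent):
        if not 0 <= p <= 100:
            raise ValueError(f'{label} must be in [0, 100], got {p}')
    # Each GPU/CPU pair may sum to less than 100; the remainder spills to disk.
    return dict(zip(labels, percent))

check_percent([0, 100, 100, 0, 100, 0])  # the old default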

View file

@@ -30,8 +30,6 @@ def infer_loader(model_name):
        loader = 'llama.cpp'
    elif re.match('.*rwkv.*\.pth', model_name.lower()):
        loader = 'RWKV'
    elif shared.args.flexgen:
        loader = 'FlexGen'
    else:
        loader = 'Transformers'
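
Note: with the shared.args.flexgen branch gone, loader inference falls back to filename patterns alone. A minimal self-contained sketch of the surviving logic; the ggml pattern for llama.cpp is an assumption about the elided lines above this hunk:

import re

# Hedged sketch of name-based loader inference after the FlexGen branch is
# removed; the real infer_loader checks more patterns and settings.
def infer_loader(model_name):
    name = model_name.lower()
    if re.match(r'.*ggml.*\.bin', name):   # assumed pattern, not shown in the hunk
        return 'llama.cpp'
    elif re.match(r'.*rwkv.*\.pth', name):
        return 'RWKV'
    return 'Transformers'

print(infer_loader('rwkv-4-pile-169m.pth'))  # RWKV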

View file

@@ -95,7 +95,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten
parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')

# Model loader
parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen')
parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv')

# Accelerate/transformers
parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
@@ -156,7 +156,6 @@ parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM
parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")

# FlexGen
parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).")
@@ -202,9 +201,6 @@ if args.autogptq:
if args.gptq_for_llama:
    logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.')
    args.loader = 'gptq-for-llama'

if args.flexgen:
    logger.warning('--flexgen has been deprecated and will be removed soon. Use --loader flexgen instead.')
    args.loader = 'FlexGen'

# Security warnings
if args.trust_remote_code:
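
Note: the removed block followed the file's deprecation pattern: log a warning, then fold the legacy flag into args.loader so downstream code consults a single setting. A self-contained sketch of that pattern, using stdlib logging in place of the project's logger:

import argparse
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument('--loader', type=str, default=None)
parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
args = parser.parse_args(['--flexgen'])

# Legacy flag handling: warn once, then remap onto --loader so the rest
# of the program only ever reads args.loader.
if args.flexgen:
    logger.warning('--flexgen has been deprecated. Use --loader flexgen instead.')
    args.loader = 'FlexGen'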

View file

@@ -53,8 +53,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
        return input_ids
    elif shared.args.flexgen:
        return input_ids.numpy()
    elif shared.args.deepspeed:
        return input_ids.to(device=local_rank)
    elif torch.backends.mps.is_available():
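
Note: FlexGen was the only backend that wanted prompts as NumPy arrays (input_ids.numpy()); every other branch returns a torch tensor, possibly moved to a device. A hedged sketch of that per-backend placement dispatch; the backend argument and branch set are illustrative, not the function's exact conditions:

import torch

# Hedged sketch of the dispatch the hunk above simplifies: encode() hands
# back token ids in whatever container/device the active backend expects.
def place_input_ids(input_ids: torch.Tensor, backend: str) -> torch.Tensor:
    if backend == 'cpu':
        return input_ids            # plain torch tensor on CPU
    elif backend == 'mps':
        return input_ids.to('mps')  # Apple Silicon
    elif backend == 'cuda':
        return input_ids.cuda()
    return input_ids

ids = torch.tensor([[1, 2, 3]])
print(place_input_ids(ids, 'cpu').shape)  # torch.Size([1, 3])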
@@ -182,8 +180,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
        generate_func = generate_reply_custom
    elif shared.args.flexgen:
        generate_func = generate_reply_flexgen
    else:
        generate_func = generate_reply_HF
@@ -339,66 +335,3 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str
        new_tokens = len(encode(original_question + reply)[0]) - original_tokens
        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
        return

def generate_reply_flexgen(question, original_question, seed, state, stopping_strings=None, is_chat=False):
    generate_params = {}
    for k in ['max_new_tokens', 'do_sample', 'temperature']:
        generate_params[k] = state[k]

    if state['stream']:
        generate_params['max_new_tokens'] = 8

    # Encode the input
    input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
    output = input_ids[0]

    # Find the eos tokens
    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
    if not state['ban_eos_token']:
        generate_params['stop'] = eos_token_ids[-1]

    # Add the encoded tokens to generate_params
    question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
    original_input_ids = input_ids
    generate_params.update({'inputs': input_ids})
    if inputs_embeds is not None:
        generate_params.update({'inputs_embeds': inputs_embeds})

    t0 = time.time()
    try:
        if not is_chat:
            yield ''

        # Generate the entire reply at once.
        if not state['stream']:
            with torch.no_grad():
                output = shared.model.generate(**generate_params)[0]

            yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)

        # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
        else:
            for i in range(state['max_new_tokens'] // 8 + 1):
                if shared.stop_everything:
                    break

                clear_torch_cache()
                with torch.no_grad():
                    output = shared.model.generate(**generate_params)[0]

                if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                    break

                yield get_reply_from_output_ids(output, original_input_ids, original_question, state)
                input_ids = np.reshape(output, (1, output.shape[0]))
                generate_params.update({'inputs': input_ids})

    except Exception:
        traceback.print_exc()
    finally:
        t1 = time.time()
        original_tokens = len(original_input_ids[0])
        new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
        return
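
Note: the generator removed above streamed naively because FlexGen exposed no stopping_criteria hook: it generated fixed 8-token chunks, re-fed each output as the next prompt, and stopped once a chunk introduced a new EOS token. A self-contained sketch of that chunked-streaming loop, with a toy generate() standing in for the model:

import numpy as np

EOS = 0

def toy_generate(input_ids, max_new_tokens=8):
    # Stand-in for shared.model.generate(): appends random tokens, with a
    # small chance of ending the chunk with EOS.
    rng = np.random.default_rng(len(input_ids))
    new = rng.integers(1, 100, size=max_new_tokens)
    if rng.random() < 0.3:
        new[-1] = EOS
    return np.concatenate([input_ids, new])

def stream_in_chunks(prompt_ids, max_new_tokens=64, chunk=8):
    output = np.asarray(prompt_ids)
    for _ in range(max_new_tokens // chunk + 1):
        before = np.count_nonzero(output == EOS)
        output = toy_generate(output, chunk)
        yield output
        # Stop once a chunk introduced a fresh EOS token, as the removed
        # loop did by comparing EOS counts in the input and the output.
        if np.count_nonzero(output == EOS) > before:
            break

for step in stream_in_chunks([5, 6, 7]):
    print(step)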

View file

@@ -71,10 +71,7 @@ def natural_keys(text):
def get_available_models():
    if shared.args.flexgen:
        return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys)
    else:
        return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
    return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
def get_available_presets():
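
Note: both returns above sort with natural_keys, whose definition sits just outside this hunk. A typical natural-sort key of that shape, hedged since only the name appears in the diff:

import re

# Hedged sketch of a natural-sort key like the natural_keys referenced
# above: digit runs compare numerically, so 'model-7b' sorts before
# 'model-13b' instead of after it lexicographically.
def natural_keys(text):
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r'(\d+)', text)]

print(sorted(['model-13b', 'model-7b'], key=natural_keys))
# ['model-7b', 'model-13b']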