From a28f0d8bd703f555e72ee688d74a0024fea4f84b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 3 Feb 2023 09:11:11 -0300
Subject: [PATCH] Show it/s in the same units with or without streaming

Closes #49
---
 server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server.py b/server.py
index fcbda1c..96c89f5 100644
--- a/server.py
+++ b/server.py
@@ -242,7 +242,7 @@ def generate_reply(question, tokens, inference_settings, selected_model, eos_tok
         output = eval(f"model.generate(input_ids, {','.join(generate_params)}, {preset}){cuda}")
         reply = decode(output[0])
         t1 = time.time()
-        print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output[0])-len(input_ids[0]))/(t1-t0):.2f} it/s)")
+        print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output[0])-len(input_ids[0]))/(t1-t0)/8:.2f} it/s, {len(output[0])-len(input_ids[0])} tokens)")
         if not (args.chat or args.cai_chat):
             reply = original_question + apply_extensions(reply[len(question):], "output")
         yield formatted_outputs(reply, model_name)
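
Note on the /8 divisor: the patch does not say so explicitly, but the divisor implies that
the streaming code path advances 8 tokens per iteration, so its it/s counter measures
8-token steps. Dividing the non-streaming tokens-per-second figure by 8 reports the same
unit in both modes. A minimal standalone sketch of the conversion, with dummy stand-ins
for the patch's variables (the 8-tokens-per-iteration chunk size is an assumption inferred
from the divisor):

    # Dummy stand-ins for input_ids, output, t0, t1 (illustrative only)
    input_ids = [[0] * 10]    # prompt of 10 tokens
    output = [[0] * 90]       # prompt + 80 generated tokens
    t0, t1 = 0.0, 4.0         # pretend generation took 4 seconds

    new_tokens = len(output[0]) - len(input_ids[0])   # 80 tokens generated
    tokens_per_second = new_tokens / (t1 - t0)        # 20.0 tokens/s
    # Assumed: streaming advances 8 tokens per iteration, so it/s
    # counts 8-token steps; divide to report the same unit here.
    iterations_per_second = tokens_per_second / 8     # 2.5 it/s
    print(f"Output generated in {t1-t0:.2f} seconds "
          f"({iterations_per_second:.2f} it/s, {new_tokens} tokens)")

The patched print statement also appends the raw new-token count, so the absolute amount
of generated text stays visible even though the rate is now expressed in iterations.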