Add max_tokens_second param (#3533)

This commit is contained in:
oobabooga 2023-08-29 17:44:31 -03:00 committed by GitHub
parent fe1f7c6513
commit cec8db52e5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 24 additions and 3 deletions

View file

@ -80,10 +80,22 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if is_stream:
cur_time = time.time()
if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps
last_update = cur_time
# Maximum number of tokens/second
if state['max_tokens_second'] > 0:
diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
if diff > 0:
time.sleep(diff)
last_update = time.time()
yield reply
# Limit updates to 24 per second to not stress low latency networks
else:
if cur_time - last_update > 0.041666666666666664:
last_update = cur_time
yield reply
if stop_found:
break