Disable logits_all in llamacpp_HF (makes processing 3x faster)

This commit is contained in:
oobabooga 2023-11-07 14:35:48 -08:00
parent 5c3eb22ce6
commit af3d25a503
5 changed files with 5 additions and 1 deletions

View file

@ -113,6 +113,7 @@ parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layer
# Multi-GPU split is kept as the raw comma-separated string the user typed.
parser.add_argument(
    '--tensor_split',
    type=str,
    default=None,
    help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.',
)
# Per the help text, the default of 0 stands for a random seed.
parser.add_argument(
    '--llama_cpp_seed',
    type=int,
    default=0,
    help='Seed for llama-cpp models. Default is 0 (random).',
)
# Boolean switches: False unless the flag is given on the command line.
parser.add_argument(
    '--numa',
    action='store_true',
    help='Activate NUMA task allocation for llama.cpp.',
)
parser.add_argument(
    '--logits_all',
    action='store_true',
    help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.',
)
# Accepted as a string so that unit suffixes (e.g. MiB, GiB) can be supplied.
parser.add_argument(
    '--cache-capacity',
    type=str,
    help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.',
)
# ExLlama