Bump llama-cpp-python to 0.2.24 (#5001)

2023-12-19 15:22:21 -03:00 · 2023-12-19 15:22:21 -03:00 · 0a299d5959
commit 0a299d5959
parent 83cf1a6b67
15 changed files with 104 additions and 96 deletions
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -117,6 +117,7 @@ parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layer
 parser.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.')
 parser.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 parser.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
+parser.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the  K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')

 # ExLlama