Added xformers support to Llama (#950)
This commit is contained in:
parent
625d81f495
commit
992663fa20
4 changed files with 185 additions and 0 deletions
|
@ -98,6 +98,8 @@ parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directo
|
|||
parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.')
|
||||
parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
|
||||
parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.')
|
||||
parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.")
|
||||
parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.")
|
||||
|
||||
# llama.cpp
|
||||
parser.add_argument('--threads', type=int, default=0, help='Number of threads to use in llama.cpp.')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue