Added xformers support to Llama (#950)

This commit is contained in:
MarkovInequality 2023-04-09 22:08:40 -04:00 committed by GitHub
parent 625d81f495
commit 992663fa20
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 185 additions and 0 deletions

View file

@ -98,6 +98,8 @@ parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directo
# Precision and cache switches; each is a boolean flag that stays False unless given.
parser.add_argument(
    '--load-in-8bit',
    help='Load the model with 8-bit precision.',
    action='store_true',
)
parser.add_argument(
    '--bf16',
    help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.',
    action='store_true',
)
parser.add_argument(
    '--no-cache',
    help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.',
    action='store_true',
)

# Alternative attention implementations, also opt-in boolean flags.
parser.add_argument(
    '--xformers',
    help="Use xformer's memory efficient attention. This should increase your tokens/s.",
    action='store_true',
)
parser.add_argument(
    '--sdp-attention',
    help="Use torch 2.0's sdp attention.",
    action='store_true',
)

# llama.cpp
parser.add_argument(
    '--threads',
    help='Number of threads to use in llama.cpp.',
    type=int,
    default=0,
)