commit 6d3a9b8689
11 changed files with 174 additions and 177 deletions
.gitignore (vendored), 9 changes

@@ -26,9 +26,10 @@
 .DS_Store
 .eslintrc.js
 .idea
-.env
 .venv
 venv
+.envrc
+.direnv
 .vscode
 *.bak
 *.ipynb
@@ -40,3 +41,9 @@ package.json
 package-lock.json
 Thumbs.db
 wandb
+
+# ignore user docker config and top level links to docker files
+/docker-compose.yaml
+/docker-compose.yml
+/Dockerfile
+.env
README.md

@@ -163,9 +163,12 @@ The requirments*.txt above contain various precompiled wheels. If you wish to co
 ### Alternative: Docker
 
 ```
-ln -s docker/{Dockerfile,docker-compose.yml,.dockerignore} .
+ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
-# Edit .env and set TORCH_CUDA_ARCH_LIST based on your GPU model
+# Edit .env and set:
+# TORCH_CUDA_ARCH_LIST based on your GPU model
+# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
+# BUILD_EXTENIONS optionally add comma separated list of extensions to build
 docker compose up --build
 ```
 
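The new instructions ask for APP_RUNTIME_GID, which is just the host user's numeric group id; `id -g` prints it. For completeness, a minimal Python equivalent (illustrative only, not part of this commit):

```python
# Illustrative sketch, not part of this commit: print the current user's numeric
# group id, i.e. the value the README suggests putting in APP_RUNTIME_GID.
import os

print(os.getgid())  # same number `id -g` prints, e.g. 1000
```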
docker/.env.example

@@ -2,19 +2,21 @@
 # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5
 # https://developer.nvidia.com/cuda-gpus you can find the version for your card here
 TORCH_CUDA_ARCH_LIST=7.5
 
 # your command-line flags go here:
-CLI_ARGS=
+CLI_ARGS=--listen
 
 # the port the webui binds to on the host
 HOST_PORT=7860
 # the port the webui binds to inside the container
 CONTAINER_PORT=7860
 
 # the port the api binds to on the host
 HOST_API_PORT=5000
 # the port the api binds to inside the container
 CONTAINER_API_PORT=5000
-# the version used to install text-generation-webui from
-WEBUI_VERSION=HEAD
+# Comma separated extensions to build
+BUILD_EXTENSIONS=""
+# Set APP_RUNTIME_GID to an appropriate host system group to enable access to mounted volumes
+# You can find your current host user group id with the command `id -g`
+APP_RUNTIME_GID=6972
+# override default app build permissions (handy for deploying to cloud)
+#APP_GID=6972
+#APP_UID=6972
docker/Dockerfile (deleted)

@@ -1,77 +0,0 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as builder
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN git clone --depth=1 https://github.com/oobabooga/GPTQ-for-LLaMa /build
-
-WORKDIR /build
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    python3 -m venv /build/venv && \
-    . /build/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install -r requirements.txt
-
-# https://developer.nvidia.com/cuda-gpus
-# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5"
-ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
-RUN . /build/venv/bin/activate && \
-    python3 setup_cuda.py bdist_wheel -d .
-
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
-
-LABEL maintainer="Your Name <your.email@example.com>"
-LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ ffmpeg && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw pip3 install virtualenv
-
-RUN mkdir /app
-
-WORKDIR /app
-
-ARG WEBUI_VERSION
-RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
-
-# Create virtualenv
-RUN virtualenv /app/venv
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install torchvision torchaudio sentence_transformers
-
-# Copy and install GPTQ-for-LLaMa
-COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
-
-# Install main requirements
-COPY requirements.txt /app/requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install -r requirements.txt
-
-COPY . /app/
-
-RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
-
-# Install extension requirements
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    for ext in /app/extensions/*/requirements.txt; do \
-    cd "$(dirname "$ext")"; \
-    pip3 install -r requirements.txt; \
-    done
-
-ENV CLI_ARGS=""
-
-EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000}
-CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS}
docker/docker-compose.yml

@@ -6,27 +6,30 @@ services:
       args:
         # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
         TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
-        WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
+        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
+        APP_GID: ${APP_GID:-6972}
+        APP_UID: ${APP_UID-6972}
     env_file: .env
+    user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
       - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
       - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
     stdin_open: true
     tty: true
     volumes:
-      - ./characters:/app/characters
-      - ./extensions:/app/extensions
-      - ./loras:/app/loras
-      - ./models:/app/models
-      - ./presets:/app/presets
-      - ./prompts:/app/prompts
-      - ./softprompts:/app/softprompts
-      - ./training:/app/training
+      - ./characters:/home/app/text-generation-webui/characters
+      - ./extensions:/home/app/text-generation-webui/extensions
+      - ./loras:/home/app/text-generation-webui/loras
+      - ./models:/home/app/text-generation-webui/models
+      - ./presets:/home/app/text-generation-webui/presets
+      - ./prompts:/home/app/text-generation-webui/prompts
+      - ./softprompts:/home/app/text-generation-webui/softprompts
+      - ./training:/home/app/text-generation-webui/training
       - ./cloudflared:/etc/cloudflared
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['0']
+              count: all
               capabilities: [gpu]
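With the new `user:` entry the service runs as `${APP_RUNTIME_UID}:${APP_RUNTIME_GID}`, so the bind-mounted host directories above need to be readable and writable by that group; that is what the APP_RUNTIME_GID comment in .env.example refers to. A rough pre-flight check, illustrative only and not part of the commit (the directory list simply mirrors the volumes above):

```python
# Illustrative sketch, not part of this commit: check that the bind-mounted host
# directories are group-accessible to the gid the container will run as.
import os
import stat

APP_RUNTIME_GID = 6972  # whatever you set in .env
mounts = ["characters", "extensions", "loras", "models", "presets",
          "prompts", "softprompts", "training"]

for name in mounts:
    if not os.path.isdir(name):
        print(f"{name}: missing on host")
        continue
    st = os.stat(name)
    group_rw = bool(st.st_mode & stat.S_IRGRP) and bool(st.st_mode & stat.S_IWGRP)
    ok = st.st_gid == APP_RUNTIME_GID and group_rw
    print(f"{name}: gid={st.st_gid} group_rw={group_rw} ok={ok}")
```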
docker/nvidia/Dockerfile (new file), 56 additions

@@ -0,0 +1,56 @@
+# BUILDER
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as builder
+WORKDIR /builder
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
+ARG BUILD_EXTENSIONS="${BUILD_EXTENSIONS:-}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+# create / update build env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git vim build-essential python3-dev pip && \
+    rm -rf /var/lib/apt/lists/*
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    pip3 install --global --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# build wheels for runtime
+WORKDIR /home/app/build
+COPY --chown=app:app_grp requirements.txt /home/app/build
+COPY --chown=app:app_grp extensions /home/app/build/extensions
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    # build all requirements files as wheel dists
+    pip3 wheel -w wheels -r requirements.txt `echo "$BUILD_EXTENSIONS" | sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'`
+# drop wheel and setuptools .whl to avoid install issues
+RUN rm wheels/setuptools*.whl
+
+# RUNTIME
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+ENV CLI_ARGS=""
+# create / update runtime env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git python3 pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip3 install --global --no-cache --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# install locally built wheels for app
+WORKDIR /home/app/wheels
+COPY --from=builder /home/app/build/wheels /home/app/wheels
+COPY --chown=app:app_grp . /home/app/text-generation-webui
+RUN umask 0002 && \
+    chmod g+rwX /home/app/text-generation-webui && \
+    pip3 install --global --no-build-isolation --no-cache --no-index ./*.whl && \
+    rm -r /home/app/wheels
+WORKDIR /home/app/text-generation-webui
+EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
+# set umask to ensure group read / write at runtime
+CMD umask 0002 && export HOME=/home/app && python3 server.py ${CLI_ARGS}
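The `pip3 wheel` line in the builder stage is the densest part of the new Dockerfile: the embedded `sed` turns the comma separated BUILD_EXTENSIONS value into one `-r <extension>/requirements.txt` flag per extension. The same substitution rendered in Python (illustrative only, not part of the commit; the extension names are made-up example values):

```python
# Illustrative sketch, not part of this commit: what the builder-stage sed expands
# BUILD_EXTENSIONS into before it is appended to the `pip3 wheel` command.
import re

build_extensions = "openai,whisper_stt"  # hypothetical value for BUILD_EXTENSIONS
pip_args = re.sub(
    r"([^,]+)\s*,?\s*",
    r" -r /home/app/build/extensions/\1/requirements.txt",
    build_extensions,
)
# An empty BUILD_EXTENSIONS matches nothing, so only the main requirements.txt is built.
print(f"pip3 wheel -w wheels -r requirements.txt{pip_args}")
# pip3 wheel -w wheels -r requirements.txt -r /home/app/build/extensions/openai/requirements.txt -r /home/app/build/extensions/whisper_stt/requirements.txt
```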
extensions/openai/script.py

@@ -364,7 +364,7 @@ def run_server():
 
         logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n')
 
-    if shared.args.admin_key:
+    if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
 
     uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
models/config.yaml

@@ -182,3 +182,7 @@
   instruction_template: 'Alpaca'
 .*orca-2-(13|7)b:
   instruction_template: 'ChatML'
+.*openhermes.*mistral:
+  instruction_template: 'ChatML'
+.*Yi-34B-Chat:
+  instruction_template: 'ChatML'
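Like the existing entries, the new keys are regular expressions: any model whose name matches gets the listed instruction template. A quick sketch of the matching (illustrative only, not part of the commit, and assuming the webui's usual case-insensitive comparison against the model folder name):

```python
# Illustrative sketch, not part of this commit: how the new config.yaml keys act as
# regex patterns that select an instruction template for a matching model name.
import re

templates = {
    r".*openhermes.*mistral": "ChatML",
    r".*Yi-34B-Chat": "ChatML",
}

model_name = "OpenHermes-2.5-Mistral-7B"  # hypothetical model folder name
for pattern, template in templates.items():
    if re.match(pattern.lower(), model_name.lower()):
        print(f"{model_name} -> instruction_template: {template}")
```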
modules/llamacpp_model.py

@@ -46,7 +46,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def __del__(self):
-        self.model.__del__()
+        del self.model
 
     @classmethod
     def from_pretrained(self, path):
@@ -64,7 +64,8 @@ class LlamaCppModel:
             else:
                 cache_capacity = int(shared.args.cache_capacity)
 
-        logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
+        if cache_capacity > 0:
+            logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
 
         if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
             tensor_split_list = None
@@ -118,9 +119,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def generate(self, prompt, state, callback=None):
-
         LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
-
         prompt = prompt if type(prompt) is str else prompt.decode()
 
         # Handle truncation
@@ -163,6 +162,7 @@ class LlamaCppModel:
         for completion_chunk in completion_chunks:
             if shared.stop_everything:
                 break
+
             text = completion_chunk['choices'][0]['text']
             output += text
             if callback:
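The `__del__` change replaces an explicit destructor call with a plain `del` of the attribute, so the wrapped llama.cpp object is released through normal reference counting instead of having its finalizer run once by hand and possibly again when the object is finally collected. A small sketch of the difference (illustrative only, not part of the commit; the class names are invented):

```python
# Illustrative sketch, not part of this commit: calling __del__() by hand runs the
# finalizer immediately AND again when the object is reclaimed, while `del` only
# drops the reference and lets reference counting finalize the object once.
class FakeLlama:
    def __del__(self):
        print("freeing native llama.cpp state")

class Wrapper:
    def __init__(self):
        self.model = FakeLlama()

    def release_old_way(self):
        self.model.__del__()   # explicit call: finalizer runs now...

    def release_new_way(self):
        del self.model         # reference dropped: finalizer runs exactly once

w = Wrapper()
w.release_old_way()   # prints once here
del w                 # ...and prints again here: the native state is "freed" twice
```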
modules/loaders.py

@@ -27,68 +27,6 @@ loaders_and_params = OrderedDict({
         'disable_exllama',
         'transformers_info'
     ],
-    'ExLlama_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'cfg_cache',
-        'no_use_fast',
-        'exllama_HF_info',
-    ],
-    'ExLlamav2_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'cfg_cache',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-        'no_use_fast',
-    ],
-    'ExLlama': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'exllama_info',
-    ],
-    'ExLlamav2': [
-        'gpu_split',
-        'max_seq_len',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-    ],
-    'AutoGPTQ': [
-        'triton',
-        'no_inject_fused_attention',
-        'no_inject_fused_mlp',
-        'no_use_cuda_fp16',
-        'wbits',
-        'groupsize',
-        'desc_act',
-        'disable_exllama',
-        'gpu_memory',
-        'cpu_memory',
-        'cpu',
-        'disk',
-        'auto_devices',
-        'trust_remote_code',
-        'no_use_fast',
-        'autogptq_info',
-    ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
@@ -125,14 +63,43 @@ loaders_and_params = OrderedDict({
         'logits_all',
         'llamacpp_HF_info',
     ],
-    'ctransformers': [
-        'n_ctx',
-        'n_gpu_layers',
-        'n_batch',
-        'threads',
-        'model_type',
-        'no_mmap',
-        'mlock'
+    'ExLlama_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cfg_cache',
+        'no_use_fast',
+        'exllama_HF_info',
+    ],
+    'ExLlamav2_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'no_use_fast',
+    ],
+    'AutoGPTQ': [
+        'triton',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'wbits',
+        'groupsize',
+        'desc_act',
+        'disable_exllama',
+        'gpu_memory',
+        'cpu_memory',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'trust_remote_code',
+        'no_use_fast',
+        'autogptq_info',
     ],
     'AutoAWQ': [
         'cpu_memory',
@@ -142,7 +109,40 @@ loaders_and_params = OrderedDict({
         'no_inject_fused_attention',
         'trust_remote_code',
         'no_use_fast',
-    ]
+    ],
+    'GPTQ-for-LLaMa': [
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'no_use_fast',
+        'gptq_for_llama_info',
+    ],
+    'ExLlama': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'exllama_info',
+    ],
+    'ExLlamav2': [
+        'gpu_split',
+        'max_seq_len',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'no_mmap',
+        'mlock'
+    ],
 })
 
 loaders_samplers = {
modules/text_generation.py

@@ -78,10 +78,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
     # Generate
     for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)
 
-        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
-
         if is_stream:
             cur_time = time.time()
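The reordering applies the stop-string check to the raw reply and only escapes afterwards; once `html.escape` has rewritten characters such as `<`, a stop string containing them can no longer be found. A sketch of the effect (illustrative only, not part of the commit; `apply_stopping_strings_demo` is a simplified stand-in for the real helper):

```python
# Illustrative sketch, not part of this commit: why stop-string trimming has to run
# before html.escape - escaping rewrites characters, so a stop string containing '<'
# is no longer present in the escaped text.
import html

def apply_stopping_strings_demo(reply, stop_strings):
    # simplified stand-in for the real apply_stopping_strings helper
    for stop in stop_strings:
        idx = reply.find(stop)
        if idx != -1:
            return reply[:idx], True
    return reply, False

raw = "Hello!</s> trailing junk"
stops = ["</s>"]

trimmed, found = apply_stopping_strings_demo(raw, stops)
print(html.escape(trimmed))                                   # "Hello!" - trimmed, then escaped
print(apply_stopping_strings_demo(html.escape(raw), stops))   # escape first: stop string never found
```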