Merge pull request #4773 from oobabooga/dev

Merge dev branch
Commit 6d3a9b8689 by oobabooga, 2023-11-30 02:31:37 -03:00, committed by GitHub
GPG key ID: 4AEE18F83AFDEB23 (no known key found for this signature in database)
11 changed files with 174 additions and 177 deletions

.gitignore (vendored, 11 lines changed)

@@ -26,9 +26,10 @@
 .DS_Store
 .eslintrc.js
 .idea
-.env
 .venv
 venv
+.envrc
+.direnv
 .vscode
 *.bak
 *.ipynb
@@ -39,4 +40,10 @@ key.pem
 package.json
 package-lock.json
 Thumbs.db
 wandb
+# ignore user docker config and top level links to docker files
+/docker-compose.yaml
+/docker-compose.yml
+/Dockerfile
+.env

README.md

@@ -163,9 +163,12 @@ The requirments*.txt above contain various precompiled wheels. If you wish to co
 ### Alternative: Docker

 ```
-ln -s docker/{Dockerfile,docker-compose.yml,.dockerignore} .
+ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
-# Edit .env and set TORCH_CUDA_ARCH_LIST based on your GPU model
+# Edit .env and set:
+# TORCH_CUDA_ARCH_LIST based on your GPU model
+# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
+# BUILD_EXTENSIONS optionally add comma separated list of extensions to build
 docker compose up --build
 ```

docker/.env.example

@@ -2,19 +2,21 @@
 # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5
 # https://developer.nvidia.com/cuda-gpus you can find the version for your card here
 TORCH_CUDA_ARCH_LIST=7.5
 # your command-line flags go here:
-CLI_ARGS=
+CLI_ARGS=--listen
 # the port the webui binds to on the host
 HOST_PORT=7860
 # the port the webui binds to inside the container
 CONTAINER_PORT=7860
 # the port the api binds to on the host
 HOST_API_PORT=5000
 # the port the api binds to inside the container
 CONTAINER_API_PORT=5000
-# the version used to install text-generation-webui from
-WEBUI_VERSION=HEAD
+# Comma separated extensions to build
+BUILD_EXTENSIONS=""
+# Set APP_RUNTIME_GID to an appropriate host system group to enable access to mounted volumes
+# You can find your current host user group id with the command `id -g`
+APP_RUNTIME_GID=6972
+# override default app build permissions (handy for deploying to cloud)
+#APP_GID=6972
+#APP_UID=6972

docker/Dockerfile (deleted)

@@ -1,77 +0,0 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as builder
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \
-rm -rf /var/lib/apt/lists/*
-RUN git clone --depth=1 https://github.com/oobabooga/GPTQ-for-LLaMa /build
-WORKDIR /build
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-python3 -m venv /build/venv && \
-. /build/venv/bin/activate && \
-pip3 install --upgrade pip setuptools wheel ninja && \
-pip3 install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121 && \
-pip3 install -r requirements.txt
-# https://developer.nvidia.com/cuda-gpus
-# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5"
-ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
-RUN . /build/venv/bin/activate && \
-python3 setup_cuda.py bdist_wheel -d .
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
-LABEL maintainer="Your Name <your.email@example.com>"
-LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ ffmpeg && \
-rm -rf /var/lib/apt/lists/*
-RUN --mount=type=cache,target=/root/.cache/pip,rw pip3 install virtualenv
-RUN mkdir /app
-WORKDIR /app
-ARG WEBUI_VERSION
-RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
-# Create virtualenv
-RUN virtualenv /app/venv
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-. /app/venv/bin/activate && \
-pip3 install --upgrade pip setuptools wheel ninja && \
-pip3 install torch xformers --index-url https://download.pytorch.org/whl/cu121 && \
-pip3 install torchvision torchaudio sentence_transformers
-# Copy and install GPTQ-for-LLaMa
-COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-. /app/venv/bin/activate && \
-pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
-# Install main requirements
-COPY requirements.txt /app/requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-. /app/venv/bin/activate && \
-pip3 install -r requirements.txt
-COPY . /app/
-RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
-# Install extension requirements
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-. /app/venv/bin/activate && \
-for ext in /app/extensions/*/requirements.txt; do \
-cd "$(dirname "$ext")"; \
-pip3 install -r requirements.txt; \
-done
-ENV CLI_ARGS=""
-EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000}
-CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS}

docker/docker-compose.yml

@@ -5,28 +5,31 @@ services:
       context: .
       args:
         # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
         TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
-        WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
+        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
+        APP_GID: ${APP_GID:-6972}
+        APP_UID: ${APP_UID-6972}
     env_file: .env
+    user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
       - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
       - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
     stdin_open: true
     tty: true
     volumes:
-      - ./characters:/app/characters
-      - ./extensions:/app/extensions
-      - ./loras:/app/loras
-      - ./models:/app/models
-      - ./presets:/app/presets
-      - ./prompts:/app/prompts
-      - ./softprompts:/app/softprompts
-      - ./training:/app/training
+      - ./characters:/home/app/text-generation-webui/characters
+      - ./extensions:/home/app/text-generation-webui/extensions
+      - ./loras:/home/app/text-generation-webui/loras
+      - ./models:/home/app/text-generation-webui/models
+      - ./presets:/home/app/text-generation-webui/presets
+      - ./prompts:/home/app/text-generation-webui/prompts
+      - ./softprompts:/home/app/text-generation-webui/softprompts
+      - ./training:/home/app/text-generation-webui/training
       - ./cloudflared:/etc/cloudflared
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['0']
+              count: all
              capabilities: [gpu]
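
Note that the new build args mix two substitution forms: `${APP_GID:-6972}` falls back to the default when the variable is unset or empty, while `${APP_UID-6972}` falls back only when it is unset. A minimal Python model of that distinction, for illustration only (Compose and POSIX shells implement this natively):

```python
# Illustrative model of the two default-substitution forms used above.
def subst_colon_dash(env: dict, var: str, default: str) -> str:
    # ${VAR:-default}: use the default when VAR is unset OR empty
    value = env.get(var)
    return default if value in (None, "") else value


def subst_dash(env: dict, var: str, default: str) -> str:
    # ${VAR-default}: use the default only when VAR is unset
    return env[var] if var in env else default


env = {"APP_UID": ""}
print(subst_colon_dash(env, "APP_UID", "6972"))  # -> 6972
print(subst_dash(env, "APP_UID", "6972"))        # -> '' (the empty value is kept)
```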

docker/nvidia/Dockerfile (new file, 56 lines)

@@ -0,0 +1,56 @@
+# BUILDER
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as builder
+WORKDIR /builder
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
+ARG BUILD_EXTENSIONS="${BUILD_EXTENSIONS:-}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+# create / update build env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+apt update && \
+apt install --no-install-recommends -y git vim build-essential python3-dev pip && \
+rm -rf /var/lib/apt/lists/*
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+pip3 install --global --upgrade pip wheel setuptools && \
+# make shared builder & runtime app user
+addgroup --gid $APP_GID app_grp && \
+useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# build wheels for runtime
+WORKDIR /home/app/build
+COPY --chown=app:app_grp requirements.txt /home/app/build
+COPY --chown=app:app_grp extensions /home/app/build/extensions
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+# build all requirements files as wheel dists
+pip3 wheel -w wheels -r requirements.txt `echo "$BUILD_EXTENSIONS" | sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'`
+# drop wheel and setuptools .whl to avoid install issues
+RUN rm wheels/setuptools*.whl
+# RUNTIME
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+ENV CLI_ARGS=""
+# create / update runtime env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+apt update && \
+apt install --no-install-recommends -y git python3 pip && \
+rm -rf /var/lib/apt/lists/* && \
+pip3 install --global --no-cache --upgrade pip wheel setuptools && \
+# make shared builder & runtime app user
+addgroup --gid $APP_GID app_grp && \
+useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# install locally built wheels for app
+WORKDIR /home/app/wheels
+COPY --from=builder /home/app/build/wheels /home/app/wheels
+COPY --chown=app:app_grp . /home/app/text-generation-webui
+RUN umask 0002 && \
+chmod g+rwX /home/app/text-generation-webui && \
+pip3 install --global --no-build-isolation --no-cache --no-index ./*.whl && \
+rm -r /home/app/wheels
+WORKDIR /home/app/text-generation-webui
+EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
+# set umask to ensure group read / write at runtime
+CMD umask 0002 && export HOME=/home/app && python3 server.py ${CLI_ARGS}
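
The `pip3 wheel` line in the builder stage relies on a sed expression to turn the comma separated `BUILD_EXTENSIONS` value into additional `-r <extension>/requirements.txt` arguments. A rough Python mirror of that expansion, for reference only (the extension names below are example values, not defaults):

```python
import re


def expand_build_extensions(build_extensions: str) -> str:
    # Mirrors: sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'
    return re.sub(
        r'([^,]+)\s*,?\s*',
        r' -r /home/app/build/extensions/\1/requirements.txt',
        build_extensions,
    )


print(expand_build_extensions("openai,whisper_stt"))
# ' -r /home/app/build/extensions/openai/requirements.txt -r /home/app/build/extensions/whisper_stt/requirements.txt'
```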

extensions/openai/script.py

@@ -364,7 +364,7 @@ def run_server():
         logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n')

-    if shared.args.admin_key:
+    if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')

     uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
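
The extra condition stops the admin key from being announced separately when it holds the same value as the API key, presumably because the admin key falls back to the API key when no dedicated one is supplied. A small standalone sketch of the logging guard, using a hypothetical `args` object rather than the project's `shared.args`:

```python
# Hypothetical stand-in for shared.args; not the project's actual object.
class Args:
    api_key = "sk-example"
    admin_key = "sk-example"  # same secret, e.g. when no separate admin key was given


args = Args()

if args.api_key:
    print(f"OpenAI API key: {args.api_key}")

# Only announce the admin key when it is actually a different secret.
if args.admin_key and args.admin_key != args.api_key:
    print(f"OpenAI API admin key: {args.admin_key}")
```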

models/config.yaml

@@ -182,3 +182,7 @@
   instruction_template: 'Alpaca'
 .*orca-2-(13|7)b:
   instruction_template: 'ChatML'
+.*openhermes.*mistral:
+  instruction_template: 'ChatML'
+.*Yi-34B-Chat:
+  instruction_template: 'ChatML'
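
The keys in this file are regular expressions matched against model names to pick a default instruction template. A hedged sketch of how such a mapping can be applied; the lookup helper is illustrative rather than the project's actual code, and the patterns are lowercased here for a case-insensitive match:

```python
import re

template_map = {
    r'.*orca-2-(13|7)b': 'ChatML',
    r'.*openhermes.*mistral': 'ChatML',
    r'.*yi-34b-chat': 'ChatML',
}


def guess_template(model_name: str, default: str = 'Alpaca') -> str:
    # First pattern that matches the lowercased model name wins.
    name = model_name.lower()
    for pattern, template in template_map.items():
        if re.match(pattern, name):
            return template

    return default


print(guess_template('OpenHermes-2.5-Mistral-7B'))  # -> ChatML
print(guess_template('some-unknown-model'))         # -> Alpaca
```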

modules/llamacpp_model.py

@@ -46,7 +46,7 @@ class LlamaCppModel:
         self.grammar = None

     def __del__(self):
-        self.model.__del__()
+        del self.model

     @classmethod
     def from_pretrained(self, path):
@@ -64,7 +64,8 @@ class LlamaCppModel:
         else:
             cache_capacity = int(shared.args.cache_capacity)

-        logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
+        if cache_capacity > 0:
+            logger.info("Cache capacity is " + str(cache_capacity) + " bytes")

         if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
             tensor_split_list = None
@@ -118,9 +119,7 @@ class LlamaCppModel:
         self.grammar = None

     def generate(self, prompt, state, callback=None):
         LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
         prompt = prompt if type(prompt) is str else prompt.decode()

         # Handle truncation
@@ -163,6 +162,7 @@ class LlamaCppModel:
         for completion_chunk in completion_chunks:
             if shared.stop_everything:
                 break
             text = completion_chunk['choices'][0]['text']
             output += text
             if callback:
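
Two behavioural notes on the hunks above: the cache-capacity message is now only logged when a cache capacity was actually requested, and the destructor drops its reference with `del` instead of invoking `__del__` directly. A minimal sketch of the hazard that second change avoids, using toy classes rather than the project's code:

```python
class Backend:
    """Toy stand-in for a native llama.cpp handle."""

    def __del__(self):
        print("freeing native resources")


class Wrapper:
    def __init__(self):
        self.model = Backend()

    def close_badly(self):
        # Runs the finalizer immediately...
        self.model.__del__()
        # ...and the interpreter may run it AGAIN when the object is collected,
        # so native cleanup can happen twice.

    def close_cleanly(self):
        # Only removes this reference; Python runs __del__ once,
        # when the last reference disappears.
        del self.model


w = Wrapper()
w.close_cleanly()  # prints "freeing native resources" exactly once
```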

modules/loaders.py

@@ -27,68 +27,6 @@ loaders_and_params = OrderedDict({
         'disable_exllama',
         'transformers_info'
     ],
-    'ExLlama_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'cfg_cache',
-        'no_use_fast',
-        'exllama_HF_info',
-    ],
-    'ExLlamav2_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'cfg_cache',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-        'no_use_fast',
-    ],
-    'ExLlama': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'exllama_info',
-    ],
-    'ExLlamav2': [
-        'gpu_split',
-        'max_seq_len',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-    ],
-    'AutoGPTQ': [
-        'triton',
-        'no_inject_fused_attention',
-        'no_inject_fused_mlp',
-        'no_use_cuda_fp16',
-        'wbits',
-        'groupsize',
-        'desc_act',
-        'disable_exllama',
-        'gpu_memory',
-        'cpu_memory',
-        'cpu',
-        'disk',
-        'auto_devices',
-        'trust_remote_code',
-        'no_use_fast',
-        'autogptq_info',
-    ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
@@ -125,14 +63,43 @@ loaders_and_params = OrderedDict({
         'logits_all',
         'llamacpp_HF_info',
     ],
-    'ctransformers': [
-        'n_ctx',
-        'n_gpu_layers',
-        'n_batch',
-        'threads',
-        'model_type',
-        'no_mmap',
-        'mlock'
+    'ExLlama_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cfg_cache',
+        'no_use_fast',
+        'exllama_HF_info',
+    ],
+    'ExLlamav2_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'no_use_fast',
+    ],
+    'AutoGPTQ': [
+        'triton',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'wbits',
+        'groupsize',
+        'desc_act',
+        'disable_exllama',
+        'gpu_memory',
+        'cpu_memory',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'trust_remote_code',
+        'no_use_fast',
+        'autogptq_info',
     ],
     'AutoAWQ': [
         'cpu_memory',
@@ -142,7 +109,40 @@ loaders_and_params = OrderedDict({
         'no_inject_fused_attention',
         'trust_remote_code',
         'no_use_fast',
-    ]
+    ],
+    'GPTQ-for-LLaMa': [
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'no_use_fast',
+        'gptq_for_llama_info',
+    ],
+    'ExLlama': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'exllama_info',
+    ],
+    'ExLlamav2': [
+        'gpu_split',
+        'max_seq_len',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'no_mmap',
+        'mlock'
+    ],
 })

 loaders_samplers = {
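
The loaders hunks appear to be a pure reordering: no loader gains or loses parameters, but because `loaders_and_params` is an `OrderedDict`, insertion order is what any list or menu built from its keys will show. A tiny illustration with abbreviated, made-up parameter lists:

```python
from collections import OrderedDict

loaders_and_params = OrderedDict({
    'Transformers': ['gpu_memory', 'cpu_memory'],
    'llama.cpp': ['n_ctx', 'n_gpu_layers'],
    'ExLlama_HF': ['gpu_split', 'max_seq_len'],
})

# Whatever order the entries are declared in is the order consumers see.
print(list(loaders_and_params.keys()))
# ['Transformers', 'llama.cpp', 'ExLlama_HF']
```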

modules/text_generation.py

@@ -78,10 +78,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
     # Generate
     for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)

-        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if is_stream:
             cur_time = time.time()
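
This hunk swaps the order of stop-string handling and HTML escaping inside the streaming loop. A short illustration of why the order matters, assuming the intent is to match stop strings against the raw reply before it is escaped for display:

```python
import html

reply = "The answer is 42.</s> trailing text the stop string should cut"
stop = "</s>"

# Escaping first hides the stop string: '<' and '>' become entities.
print(stop in html.escape(reply))  # False

# Matching against the raw text first, then escaping the trimmed reply, works.
idx = reply.find(stop)
trimmed = reply[:idx] if idx != -1 else reply
print(html.escape(trimmed))        # 'The answer is 42.'
```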