diff --git a/.gitignore b/.gitignore
index a30fd14..cf47b62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,9 +26,10 @@
 .DS_Store
 .eslintrc.js
 .idea
-.env
 .venv
 venv
+.envrc
+.direnv
 .vscode
 *.bak
 *.ipynb
@@ -39,4 +40,10 @@ key.pem
 package.json
 package-lock.json
 Thumbs.db
-wandb
\ No newline at end of file
+wandb
+
+# ignore user docker config and top level links to docker files
+/docker-compose.yaml
+/docker-compose.yml
+/Dockerfile
+.env
diff --git a/README.md b/README.md
index 1e22c18..d08a9c5 100644
--- a/README.md
+++ b/README.md
@@ -163,9 +163,12 @@ The requirments*.txt above contain various precompiled wheels. If you wish to co
 
 ### Alternative: Docker
 
 ```
-ln -s docker/{Dockerfile,docker-compose.yml,.dockerignore} .
+ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
-# Edit .env and set TORCH_CUDA_ARCH_LIST based on your GPU model
+# Edit .env and set:
+#   TORCH_CUDA_ARCH_LIST based on your GPU model
+#   APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
+#   BUILD_EXTENSIONS optionally, a comma separated list of extensions to build
 docker compose up --build
 ```
diff --git a/docker/.env.example b/docker/.env.example
index b254f53..bc46e95 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -2,19 +2,21 @@
 # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5
 # https://developer.nvidia.com/cuda-gpus you can find the version for your card here
 TORCH_CUDA_ARCH_LIST=7.5
-
 # your command-line flags go here:
-CLI_ARGS=
-
+CLI_ARGS=--listen
 # the port the webui binds to on the host
 HOST_PORT=7860
 # the port the webui binds to inside the container
 CONTAINER_PORT=7860
-
 # the port the api binds to on the host
 HOST_API_PORT=5000
 # the port the api binds to inside the container
 CONTAINER_API_PORT=5000
-
-# the version used to install text-generation-webui from
-WEBUI_VERSION=HEAD
+# Comma separated extensions to build
+BUILD_EXTENSIONS=""
+# Set APP_RUNTIME_GID to an appropriate host system group to enable access to mounted volumes
+# You can find your current host user group id with the command `id -g`
+APP_RUNTIME_GID=6972
+# override default app build permissions (handy for deploying to cloud)
+#APP_GID=6972
+#APP_UID=6972
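For reference, a minimal host-side setup following the README and `docker/.env.example` changes above might look like the sketch below. The `8.6` architecture value and the use of `sed` are illustrative assumptions only; pick the value for your own GPU from https://developer.nvidia.com/cuda-gpus and edit `.env` however you prefer.

```
ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
cp docker/.env.example .env
# example edits; adjust TORCH_CUDA_ARCH_LIST for your GPU model
sed -i 's/^TORCH_CUDA_ARCH_LIST=.*/TORCH_CUDA_ARCH_LIST=8.6/' .env
sed -i "s/^APP_RUNTIME_GID=.*/APP_RUNTIME_GID=$(id -g)/" .env
docker compose up --build
```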
diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 9feea6e..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,77 +0,0 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as builder
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN git clone --depth=1 https://github.com/oobabooga/GPTQ-for-LLaMa /build
-
-WORKDIR /build
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    python3 -m venv /build/venv && \
-    . /build/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install -r requirements.txt
-
-# https://developer.nvidia.com/cuda-gpus
-# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5"
-ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
-RUN . /build/venv/bin/activate && \
-    python3 setup_cuda.py bdist_wheel -d .
-
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
-
-LABEL maintainer="Your Name "
-LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ ffmpeg && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw pip3 install virtualenv
-
-RUN mkdir /app
-
-WORKDIR /app
-
-ARG WEBUI_VERSION
-RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
-
-# Create virtualenv
-RUN virtualenv /app/venv
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install torchvision torchaudio sentence_transformers
-
-# Copy and install GPTQ-for-LLaMa
-COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
-
-# Install main requirements
-COPY requirements.txt /app/requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install -r requirements.txt
-
-COPY . /app/
-
-RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
-
-# Install extension requirements
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    for ext in /app/extensions/*/requirements.txt; do \
-    cd "$(dirname "$ext")"; \
-    pip3 install -r requirements.txt; \
-    done
-
-ENV CLI_ARGS=""
-
-EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000}
-CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS}
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 29767d2..2aa6608 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -5,28 +5,31 @@ services:
       context: .
       args:
         # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
-        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
-        WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
+        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
+        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
+        APP_GID: ${APP_GID:-6972}
+        APP_UID: ${APP_UID:-6972}
     env_file: .env
+    user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
       - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
      - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
     stdin_open: true
     tty: true
     volumes:
-      - ./characters:/app/characters
-      - ./extensions:/app/extensions
-      - ./loras:/app/loras
-      - ./models:/app/models
-      - ./presets:/app/presets
-      - ./prompts:/app/prompts
-      - ./softprompts:/app/softprompts
-      - ./training:/app/training
+      - ./characters:/home/app/text-generation-webui/characters
+      - ./extensions:/home/app/text-generation-webui/extensions
+      - ./loras:/home/app/text-generation-webui/loras
+      - ./models:/home/app/text-generation-webui/models
+      - ./presets:/home/app/text-generation-webui/presets
+      - ./prompts:/home/app/text-generation-webui/prompts
+      - ./softprompts:/home/app/text-generation-webui/softprompts
+      - ./training:/home/app/text-generation-webui/training
       - ./cloudflared:/etc/cloudflared
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['0']
+              count: all
               capabilities: [gpu]
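Because the compose file now runs the container as `${APP_RUNTIME_UID}:${APP_RUNTIME_GID}` and mounts the project directories into `/home/app/text-generation-webui`, the host directories listed under `volumes:` need to be readable (and usually writable) by that group. An illustrative sketch, assuming you use your own login group as `APP_RUNTIME_GID`:

```
id -g    # the gid to put in APP_RUNTIME_GID in .env
chmod -R g+rwX characters extensions loras models presets prompts softprompts training
```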
diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile
new file mode 100644
index 0000000..8da5564
--- /dev/null
+++ b/docker/nvidia/Dockerfile
@@ -0,0 +1,56 @@
+# BUILDER
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as builder
+WORKDIR /builder
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
+ARG BUILD_EXTENSIONS="${BUILD_EXTENSIONS:-}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+# create / update build env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git vim build-essential python3-dev pip && \
+    rm -rf /var/lib/apt/lists/*
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    pip3 install --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# build wheels for runtime
+WORKDIR /home/app/build
+COPY --chown=app:app_grp requirements.txt /home/app/build
+COPY --chown=app:app_grp extensions /home/app/build/extensions
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    # build all requirements files as wheel dists
+    pip3 wheel -w wheels -r requirements.txt `echo "$BUILD_EXTENSIONS" | sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'`
+    # drop wheel and setuptools .whl to avoid install issues
+RUN rm wheels/setuptools*.whl
+
+# RUNTIME
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+ENV CLI_ARGS=""
+# create / update runtime env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git python3 pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip3 install --no-cache --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# install locally built wheels for app
+WORKDIR /home/app/wheels
+COPY --from=builder /home/app/build/wheels /home/app/wheels
+COPY --chown=app:app_grp . /home/app/text-generation-webui
+RUN umask 0002 && \
+    chmod g+rwX /home/app/text-generation-webui && \
+    pip3 install --no-build-isolation --no-cache --no-index ./*.whl && \
+    rm -r /home/app/wheels
+WORKDIR /home/app/text-generation-webui
+EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
+# set umask to ensure group read / write at runtime
+CMD umask 0002 && export HOME=/home/app && python3 server.py ${CLI_ARGS}
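The backtick expression in the builder stage's `pip3 wheel` line expands the comma-separated `BUILD_EXTENSIONS` value into extra `-r` arguments, one per extension `requirements.txt`. A quick illustration with two example extension names (the names are placeholders; any directories under `extensions/` with a `requirements.txt` would work the same way):

```
echo "openai,elevenlabs_tts" | sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'
# -r /home/app/build/extensions/openai/requirements.txt -r /home/app/build/extensions/elevenlabs_tts/requirements.txt
```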
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index 047c339..1958c30 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -364,7 +364,7 @@ def run_server():
 
         logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n')
 
-    if shared.args.admin_key:
+    if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
 
     uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
diff --git a/models/config.yaml b/models/config.yaml
index b39e0eb..f18687a 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -182,3 +182,7 @@
   instruction_template: 'Alpaca'
 .*orca-2-(13|7)b:
   instruction_template: 'ChatML'
+.*openhermes.*mistral:
+  instruction_template: 'ChatML'
+.*Yi-34B-Chat:
+  instruction_template: 'ChatML'
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 7167301..aa0fedb 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -46,7 +46,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def __del__(self):
-        self.model.__del__()
+        del self.model
 
     @classmethod
     def from_pretrained(self, path):
@@ -64,7 +64,8 @@ class LlamaCppModel:
         else:
             cache_capacity = int(shared.args.cache_capacity)
 
-        logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
+        if cache_capacity > 0:
+            logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
 
         if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
             tensor_split_list = None
@@ -118,9 +119,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def generate(self, prompt, state, callback=None):
-        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
-
         prompt = prompt if type(prompt) is str else prompt.decode()
 
         # Handle truncation
@@ -163,6 +162,7 @@ class LlamaCppModel:
         for completion_chunk in completion_chunks:
             if shared.stop_everything:
                 break
+
             text = completion_chunk['choices'][0]['text']
             output += text
             if callback:
diff --git a/modules/loaders.py b/modules/loaders.py
index 5d9836c..42a5cfd 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -27,68 +27,6 @@ loaders_and_params = OrderedDict({
         'disable_exllama',
         'transformers_info'
     ],
-    'ExLlama_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'cfg_cache',
-        'no_use_fast',
-        'exllama_HF_info',
-    ],
-    'ExLlamav2_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'cfg_cache',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-        'no_use_fast',
-    ],
-    'ExLlama': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'exllama_info',
-    ],
-    'ExLlamav2': [
-        'gpu_split',
-        'max_seq_len',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-    ],
-    'AutoGPTQ': [
-        'triton',
-        'no_inject_fused_attention',
-        'no_inject_fused_mlp',
-        'no_use_cuda_fp16',
-        'wbits',
-        'groupsize',
-        'desc_act',
-        'disable_exllama',
-        'gpu_memory',
-        'cpu_memory',
-        'cpu',
-        'disk',
-        'auto_devices',
-        'trust_remote_code',
-        'no_use_fast',
-        'autogptq_info',
-    ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
@@ -125,14 +63,43 @@ loaders_and_params = OrderedDict({
         'logits_all',
         'llamacpp_HF_info',
     ],
-    'ctransformers': [
-        'n_ctx',
-        'n_gpu_layers',
-        'n_batch',
-        'threads',
-        'model_type',
-        'no_mmap',
-        'mlock'
+    'ExLlama_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cfg_cache',
+        'no_use_fast',
+        'exllama_HF_info',
+    ],
+    'ExLlamav2_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'no_use_fast',
+    ],
+    'AutoGPTQ': [
+        'triton',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'wbits',
+        'groupsize',
+        'desc_act',
+        'disable_exllama',
+        'gpu_memory',
+        'cpu_memory',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'trust_remote_code',
+        'no_use_fast',
+        'autogptq_info',
     ],
     'AutoAWQ': [
         'cpu_memory',
@@ -142,7 +109,40 @@ loaders_and_params = OrderedDict({
         'no_inject_fused_attention',
         'trust_remote_code',
         'no_use_fast',
-    ]
+    ],
+    'GPTQ-for-LLaMa': [
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'no_use_fast',
+        'gptq_for_llama_info',
+    ],
+    'ExLlama': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'exllama_info',
+    ],
+    'ExLlamav2': [
+        'gpu_split',
+        'max_seq_len',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'no_mmap',
+        'mlock'
+    ],
 })
 
 loaders_samplers = {
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 6034ef3..3a4c55b 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -78,10 +78,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
     # Generate
     for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)
-
-        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if is_stream:
             cur_time = time.time()