commit 6d3a9b8689
11 changed files with 174 additions and 177 deletions
.gitignore (vendored), 9 changes

@@ -26,9 +26,10 @@
 .DS_Store
 .eslintrc.js
 .idea
-.env
 .venv
 venv
+.envrc
+.direnv
 .vscode
 *.bak
 *.ipynb
@@ -40,3 +41,9 @@ package.json
 package-lock.json
 Thumbs.db
 wandb
+
+# ignore user docker config and top level links to docker files
+/docker-compose.yaml
+/docker-compose.yml
+/Dockerfile
+.env
README.md

@@ -163,9 +163,12 @@ The requirments*.txt above contain various precompiled wheels. If you wish to co
 ### Alternative: Docker
 
 ```
-ln -s docker/{Dockerfile,docker-compose.yml,.dockerignore} .
+ln -s docker/{nvidia/Dockerfile,docker-compose.yml,.dockerignore} .
 cp docker/.env.example .env
-# Edit .env and set TORCH_CUDA_ARCH_LIST based on your GPU model
+# Edit .env and set:
+# TORCH_CUDA_ARCH_LIST based on your GPU model
+# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
+# BUILD_EXTENIONS optionally add comma separated list of extensions to build
 docker compose up --build
 ```
 
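The new instructions ask for APP_RUNTIME_GID, which is just the host user's numeric group id; `id -g` prints it. For completeness, a minimal Python equivalent (illustrative only, not part of this commit):

```python
# Illustrative sketch, not part of this commit: print the current user's numeric
# group id, i.e. the value the README suggests putting in APP_RUNTIME_GID.
import os

print(os.getgid())  # same number `id -g` prints, e.g. 1000
```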
docker/.env.example

@@ -2,19 +2,21 @@
 # however for me to work i had to specify the exact version for my card ( 2060 ) it was 7.5
 # https://developer.nvidia.com/cuda-gpus you can find the version for your card here
 TORCH_CUDA_ARCH_LIST=7.5
 
 # your command-line flags go here:
-CLI_ARGS=
+CLI_ARGS=--listen
 
 # the port the webui binds to on the host
 HOST_PORT=7860
 # the port the webui binds to inside the container
 CONTAINER_PORT=7860
 
 # the port the api binds to on the host
 HOST_API_PORT=5000
 # the port the api binds to inside the container
 CONTAINER_API_PORT=5000
-# the version used to install text-generation-webui from
-WEBUI_VERSION=HEAD
+# Comma separated extensions to build
+BUILD_EXTENSIONS=""
+# Set APP_RUNTIME_GID to an appropriate host system group to enable access to mounted volumes
+# You can find your current host user group id with the command `id -g`
+APP_RUNTIME_GID=6972
+# override default app build permissions (handy for deploying to cloud)
+#APP_GID=6972
+#APP_UID=6972
docker/Dockerfile (deleted)

@@ -1,77 +0,0 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as builder
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN git clone --depth=1 https://github.com/oobabooga/GPTQ-for-LLaMa /build
-
-WORKDIR /build
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    python3 -m venv /build/venv && \
-    . /build/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install -r requirements.txt
-
-# https://developer.nvidia.com/cuda-gpus
-# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5"
-ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
-RUN . /build/venv/bin/activate && \
-    python3 setup_cuda.py bdist_wheel -d .
-
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
-
-LABEL maintainer="Your Name <your.email@example.com>"
-LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI"
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw apt-get update && \
-    apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ ffmpeg && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN --mount=type=cache,target=/root/.cache/pip,rw pip3 install virtualenv
-
-RUN mkdir /app
-
-WORKDIR /app
-
-ARG WEBUI_VERSION
-RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source"
-
-# Create virtualenv
-RUN virtualenv /app/venv
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install --upgrade pip setuptools wheel ninja && \
-    pip3 install torch xformers --index-url https://download.pytorch.org/whl/cu121 && \
-    pip3 install torchvision torchaudio sentence_transformers
-
-# Copy and install GPTQ-for-LLaMa
-COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl
-
-# Install main requirements
-COPY requirements.txt /app/requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    pip3 install -r requirements.txt
-
-COPY . /app/
-
-RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
-
-# Install extension requirements
-RUN --mount=type=cache,target=/root/.cache/pip,rw \
-    . /app/venv/bin/activate && \
-    for ext in /app/extensions/*/requirements.txt; do \
-    cd "$(dirname "$ext")"; \
-    pip3 install -r requirements.txt; \
-    done
-
-ENV CLI_ARGS=""
-
-EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000}
-CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS}
docker/docker-compose.yml

@@ -6,27 +6,30 @@ services:
       args:
         # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
         TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
-        WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
+        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
+        APP_GID: ${APP_GID:-6972}
+        APP_UID: ${APP_UID-6972}
     env_file: .env
+    user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
       - "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
       - "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
     stdin_open: true
     tty: true
     volumes:
-      - ./characters:/app/characters
-      - ./extensions:/app/extensions
-      - ./loras:/app/loras
-      - ./models:/app/models
-      - ./presets:/app/presets
-      - ./prompts:/app/prompts
-      - ./softprompts:/app/softprompts
-      - ./training:/app/training
+      - ./characters:/home/app/text-generation-webui/characters
+      - ./extensions:/home/app/text-generation-webui/extensions
+      - ./loras:/home/app/text-generation-webui/loras
+      - ./models:/home/app/text-generation-webui/models
+      - ./presets:/home/app/text-generation-webui/presets
+      - ./prompts:/home/app/text-generation-webui/prompts
+      - ./softprompts:/home/app/text-generation-webui/softprompts
+      - ./training:/home/app/text-generation-webui/training
       - ./cloudflared:/etc/cloudflared
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ['0']
+              count: all
               capabilities: [gpu]
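With the new `user:` entry the service runs as `${APP_RUNTIME_UID}:${APP_RUNTIME_GID}`, so the bind-mounted host directories above need to be readable and writable by that group; that is what the APP_RUNTIME_GID comment in .env.example refers to. A rough pre-flight check, illustrative only and not part of the commit (the directory list simply mirrors the volumes above):

```python
# Illustrative sketch, not part of this commit: check that the bind-mounted host
# directories are group-accessible to the gid the container will run as.
import os
import stat

APP_RUNTIME_GID = 6972  # whatever you set in .env
mounts = ["characters", "extensions", "loras", "models", "presets",
          "prompts", "softprompts", "training"]

for name in mounts:
    if not os.path.isdir(name):
        print(f"{name}: missing on host")
        continue
    st = os.stat(name)
    group_rw = bool(st.st_mode & stat.S_IRGRP) and bool(st.st_mode & stat.S_IWGRP)
    ok = st.st_gid == APP_RUNTIME_GID and group_rw
    print(f"{name}: gid={st.st_gid} group_rw={group_rw} ok={ok}")
```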
docker/nvidia/Dockerfile (new file), 56 additions

@@ -0,0 +1,56 @@
+# BUILDER
+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 as builder
+WORKDIR /builder
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX}"
+ARG BUILD_EXTENSIONS="${BUILD_EXTENSIONS:-}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+# create / update build env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git vim build-essential python3-dev pip && \
+    rm -rf /var/lib/apt/lists/*
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    pip3 install --global --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# build wheels for runtime
+WORKDIR /home/app/build
+COPY --chown=app:app_grp requirements.txt /home/app/build
+COPY --chown=app:app_grp extensions /home/app/build/extensions
+RUN --mount=type=cache,target=/root/.cache/pip,rw \
+    # build all requirements files as wheel dists
+    pip3 wheel -w wheels -r requirements.txt `echo "$BUILD_EXTENSIONS" | sed -r 's/([^,]+)\s*,?\s*/ -r \/home\/app\/build\/extensions\/\1\/requirements.txt/g'`
+# drop wheel and setuptools .whl to avoid install issues
+RUN rm wheels/setuptools*.whl
+
+# RUNTIME
+FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
+ARG TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6}"
+ARG APP_UID="${APP_UID:-6972}"
+ARG APP_GID="${APP_GID:-6972}"
+ENV CLI_ARGS=""
+# create / update runtime env
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
+    apt update && \
+    apt install --no-install-recommends -y git python3 pip && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip3 install --global --no-cache --upgrade pip wheel setuptools && \
+    # make shared builder & runtime app user
+    addgroup --gid $APP_GID app_grp && \
+    useradd -m -u $APP_UID --gid app_grp app
+USER app:app_grp
+# install locally built wheels for app
+WORKDIR /home/app/wheels
+COPY --from=builder /home/app/build/wheels /home/app/wheels
+COPY --chown=app:app_grp . /home/app/text-generation-webui
+RUN umask 0002 && \
+    chmod g+rwX /home/app/text-generation-webui && \
+    pip3 install --global --no-build-isolation --no-cache --no-index ./*.whl && \
+    rm -r /home/app/wheels
+WORKDIR /home/app/text-generation-webui
+EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
+# set umask to ensure group read / write at runtime
+CMD umask 0002 && export HOME=/home/app && python3 server.py ${CLI_ARGS}
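The `pip3 wheel` line in the builder stage is the densest part of the new Dockerfile: the embedded `sed` turns the comma separated BUILD_EXTENSIONS value into one `-r <extension>/requirements.txt` flag per extension. The same substitution rendered in Python (illustrative only, not part of the commit; the extension names are made-up example values):

```python
# Illustrative sketch, not part of this commit: what the builder-stage sed expands
# BUILD_EXTENSIONS into before it is appended to the `pip3 wheel` command.
import re

build_extensions = "openai,whisper_stt"  # hypothetical value for BUILD_EXTENSIONS
pip_args = re.sub(
    r"([^,]+)\s*,?\s*",
    r" -r /home/app/build/extensions/\1/requirements.txt",
    build_extensions,
)
# An empty BUILD_EXTENSIONS matches nothing, so only the main requirements.txt is built.
print(f"pip3 wheel -w wheels -r requirements.txt{pip_args}")
# pip3 wheel -w wheels -r requirements.txt -r /home/app/build/extensions/openai/requirements.txt -r /home/app/build/extensions/whisper_stt/requirements.txt
```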
extensions/openai/script.py

@@ -364,7 +364,7 @@ def run_server():
 
         logger.info(f'OpenAI API key:\n\n{shared.args.api_key}\n')
 
-    if shared.args.admin_key:
+    if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
         logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')
 
     uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
models/config.yaml

@@ -182,3 +182,7 @@
   instruction_template: 'Alpaca'
 .*orca-2-(13|7)b:
   instruction_template: 'ChatML'
+.*openhermes.*mistral:
+  instruction_template: 'ChatML'
+.*Yi-34B-Chat:
+  instruction_template: 'ChatML'
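Like the existing entries, the new keys are regular expressions: any model whose name matches gets the listed instruction template. A quick sketch of the matching (illustrative only, not part of the commit, and assuming the webui's usual case-insensitive comparison against the model folder name):

```python
# Illustrative sketch, not part of this commit: how the new config.yaml keys act as
# regex patterns that select an instruction template for a matching model name.
import re

templates = {
    r".*openhermes.*mistral": "ChatML",
    r".*Yi-34B-Chat": "ChatML",
}

model_name = "OpenHermes-2.5-Mistral-7B"  # hypothetical model folder name
for pattern, template in templates.items():
    if re.match(pattern.lower(), model_name.lower()):
        print(f"{model_name} -> instruction_template: {template}")
```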
modules/llamacpp_model.py

@@ -46,7 +46,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def __del__(self):
-        self.model.__del__()
+        del self.model
 
     @classmethod
     def from_pretrained(self, path):
@@ -64,7 +64,8 @@ class LlamaCppModel:
             else:
                 cache_capacity = int(shared.args.cache_capacity)
 
-        logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
+        if cache_capacity > 0:
+            logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
 
         if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
             tensor_split_list = None
@@ -118,9 +119,7 @@ class LlamaCppModel:
         self.grammar = None
 
     def generate(self, prompt, state, callback=None):
-
         LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
-
         prompt = prompt if type(prompt) is str else prompt.decode()
 
         # Handle truncation
@@ -163,6 +162,7 @@ class LlamaCppModel:
         for completion_chunk in completion_chunks:
             if shared.stop_everything:
                 break
+
             text = completion_chunk['choices'][0]['text']
             output += text
             if callback:
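The `__del__` change replaces an explicit destructor call with a plain `del` of the attribute, so the wrapped llama.cpp object is released through normal reference counting instead of having its finalizer run once by hand and possibly again when the object is finally collected. A small sketch of the difference (illustrative only, not part of the commit; the class names are invented):

```python
# Illustrative sketch, not part of this commit: calling __del__() by hand runs the
# finalizer immediately AND again when the object is reclaimed, while `del` only
# drops the reference and lets reference counting finalize the object once.
class FakeLlama:
    def __del__(self):
        print("freeing native llama.cpp state")

class Wrapper:
    def __init__(self):
        self.model = FakeLlama()

    def release_old_way(self):
        self.model.__del__()   # explicit call: finalizer runs now...

    def release_new_way(self):
        del self.model         # reference dropped: finalizer runs exactly once

w = Wrapper()
w.release_old_way()   # prints once here
del w                 # ...and prints again here: the native state is "freed" twice
```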
modules/loaders.py

@@ -27,68 +27,6 @@ loaders_and_params = OrderedDict({
         'disable_exllama',
         'transformers_info'
     ],
-    'ExLlama_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'cfg_cache',
-        'no_use_fast',
-        'exllama_HF_info',
-    ],
-    'ExLlamav2_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'cfg_cache',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-        'no_use_fast',
-    ],
-    'ExLlama': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'rope_freq_base',
-        'compress_pos_emb',
-        'exllama_info',
-    ],
-    'ExLlamav2': [
-        'gpu_split',
-        'max_seq_len',
-        'no_flash_attn',
-        'cache_8bit',
-        'alpha_value',
-        'compress_pos_emb',
-    ],
-    'AutoGPTQ': [
-        'triton',
-        'no_inject_fused_attention',
-        'no_inject_fused_mlp',
-        'no_use_cuda_fp16',
-        'wbits',
-        'groupsize',
-        'desc_act',
-        'disable_exllama',
-        'gpu_memory',
-        'cpu_memory',
-        'cpu',
-        'disk',
-        'auto_devices',
-        'trust_remote_code',
-        'no_use_fast',
-        'autogptq_info',
-    ],
-    'GPTQ-for-LLaMa': [
-        'wbits',
-        'groupsize',
-        'model_type',
-        'pre_layer',
-        'no_use_fast',
-        'gptq_for_llama_info',
-    ],
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
@@ -125,14 +63,43 @@ loaders_and_params = OrderedDict({
         'logits_all',
         'llamacpp_HF_info',
     ],
-    'ctransformers': [
-        'n_ctx',
-        'n_gpu_layers',
-        'n_batch',
-        'threads',
-        'model_type',
-        'no_mmap',
-        'mlock'
+    'ExLlama_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'cfg_cache',
+        'no_use_fast',
+        'exllama_HF_info',
+    ],
+    'ExLlamav2_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'cfg_cache',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+        'no_use_fast',
+    ],
+    'AutoGPTQ': [
+        'triton',
+        'no_inject_fused_attention',
+        'no_inject_fused_mlp',
+        'no_use_cuda_fp16',
+        'wbits',
+        'groupsize',
+        'desc_act',
+        'disable_exllama',
+        'gpu_memory',
+        'cpu_memory',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'trust_remote_code',
+        'no_use_fast',
+        'autogptq_info',
     ],
     'AutoAWQ': [
         'cpu_memory',
@@ -142,7 +109,40 @@ loaders_and_params = OrderedDict({
         'no_inject_fused_attention',
         'trust_remote_code',
         'no_use_fast',
-    ]
+    ],
+    'GPTQ-for-LLaMa': [
+        'wbits',
+        'groupsize',
+        'model_type',
+        'pre_layer',
+        'no_use_fast',
+        'gptq_for_llama_info',
+    ],
+    'ExLlama': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'rope_freq_base',
+        'compress_pos_emb',
+        'exllama_info',
+    ],
+    'ExLlamav2': [
+        'gpu_split',
+        'max_seq_len',
+        'no_flash_attn',
+        'cache_8bit',
+        'alpha_value',
+        'compress_pos_emb',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'no_mmap',
+        'mlock'
+    ],
 })
 
 loaders_samplers = {
modules/text_generation.py

@@ -78,10 +78,9 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
 
     # Generate
     for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)
 
-        reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
-
         if is_stream:
             cur_time = time.time()
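The reordering applies the stop-string check to the raw reply and only escapes afterwards; once `html.escape` has rewritten characters such as `<`, a stop string containing them can no longer be found. A sketch of the effect (illustrative only, not part of the commit; `apply_stopping_strings_demo` is a simplified stand-in for the real helper):

```python
# Illustrative sketch, not part of this commit: why stop-string trimming has to run
# before html.escape - escaping rewrites characters, so a stop string containing '<'
# is no longer present in the escaped text.
import html

def apply_stopping_strings_demo(reply, stop_strings):
    # simplified stand-in for the real apply_stopping_strings helper
    for stop in stop_strings:
        idx = reply.find(stop)
        if idx != -1:
            return reply[:idx], True
    return reply, False

raw = "Hello!</s> trailing junk"
stops = ["</s>"]

trimmed, found = apply_stopping_strings_demo(raw, stops)
print(html.escape(trimmed))                                   # "Hello!" - trimmed, then escaped
print(apply_stopping_strings_demo(html.escape(raw), stops))   # escape first: stop string never found
```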