
Merge pull request #5496 from oobabooga/dev
Merge dev branch
oobabooga authored Feb 14, 2024
2 parents 0f134bf + 069ed7c commit dc6adef
Showing 19 changed files with 230 additions and 130 deletions.
14 changes: 7 additions & 7 deletions README.md
@@ -75,12 +75,12 @@ conda activate textgen

| System | GPU | Command |
|--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
-| Linux/WSL | CPU only | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6` |
-| MacOS + MPS | Any | `pip3 install torch torchvision torchaudio` |
-| Windows | NVIDIA | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
-| Windows | CPU only | `pip3 install torch torchvision torchaudio` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu121` |
+| Linux/WSL | CPU only | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/rocm5.6` |
+| MacOS + MPS | Any | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.*` |
+| Windows | NVIDIA | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu121` |
+| Windows | CPU only | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.*` |

The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
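
A quick way to check that the pinned builds actually installed (a suggested verification step, not part of the README diff):

```
# Suggested post-install sanity check; confirms the pinned 2.1.x build
# and whether the expected accelerator backend is visible.
import torch

print(torch.__version__)                  # expect a 2.1.x version string
print(torch.cuda.is_available())          # True for the cu121/cu118/ROCm wheels
print(torch.backends.mps.is_available())  # True on macOS with MPS
```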

@@ -145,7 +145,7 @@ Then browse to
1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:

```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu118
conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime
```

4 changes: 4 additions & 0 deletions css/main.css
@@ -408,6 +408,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
margin-bottom: 0 !important;
}

+#default-tab .prose pre, #notebook-tab .prose pre {
+    overflow: scroll;
+}

.message-body code {
white-space: pre-wrap !important;
word-wrap: break-word !important;
63 changes: 63 additions & 0 deletions modules/llama_cpp_python_hijack.py
@@ -0,0 +1,63 @@
from typing import Sequence

from tqdm import tqdm

try:
import llama_cpp
except:
llama_cpp = None

try:
import llama_cpp_cuda
except:
llama_cpp_cuda = None

try:
import llama_cpp_cuda_tensorcores
except:
llama_cpp_cuda_tensorcores = None


def eval_with_progress(self, tokens: Sequence[int]):
"""
A copy of
https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
with tqdm to show prompt processing progress.
"""
assert self._ctx.ctx is not None
assert self._batch.batch is not None
self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)

if len(tokens) > 1:
progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False)
else:
progress_bar = range(0, len(tokens), self.n_batch)

for i in progress_bar:
batch = tokens[i: min(len(tokens), i + self.n_batch)]
n_past = self.n_tokens
n_tokens = len(batch)
self._batch.set_batch(
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
)
self._ctx.decode(self._batch)
# Save tokens
self.input_ids[n_past: n_past + n_tokens] = batch
# Save logits
rows = n_tokens
cols = self._n_vocab
offset = (
0 if self.context_params.logits_all else n_tokens - 1
) # NOTE: Only save the last token logits if logits_all is False
self.scores[n_past + offset: n_past + n_tokens, :].reshape(-1)[
:
] = self._ctx.get_logits()[offset * cols: rows * cols]
# Update n_tokens
self.n_tokens += n_tokens


for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
if lib is not None:
lib.Llama.eval = eval_with_progress
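
The patch is applied purely by side effect: the loop above rebinds `Llama.eval` on whichever llama-cpp variants imported successfully, so a bare import of the module is enough to activate the progress bar. A minimal sketch of the assumed usage (the model path is hypothetical):

```
# Importing the hijack module applies the patch at import time.
from modules import llama_cpp_python_hijack  # noqa: F401
import llama_cpp

llm = llama_cpp.Llama(model_path="models/example.gguf")  # hypothetical path

# Prompt evaluation now routes through eval_with_progress, so prompts
# longer than one token show a tqdm "Prompt evaluation" bar.
llm.eval(llm.tokenize(b"Once upon a time"))
```
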
2 changes: 1 addition & 1 deletion modules/llamacpp_hf.py
@@ -7,7 +7,7 @@
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast

-from modules import RoPE, shared
+from modules import RoPE, llama_cpp_python_hijack, shared
from modules.logging_colors import logger

try:
2 changes: 1 addition & 1 deletion modules/llamacpp_model.py
@@ -4,7 +4,7 @@
import numpy as np
import torch

-from modules import RoPE, shared
+from modules import RoPE, llama_cpp_python_hijack, shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
10 changes: 5 additions & 5 deletions modules/models.py
@@ -54,7 +54,7 @@


def load_model(model_name, loader=None):
logger.info(f"Loading {model_name}")
logger.info(f"Loading \"{model_name}\"")
t0 = time.time()

shared.is_seq2seq = False
@@ -246,7 +246,7 @@ def llamacpp_loader(model_name):
else:
model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]

logger.info(f"llama.cpp weights detected: {model_file}")
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
model, tokenizer = LlamaCppModel.from_pretrained(model_file)
return model, tokenizer

@@ -257,7 +257,7 @@ def llamacpp_HF_loader(model_name):
for fname in [model_name, "oobabooga_llama-tokenizer", "llama-tokenizer"]:
path = Path(f'{shared.args.model_dir}/{fname}')
if all((path / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
-            logger.info(f'Using tokenizer from: {path}')
+            logger.info(f'Using tokenizer from: \"{path}\"')
break
else:
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
@@ -298,7 +298,7 @@ def ctransformers_loader(model_name):
logger.error("Could not find a model for ctransformers.")
return None, None

-    logger.info(f'ctransformers weights detected: {model_file}')
+    logger.info(f'ctransformers weights detected: \"{model_file}\"')
model, tokenizer = ctrans.from_pretrained(model_file)
return model, tokenizer

@@ -393,7 +393,7 @@ def HQQ_loader(model_name):
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.engine.hf import HQQModelForCausalLM

logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")

model_dir = Path(f'{shared.args.model_dir}/{model_name}')
model = HQQModelForCausalLM.from_quantized(str(model_dir))
53 changes: 44 additions & 9 deletions modules/presets.py
@@ -1,4 +1,5 @@
import functools
+import pprint
import random
from pathlib import Path

@@ -90,7 +91,25 @@ def random_preset(state):
'eta_cutoff': [3, 6, 9, 12, 15, 18],
},
'flatten_distribution': {
-            'temperature': [0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0],
+            'temperature': [0.1, 0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0, 5.0],
+            'dynamic_temperature': [
+                [0.1, 1],
+                [0.1, 1.5],
+                [0.1, 2],
+                [0.1, 5],
+                [0.5, 1],
+                [0.5, 1.5],
+                [0.5, 2],
+                [0.5, 5],
+                [0.8, 1],
+                [0.8, 1.5],
+                [0.8, 2],
+                [0.8, 5],
+                [1, 1.5],
+                [1, 2],
+                [1, 5]
+            ],
+            'smoothing_factor': [0.2, 0.3, 0.6, 1.2]
},
'repetition': {
'repetition_penalty': [1, 1.05, 1.1, 1.15, 1.20, 1.25],
@@ -106,26 +125,42 @@ def random_preset(state):
for cat in params_and_values:
choices = list(params_and_values[cat].keys())
if shared.args.loader is not None:
-            choices = [x for x in choices if x in loaders_samplers[shared.args.loader]]
+            choices = [x for x in choices if loader_contains(x)]

if len(choices) > 0:
choice = random.choice(choices)
-            generate_params[choice] = random.choice(params_and_values[cat][choice])
+            value = random.choice(params_and_values[cat][choice])
+            if choice == 'dynamic_temperature':
+                generate_params['dynamic_temperature'] = True
+                generate_params['dynatemp_low'] = value[0]
+                generate_params['dynatemp_high'] = value[1]
+            else:
+                generate_params[choice] = value

state.update(generate_params)
logger.info("GENERATED_PRESET=")
pprint.PrettyPrinter(indent=4, width=1, sort_dicts=False).pprint(remove_defaults(state))
return state, *[generate_params[k] for k in presets_params()]


-def generate_preset_yaml(state):
+def loader_contains(sampler):
+    if sampler == 'dynamic_temperature' and 'dynatemp_low' in loaders_samplers[shared.args.loader]:
+        return True
+    else:
+        return sampler in loaders_samplers[shared.args.loader]


+def remove_defaults(state):
defaults = default_preset()
data = {k: state[k] for k in presets_params()}

# Remove entries that are identical to the defaults.
-    # sampler_priority is always saved because it is experimental
-    # and the default order may change.

for k in list(data.keys()):
-        if data[k] == defaults[k] and k != 'sampler_priority':
+        if data[k] == defaults[k]:
del data[k]

+    return data


+def generate_preset_yaml(state):
+    data = remove_defaults(state)
return yaml.dump(data, sort_keys=False)
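
Unlike the scalar samplers, each `dynamic_temperature` candidate is a `[low, high]` pair, which is why `random_preset` special-cases it into three generation parameters. A standalone sketch of that expansion (the candidate dictionary is trimmed to one category):

```
import random

# Trimmed stand-in for params_and_values['flatten_distribution'].
candidates = {
    'temperature': [0.7, 1.0, 1.5],
    'dynamic_temperature': [[0.1, 1], [0.5, 2], [0.8, 5]],
}

generate_params = {}
choice = random.choice(list(candidates.keys()))
value = random.choice(candidates[choice])
if choice == 'dynamic_temperature':
    # A [low, high] pair expands into a flag plus two bounds.
    generate_params['dynamic_temperature'] = True
    generate_params['dynatemp_low'] = value[0]
    generate_params['dynatemp_high'] = value[1]
else:
    generate_params[choice] = value

print(generate_params)
# e.g. {'dynamic_temperature': True, 'dynatemp_low': 0.5, 'dynatemp_high': 2}
```
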
1 change: 1 addition & 0 deletions modules/sampler_hijack.py
@@ -431,6 +431,7 @@ def custom_sort_key(obj):
if shared.args.verbose:
logger.info("WARPERS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint([x.__class__.__name__ for x in warpers])
+        print()

if normalize is not None:
warpers.append(normalize)
4 changes: 2 additions & 2 deletions modules/text_generation.py
@@ -290,9 +290,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
if k in state:
generate_params[k] = state[k]

-    if isinstance(state['sampler_priority'], list):
+    if isinstance(state['sampler_priority'], list) and len(state['sampler_priority']) > 0:
        generate_params['sampler_priority'] = state['sampler_priority']
-    elif isinstance(state['sampler_priority'], str):
+    elif isinstance(state['sampler_priority'], str) and state['sampler_priority'].strip() != '':
generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()]

if state['negative_prompt'] != '':
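
The string branch accepts commas or newlines as separators, and both branches now reject empty input instead of passing an empty priority list downstream. A quick illustration of the parsing expression (the sampler names are only examples):

```
raw = "temperature\ntop_k, top_p\n"
parsed = [x.strip() for x in raw.replace('\n', ',').split(',') if x.strip()]
print(parsed)  # ['temperature', 'top_k', 'top_p']

# A whitespace-only string fails the new .strip() != '' guard,
# so sampler_priority is simply left unset.
assert "  \n ".strip() == ""
```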
40 changes: 20 additions & 20 deletions requirements.txt
@@ -2,16 +2,15 @@ accelerate==0.25.*
colorama
datasets
einops
-exllamav2==0.0.12; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
-hqq==0.1.2.post1
+hqq==0.1.3
jinja2==3.1.2
lm_eval==0.3.0
markdown
-numpy==1.24.*
+numpy==1.26.*
optimum==1.16.*
pandas
-peft==0.7.*
+peft==0.8.*
Pillow>=9.5.0
pyyaml
requests
@@ -29,32 +28,33 @@ bitsandbytes==0.41.1; platform_system != "Windows"
https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows"

# llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

# llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

# llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

# CUDA wheels
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
