
Merge pull request #5496 from oobabooga/dev
Merge dev branch
oobabooga authored Feb 14, 2024
2 parents 0f134bf + 069ed7c commit dc6adef
Showing 19 changed files with 230 additions and 130 deletions.
14 changes: 7 additions & 7 deletions README.md
@@ -75,12 +75,12 @@ conda activate textgen

| System | GPU | Command |
|--------|---------|---------|
-| Linux/WSL | NVIDIA | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
-| Linux/WSL | CPU only | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6` |
-| MacOS + MPS | Any | `pip3 install torch torchvision torchaudio` |
-| Windows | NVIDIA | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121` |
-| Windows | CPU only | `pip3 install torch torchvision torchaudio` |
+| Linux/WSL | NVIDIA | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu121` |
+| Linux/WSL | CPU only | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cpu` |
+| Linux | AMD | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/rocm5.6` |
+| MacOS + MPS | Any | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.*` |
+| Windows | NVIDIA | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu121` |
+| Windows | CPU only | `pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.*` |

The up-to-date commands can be found here: https://pytorch.org/get-started/locally/.
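
A quick way to check that the pinned builds actually installed (a suggested verification step, not part of the README diff):

```
# Suggested post-install sanity check; confirms the pinned 2.1.x build
# and whether the expected accelerator backend is visible.
import torch

print(torch.__version__)                  # expect a 2.1.x version string
print(torch.cuda.is_available())          # True for the cu121/cu118/ROCm wheels
print(torch.backends.mps.is_available())  # True on macOS with MPS
```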

@@ -145,7 +145,7 @@ Then browse to
1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:

```
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch==2.1.* torchvision==0.16.* torchaudio==2.1.* --index-url https://download.pytorch.org/whl/cu118
conda install -y -c "nvidia/label/cuda-11.8.0" cuda-runtime
```

4 changes: 4 additions & 0 deletions css/main.css
@@ -408,6 +408,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
margin-bottom: 0 !important;
}

+#default-tab .prose pre, #notebook-tab .prose pre {
+    overflow: scroll;
+}

.message-body code {
white-space: pre-wrap !important;
word-wrap: break-word !important;
63 changes: 63 additions & 0 deletions modules/llama_cpp_python_hijack.py
@@ -0,0 +1,63 @@
from typing import Sequence

from tqdm import tqdm

try:
import llama_cpp
except:
llama_cpp = None

try:
import llama_cpp_cuda
except:
llama_cpp_cuda = None

try:
import llama_cpp_cuda_tensorcores
except:
llama_cpp_cuda_tensorcores = None


def eval_with_progress(self, tokens: Sequence[int]):
"""
A copy of
https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
with tqdm to show prompt processing progress.
"""
assert self._ctx.ctx is not None
assert self._batch.batch is not None
self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)

if len(tokens) > 1:
progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False)
else:
progress_bar = range(0, len(tokens), self.n_batch)

for i in progress_bar:
batch = tokens[i: min(len(tokens), i + self.n_batch)]
n_past = self.n_tokens
n_tokens = len(batch)
self._batch.set_batch(
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
)
self._ctx.decode(self._batch)
# Save tokens
self.input_ids[n_past: n_past + n_tokens] = batch
# Save logits
rows = n_tokens
cols = self._n_vocab
offset = (
0 if self.context_params.logits_all else n_tokens - 1
) # NOTE: Only save the last token logits if logits_all is False
self.scores[n_past + offset: n_past + n_tokens, :].reshape(-1)[
:
] = self._ctx.get_logits()[offset * cols: rows * cols]
# Update n_tokens
self.n_tokens += n_tokens


for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
if lib is not None:
lib.Llama.eval = eval_with_progress
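
The patch is applied purely by side effect: the loop above rebinds `Llama.eval` on whichever llama-cpp variants imported successfully, so a bare import of the module is enough to activate the progress bar. A minimal sketch of the assumed usage (the model path is hypothetical):

```
# Importing the hijack module applies the patch at import time.
from modules import llama_cpp_python_hijack  # noqa: F401
import llama_cpp

llm = llama_cpp.Llama(model_path="models/example.gguf")  # hypothetical path

# Prompt evaluation now routes through eval_with_progress, so prompts
# longer than one token show a tqdm "Prompt evaluation" bar.
llm.eval(llm.tokenize(b"Once upon a time"))
```
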
2 changes: 1 addition & 1 deletion modules/llamacpp_hf.py
@@ -7,7 +7,7 @@
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast

-from modules import RoPE, shared
+from modules import RoPE, llama_cpp_python_hijack, shared
from modules.logging_colors import logger

try:
2 changes: 1 addition & 1 deletion modules/llamacpp_model.py
@@ -4,7 +4,7 @@
import numpy as np
import torch

-from modules import RoPE, shared
+from modules import RoPE, llama_cpp_python_hijack, shared
from modules.callbacks import Iteratorize
from modules.logging_colors import logger
from modules.text_generation import get_max_prompt_length
10 changes: 5 additions & 5 deletions modules/models.py
@@ -54,7 +54,7 @@


def load_model(model_name, loader=None):
logger.info(f"Loading {model_name}")
logger.info(f"Loading \"{model_name}\"")
t0 = time.time()

shared.is_seq2seq = False
@@ -246,7 +246,7 @@ def llamacpp_loader(model_name):
else:
model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]

logger.info(f"llama.cpp weights detected: {model_file}")
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
model, tokenizer = LlamaCppModel.from_pretrained(model_file)
return model, tokenizer

@@ -257,7 +257,7 @@ def llamacpp_HF_loader(model_name):
for fname in [model_name, "oobabooga_llama-tokenizer", "llama-tokenizer"]:
path = Path(f'{shared.args.model_dir}/{fname}')
if all((path / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']):
-            logger.info(f'Using tokenizer from: {path}')
+            logger.info(f'Using tokenizer from: \"{path}\"')
break
else:
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
@@ -298,7 +298,7 @@ def ctransformers_loader(model_name):
logger.error("Could not find a model for ctransformers.")
return None, None

-    logger.info(f'ctransformers weights detected: {model_file}')
+    logger.info(f'ctransformers weights detected: \"{model_file}\"')
model, tokenizer = ctrans.from_pretrained(model_file)
return model, tokenizer

@@ -393,7 +393,7 @@ def HQQ_loader(model_name):
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.engine.hf import HQQModelForCausalLM

logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")

model_dir = Path(f'{shared.args.model_dir}/{model_name}')
model = HQQModelForCausalLM.from_quantized(str(model_dir))
53 changes: 44 additions & 9 deletions modules/presets.py
@@ -1,4 +1,5 @@
import functools
+import pprint
import random
from pathlib import Path

@@ -90,7 +91,25 @@ def random_preset(state):
'eta_cutoff': [3, 6, 9, 12, 15, 18],
},
'flatten_distribution': {
-            'temperature': [0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0],
+            'temperature': [0.1, 0.5, 0.7, 0.8, 1, 1.2, 1.5, 2.0, 5.0],
+            'dynamic_temperature': [
+                [0.1, 1],
+                [0.1, 1.5],
+                [0.1, 2],
+                [0.1, 5],
+                [0.5, 1],
+                [0.5, 1.5],
+                [0.5, 2],
+                [0.5, 5],
+                [0.8, 1],
+                [0.8, 1.5],
+                [0.8, 2],
+                [0.8, 5],
+                [1, 1.5],
+                [1, 2],
+                [1, 5]
+            ],
+            'smoothing_factor': [0.2, 0.3, 0.6, 1.2]
},
'repetition': {
'repetition_penalty': [1, 1.05, 1.1, 1.15, 1.20, 1.25],
@@ -106,26 +125,42 @@ def random_preset(state):
for cat in params_and_values:
choices = list(params_and_values[cat].keys())
if shared.args.loader is not None:
-            choices = [x for x in choices if x in loaders_samplers[shared.args.loader]]
+            choices = [x for x in choices if loader_contains(x)]

if len(choices) > 0:
choice = random.choice(choices)
-            generate_params[choice] = random.choice(params_and_values[cat][choice])
+            value = random.choice(params_and_values[cat][choice])
+            if choice == 'dynamic_temperature':
+                generate_params['dynamic_temperature'] = True
+                generate_params['dynatemp_low'] = value[0]
+                generate_params['dynatemp_high'] = value[1]
+            else:
+                generate_params[choice] = value

state.update(generate_params)
logger.info("GENERATED_PRESET=")
pprint.PrettyPrinter(indent=4, width=1, sort_dicts=False).pprint(remove_defaults(state))
return state, *[generate_params[k] for k in presets_params()]


-def generate_preset_yaml(state):
+def loader_contains(sampler):
+    if sampler == 'dynamic_temperature' and 'dynatemp_low' in loaders_samplers[shared.args.loader]:
+        return True
+    else:
+        return sampler in loaders_samplers[shared.args.loader]


+def remove_defaults(state):
defaults = default_preset()
data = {k: state[k] for k in presets_params()}

# Remove entries that are identical to the defaults.
-    # sampler_priority is always saved because it is experimental
-    # and the default order may change.

for k in list(data.keys()):
-        if data[k] == defaults[k] and k != 'sampler_priority':
+        if data[k] == defaults[k]:
del data[k]

+    return data


+def generate_preset_yaml(state):
+    data = remove_defaults(state)
return yaml.dump(data, sort_keys=False)
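
Unlike the scalar samplers, each `dynamic_temperature` candidate is a `[low, high]` pair, which is why `random_preset` special-cases it into three generation parameters. A standalone sketch of that expansion (the candidate dictionary is trimmed to one category):

```
import random

# Trimmed stand-in for params_and_values['flatten_distribution'].
candidates = {
    'temperature': [0.7, 1.0, 1.5],
    'dynamic_temperature': [[0.1, 1], [0.5, 2], [0.8, 5]],
}

generate_params = {}
choice = random.choice(list(candidates.keys()))
value = random.choice(candidates[choice])
if choice == 'dynamic_temperature':
    # A [low, high] pair expands into a flag plus two bounds.
    generate_params['dynamic_temperature'] = True
    generate_params['dynatemp_low'] = value[0]
    generate_params['dynatemp_high'] = value[1]
else:
    generate_params[choice] = value

print(generate_params)
# e.g. {'dynamic_temperature': True, 'dynatemp_low': 0.5, 'dynatemp_high': 2}
```
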
1 change: 1 addition & 0 deletions modules/sampler_hijack.py
@@ -431,6 +431,7 @@ def custom_sort_key(obj):
if shared.args.verbose:
logger.info("WARPERS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint([x.__class__.__name__ for x in warpers])
+        print()

if normalize is not None:
warpers.append(normalize)
4 changes: 2 additions & 2 deletions modules/text_generation.py
@@ -290,9 +290,9 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
if k in state:
generate_params[k] = state[k]

-    if isinstance(state['sampler_priority'], list):
+    if isinstance(state['sampler_priority'], list) and len(state['sampler_priority']) > 0:
        generate_params['sampler_priority'] = state['sampler_priority']
-    elif isinstance(state['sampler_priority'], str):
+    elif isinstance(state['sampler_priority'], str) and state['sampler_priority'].strip() != '':
generate_params['sampler_priority'] = [x.strip() for x in state['sampler_priority'].replace('\n', ',').split(',') if x.strip()]

if state['negative_prompt'] != '':
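
The string branch accepts commas or newlines as separators, and both branches now reject empty input instead of passing an empty priority list downstream. A quick illustration of the parsing expression (the sampler names are only examples):

```
raw = "temperature\ntop_k, top_p\n"
parsed = [x.strip() for x in raw.replace('\n', ',').split(',') if x.strip()]
print(parsed)  # ['temperature', 'top_k', 'top_p']

# A whitespace-only string fails the new .strip() != '' guard,
# so sampler_priority is simply left unset.
assert "  \n ".strip() == ""
```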
40 changes: 20 additions & 20 deletions requirements.txt
@@ -2,16 +2,15 @@ accelerate==0.25.*
colorama
datasets
einops
-exllamav2==0.0.12; platform_system != "Darwin" and platform_machine != "x86_64"
gradio==3.50.*
-hqq==0.1.2.post1
+hqq==0.1.3
jinja2==3.1.2
lm_eval==0.3.0
markdown
-numpy==1.24.*
+numpy==1.26.*
optimum==1.16.*
pandas
-peft==0.7.*
+peft==0.8.*
Pillow>=9.5.0
pyyaml
requests
@@ -29,32 +28,33 @@ bitsandbytes==0.41.1; platform_system != "Windows"
https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl; platform_system == "Windows"

# llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.38+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.42+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

# llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.42+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

# llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.38+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.42+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"

# CUDA wheels
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.0.13.1/exllamav2-0.0.13.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/jllllll/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu121torch2.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.4/flash_attn-2.3.4+cu122torch2.1cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
