Merge pull request #6713 from oobabooga/dev
Merge dev branch
oobabooga authored Jan 29, 2025
2 parents a1c353a + b614ea6 commit 9ac4d81
Showing 24 changed files with 359 additions and 226 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -26,6 +26,7 @@
 .DS_Store
 .eslintrc.js
 .idea
+.installer_state.json
 .venv
 venv
 .envrc
2 changes: 1 addition & 1 deletion README.md
@@ -380,7 +380,7 @@ text-generation-webui
 │   │   └── tokenizer.model
 ```

-In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with
+In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with:

 ```
 python download-model.py organization/model
10 changes: 10 additions & 0 deletions css/main.css
@@ -1259,6 +1259,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   left: 25px;
 }

+.footer-button.footer-continue-button {
+  bottom: -23px;
+  left: 50px;
+}
+
+.footer-button.footer-remove-button {
+  bottom: -23px;
+  left: 75px;
+}
+
 .message:hover .footer-button,
 .user-message:hover .footer-button,
 .assistant-message:hover .footer-button {
32 changes: 24 additions & 8 deletions docs/12 - OpenAI API.md
@@ -14,7 +14,7 @@ Add `--api` to your command-line flags.
 * To create a public Cloudflare URL, add the `--public-api` flag.
 * To listen on your local network, add the `--listen` flag.
 * To change the port, which is 5000 by default, use `--api-port 1234` (change 1234 to your desired port number).
-* To use SSL, add `--ssl-keyfile key.pem --ssl-certfile cert.pem`. Note that it doesn't work with `--public-api`.
+* To use SSL, add `--ssl-keyfile key.pem --ssl-certfile cert.pem`. ⚠️ **Note**: this doesn't work with `--public-api` since Cloudflare already uses HTTPS by default.
 * To use an API key for authentication, add `--api-key yourkey`.

 ### Examples

@@ -51,8 +51,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \
       "content": "Hello!"
     }
   ],
-  "mode": "instruct",
-  "instruction_template": "Alpaca"
+  "mode": "instruct"
 }'
 ```

@@ -86,7 +85,6 @@ curl http://127.0.0.1:5000/v1/chat/completions \
     }
   ],
   "mode": "instruct",
-  "instruction_template": "Alpaca",
   "stream": true
 }'
 ```

@@ -131,9 +129,6 @@ curl -k http://127.0.0.1:5000/v1/internal/model/load \
     "args": {
       "load_in_4bit": true,
       "n_gpu_layers": 12
-    },
-    "settings": {
-      "instruction_template": "Alpaca"
     }
   }'
 ```

@@ -198,7 +193,7 @@ while True:
     assistant_message = ''
     for event in client.events():
         payload = json.loads(event.data)
-        chunk = payload['choices'][0]['message']['content']
+        chunk = payload['choices'][0]['delta']['content']
         assistant_message += chunk
         print(chunk, end='')

@@ -241,6 +236,27 @@ for event in client.events():
     print()
 ```

+#### Python example with API key
+
+Replace
+
+```python
+headers = {
+    "Content-Type": "application/json"
+}
+```
+
+with
+
+```python
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": "Bearer yourPassword123"
+}
+```
+
+in any of the examples above.
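For completeness, here is what a full request looks like with the key applied; a minimal sketch, assuming the server was started with `--api --api-key yourPassword123` on the default port:

```python
# Minimal sketch: a chat completion request authenticated with an API key.
# Assumes the server was launched with: --api --api-key yourPassword123
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer yourPassword123"
}
body = {
    "mode": "instruct",
    "messages": [{"role": "user", "content": "Hello!"}]
}

response = requests.post(url, headers=headers, json=body)
print(response.json()["choices"][0]["message"]["content"])
```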

 ### Environment variables

 The following environment variables can be used (they take precedence over everything else):
165 changes: 103 additions & 62 deletions download-model.py
@@ -14,6 +14,7 @@
 import os
 import re
 import sys
+from multiprocessing import Array
 from pathlib import Path
 from time import sleep
@@ -27,9 +27,10 @@


 class ModelDownloader:
-    def __init__(self, max_retries=5):
+    def __init__(self, max_retries=7):
         self.max_retries = max_retries
         self.session = self.get_session()
+        self._progress_bar_slots = None

     def get_session(self):
         session = requests.Session()
@@ -186,73 +188,112 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, model_dir
             output_folder = Path(base_folder) / output_folder
         return output_folder

+    @property
+    def progress_bar_slots(self):
+        if self._progress_bar_slots is None:
+            raise RuntimeError("Progress bar slots not initialized. Start download threads first.")
+
+        return self._progress_bar_slots
+
+    def initialize_progress_bar_slots(self, num_threads):
+        self._progress_bar_slots = Array("B", [0] * num_threads)
+
+    def get_progress_bar_position(self):
+        with self.progress_bar_slots.get_lock():
+            for i in range(len(self.progress_bar_slots)):
+                if self.progress_bar_slots[i] == 0:
+                    self.progress_bar_slots[i] = 1
+                    return i
+
+        return 0  # fallback
+
+    def release_progress_bar_position(self, slot):
+        with self.progress_bar_slots.get_lock():
+            self.progress_bar_slots[slot] = 0
+
     def get_single_file(self, url, output_folder, start_from_scratch=False):
         filename = Path(url.rsplit('/', 1)[1])
         output_path = output_folder / filename
+        progress_bar_position = self.get_progress_bar_position()

-        max_retries = 7
+        max_retries = self.max_retries
         attempt = 0
-        while attempt < max_retries:
-            attempt += 1
-            session = self.session
-            headers = {}
-            mode = 'wb'
-
-            try:
-                if output_path.exists() and not start_from_scratch:
-                    # Resume download
-                    r = session.get(url, stream=True, timeout=20)
-                    total_size = int(r.headers.get('content-length', 0))
-                    if output_path.stat().st_size >= total_size:
-                        return
-
-                    headers = {'Range': f'bytes={output_path.stat().st_size}-'}
-                    mode = 'ab'
-
-                with session.get(url, stream=True, headers=headers, timeout=30) as r:
-                    r.raise_for_status()  # If status is not 2xx, raise an error
-                    total_size = int(r.headers.get('content-length', 0))
-                    block_size = 1024 * 1024  # 1MB
-
-                    filename_str = str(filename)  # Convert PosixPath to string if necessary
-
-                    tqdm_kwargs = {
-                        'total': total_size,
-                        'unit': 'B',
-                        'unit_scale': True,
-                        'unit_divisor': 1024,
-                        'bar_format': '{desc}{percentage:3.0f}%|{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]',
-                        'desc': f"{filename_str}: "
-                    }
-
-                    if 'COLAB_GPU' in os.environ:
-                        tqdm_kwargs.update({
-                            'position': 0,
-                            'leave': True
-                        })
-
-                    with open(output_path, mode) as f:
-                        with tqdm.tqdm(**tqdm_kwargs) as t:
-                            count = 0
-                            for data in r.iter_content(block_size):
-                                f.write(data)
-                                t.update(len(data))
-                                if total_size != 0 and self.progress_bar is not None:
-                                    count += len(data)
-                                    self.progress_bar(float(count) / float(total_size), f"{filename_str}")
-
-                break  # Exit loop if successful
-            except (RequestException, ConnectionError, Timeout) as e:
-                print(f"Error downloading {filename}: {e}.")
-                print(f"That was attempt {attempt}/{max_retries}.", end=' ')
-                if attempt < max_retries:
-                    print(f"Retry begins in {2 ** attempt} seconds.")
-                    sleep(2 ** attempt)
-                else:
-                    print("Failed to download after the maximum number of attempts.")
+        try:
+            while attempt < max_retries:
+                attempt += 1
+                session = self.session
+                headers = {}
+                mode = 'wb'
+
+                try:
+                    if output_path.exists() and not start_from_scratch:
+                        # Resume download
+                        r = session.get(url, stream=True, timeout=20)
+                        total_size = int(r.headers.get('content-length', 0))
+                        if output_path.stat().st_size >= total_size:
+                            return
+
+                        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+                        mode = 'ab'
+
+                    with session.get(url, stream=True, headers=headers, timeout=30) as r:
+                        r.raise_for_status()  # If status is not 2xx, raise an error
+                        total_size = int(r.headers.get('content-length', 0))
+                        block_size = 1024 * 1024  # 1MB
+
+                        filename_str = str(filename)  # Convert PosixPath to string if necessary
+
+                        tqdm_kwargs = {
+                            'total': total_size,
+                            'unit': 'B',
+                            'unit_scale': True,
+                            'unit_divisor': 1024,
+                            'bar_format': '{desc}{percentage:3.0f}%|{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]',
+                            'desc': f"{filename_str}: ",
+                            'position': progress_bar_position,
+                            'leave': False
+                        }
+
+                        if 'COLAB_GPU' in os.environ:
+                            tqdm_kwargs.update({
+                                'position': 0,
+                                'leave': True
+                            })
+
+                        with open(output_path, mode) as f:
+                            with tqdm.tqdm(**tqdm_kwargs) as t:
+                                count = 0
+                                for data in r.iter_content(block_size):
+                                    f.write(data)
+                                    t.update(len(data))
+                                    if total_size != 0 and self.progress_bar is not None:
+                                        count += len(data)
+                                        self.progress_bar(float(count) / float(total_size), f"{filename_str}")
+
+                    break  # Exit loop if successful
+                except (RequestException, ConnectionError, Timeout) as e:
+                    print(f"Error downloading {filename}: {e}.")
+                    print(f"That was attempt {attempt}/{max_retries}.", end=' ')
+                    if attempt < max_retries:
+                        print(f"Retry begins in {2 ** attempt} seconds.")
+                        sleep(2 ** attempt)
+                    else:
+                        print("Failed to download after the maximum number of attempts.")
+        finally:
+            self.release_progress_bar_position(progress_bar_position)

     def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=4):
-        thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True)
+        self.initialize_progress_bar_slots(threads)
+        tqdm.tqdm.set_lock(tqdm.tqdm.get_lock())
+        try:
+            thread_map(
+                lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch),
+                file_list,
+                max_workers=threads,
+                disable=True
+            )
+        finally:
+            print(f"\nDownload of {len(file_list)} files to {output_folder} completed.")

     def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
         self.progress_bar = progress_bar
@@ -318,7 +359,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
     parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
     parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
     parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
-    parser.add_argument('--max-retries', type=int, default=5, help='Max retries count when get error in download time.')
+    parser.add_argument('--max-retries', type=int, default=7, help='Max retries count when get error in download time.')
     args = parser.parse_args()

     branch = args.branch
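For readers skimming the diff: the slot bookkeeping added above exists so that each download thread claims a stable `tqdm` row instead of all progress bars fighting over one line. A standalone sketch of the same pattern, with hypothetical names that are not part of the repository:

```python
# Sketch of the progress-bar slot pattern: each worker claims a free slot
# index under a lock and uses it as its tqdm 'position' (terminal row).
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Array
from time import sleep

import tqdm

slots = Array("B", [0] * 4)  # one byte per slot; 0 = free, 1 = taken


def claim_slot():
    with slots.get_lock():
        for i in range(len(slots)):
            if slots[i] == 0:
                slots[i] = 1
                return i
    return 0  # fallback if every slot is taken


def release_slot(i):
    with slots.get_lock():
        slots[i] = 0


def fake_download(name):
    position = claim_slot()
    try:
        with tqdm.tqdm(total=100, desc=name, position=position, leave=False) as bar:
            for _ in range(100):
                sleep(0.01)
                bar.update(1)
    finally:
        release_slot(position)  # free the row for the next file


with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(fake_download, [f"file{i}" for i in range(8)]))
```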
19 changes: 0 additions & 19 deletions extensions/Training_PRO/script.py
@@ -557,12 +557,6 @@ def calc_trainable_parameters(model):

 def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str, precize_slicing_overlap: bool, add_eos_token_type: str, save_steps_under_loss: float, add_bos_token: bool, training_projection: str,sliding_window:bool,warmup_ratio:float, grad_accumulation: int,neft_noise_alpha:float):

-    if shared.args.monkey_patch:
-        from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
-            replace_peft_model_with_int4_lora_model
-        )
-        replace_peft_model_with_int4_lora_model()
-
     global train_log_graph
     global WANT_INTERRUPT
     WANT_INTERRUPT = False

@@ -600,10 +594,6 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch

     time.sleep(5)

-    if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch:
-        yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`", zero_pd
-        return
-
     if cutoff_len <= 0 or micro_batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
         yield "Cannot input zeroes.", zero_pd
         return

@@ -865,15 +855,6 @@ def generate_and_tokenize_prompt(data_point):
         yield traceback.format_exc().replace('\n', '\n\n'), zero_pd
         return

-    if shared.args.monkey_patch:
-        from alpaca_lora_4bit.autograd_4bit import Autograd4bitQuantLinear
-        from alpaca_lora_4bit.models import Linear4bitLt
-        for _, m in lora_model.named_modules():
-            if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
-                if m.is_v1_model:
-                    m.zeros = m.zeros.half()
-                    m.scales = m.scales.half()
-
     class Tracked():
         def __init__(self):
             self.current_steps = 0
4 changes: 2 additions & 2 deletions extensions/openai/completions.py
@@ -146,7 +146,7 @@ def convert_history(history):
             for item in entry['content']:
                 if not isinstance(item, dict):
                     continue
-
+
                 image_url = None
                 content = None
                 if item['type'] == 'image_url' and isinstance(item['image_url'], dict):

@@ -205,7 +205,7 @@ def convert_history(history):
             else:
                 chat_dialogue.append(['', current_reply])
         elif role == "system":
-            system_message = content
+            system_message += f"\n{content}" if system_message else content

     if not user_input_last:
         user_input = ""
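The second hunk is a behavior fix: when a request contains several `system` messages, they are now concatenated with newlines instead of each one overwriting the last. A minimal sketch of that accumulation logic in isolation:

```python
# Sketch of the new system-message handling: successive "system" entries
# are joined with newlines rather than replacing one another.
messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "system", "content": "Answer in English."},
    {"role": "user", "content": "Hello!"},
]

system_message = ""
for entry in messages:
    if entry["role"] == "system":
        content = entry["content"]
        system_message += f"\n{content}" if system_message else content

print(system_message)
# You are concise.
# Answer in English.
```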
8 changes: 8 additions & 0 deletions js/global_scope_js.js
@@ -22,6 +22,14 @@ function regenerateClick() {
   document.getElementById("Regenerate").click();
 }

+function continueClick() {
+  document.getElementById("Continue").click();
+}
+
+function removeLastClick() {
+  document.getElementById("Remove-last").click();
+}
+
 function handleMorphdomUpdate(text) {
   morphdom(
     document.getElementById("chat").parentNode,
7 changes: 6 additions & 1 deletion modules/chat.py
@@ -30,8 +30,13 @@
 )
 from modules.utils import delete_file, get_available_characters, save_file

-# Copied from the Transformers library
+
+def strftime_now(format):
+    return datetime.now().strftime(format)
+
+
 jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
+jinja_env.globals["strftime_now"] = strftime_now


 def str_presenter(dumper, data):
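The new `strftime_now` global mirrors the helper that the Transformers library exposes to chat templates (the removed comment noted the environment was copied from there), letting templates render the current date or time. A small standalone sketch of how a template can use it:

```python
# Sketch: a Jinja2 chat template calling the strftime_now global
# registered above (mirrors the jinja_env setup in modules/chat.py).
from datetime import datetime

from jinja2.sandbox import ImmutableSandboxedEnvironment


def strftime_now(format):
    return datetime.now().strftime(format)


jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
jinja_env.globals["strftime_now"] = strftime_now

template = jinja_env.from_string("Today is {{ strftime_now('%B %d, %Y') }}.")
print(template.render())  # e.g. "Today is January 29, 2025."
```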
[Diffs for the remaining 15 changed files are not shown here.]
