Commit

Merge branch '4.1-Stable' into 4.1-Latest

magic-akari committed Jul 23, 2023
2 parents 0ed2fb2 + 691486f commit 317cde2
Showing 10 changed files with 51 additions and 40 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -1,4 +1,5 @@
<div align="center">
<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />

# SoftVC VITS Singing Voice Conversion

@@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```

**Speed up preprocessing**

If your dataset is quite large, you can increase the `--num_processes` parameter, for example:

```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be automatically assigned to different GPUs if you have more than one GPU.

After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.

#### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```

**Speed up preprocessing**

If your dataset is quite large, you can increase the `--num_processes` parameter, for example:

```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be automatically assigned to different GPUs if you have more than one GPU.

After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.

## 🏋️‍ Training
17 changes: 9 additions & 8 deletions README_zh_CN.md
@@ -1,5 +1,6 @@
<div align="center">

<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />

# SoftVC VITS Singing Voice Conversion

[**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@@ -268,13 +269,6 @@ wavlmbase+
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```

**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be automatically assigned across multiple GPUs (if you have more than one GPU).
After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.

#### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -335,6 +329,13 @@ fcpe
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```

**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be automatically distributed across multiple threads.

After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.

## 🏋️‍ Training
Expand Down
2 changes: 1 addition & 1 deletion modules/F0Predictor/rmvpe/inference.py
@@ -28,7 +28,7 @@ def __init__(self, model_path, device=None, dtype = torch.float32, hop_length=16
def mel2hidden(self, mel):
with torch.no_grad():
n_frames = mel.shape[-1]
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
hidden = self.model(mel)
return hidden[:, :n_frames]
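
The switch from reflect to constant padding matters for very short inputs: PyTorch's reflect mode requires the pad width to be smaller than the padded dimension, so a mel segment shorter than the 32-frame alignment target would raise a RuntimeError, while constant (zero) padding has no such limit. A minimal sketch illustrating the difference (tensor shapes are illustrative, not taken from the repository):

```python
import torch
import torch.nn.functional as F

# Illustrative shape: a (batch, mels, frames) tensor with only 5 frames.
mel = torch.randn(1, 128, 5)
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad up to a multiple of 32 -> 27

padded = F.pad(mel, (0, pad), mode='constant')    # OK: shape (1, 128, 32)
# F.pad(mel, (0, pad), mode='reflect')            # RuntimeError: pad width must be
#                                                 # smaller than the input dimension
print(padded.shape)
```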

13 changes: 7 additions & 6 deletions preprocess_flist_config.py
@@ -5,6 +5,7 @@
import wave
from random import shuffle

from loguru import logger
from tqdm import tqdm

import diffusion.logger.utils as du
@@ -47,9 +48,9 @@ def get_wav_duration(file_path):
if not file.endswith("wav"):
continue
if not pattern.match(file):
print(f"warning:文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
logger.warning(f"文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
if get_wav_duration(file) < 0.3:
print("skip too short audio:", file)
logger.info("Skip too short audio:" + file)
continue
new_wavs.append(file)
wavs = new_wavs
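
The duration check above compares get_wav_duration(file) against 0.3 seconds; with the stdlib wave module imported at the top of this file, the helper presumably looks something like the sketch below (not the repository's exact code):

```python
import wave

def get_wav_duration(file_path):
    # Duration in seconds = number of frames / sample rate.
    with wave.open(file_path, "rb") as wav_file:
        return wav_file.getnframes() / wav_file.getframerate()
```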
@@ -60,13 +61,13 @@ def get_wav_duration(file_path):
shuffle(train)
shuffle(val)

print("Writing", args.train_list)
logger.info("Writing" + args.train_list)
with open(args.train_list, "w") as f:
for fname in tqdm(train):
wavpath = fname
f.write(wavpath + "\n")

print("Writing", args.val_list)
logger.info("Writing" + args.val_list)
with open(args.val_list, "w") as f:
for fname in tqdm(val):
wavpath = fname
@@ -101,8 +102,8 @@ def get_wav_duration(file_path):
if args.tiny:
config_template["model"]["filter_channels"] = 512

print("Writing configs/config.json")
logger.info("Writing to configs/config.json")
with open("configs/config.json", "w") as f:
json.dump(config_template, f, indent=2)
print("Writing configs/diffusion.yaml")
logger.info("Writing to configs/diffusion.yaml")
du.save_config("configs/diffusion.yaml", d_config_template)
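
Since this commit swaps print for loguru throughout, one behavioral difference is worth noting: unlike print, extra positional arguments to logger.info are treated as str.format-style arguments rather than appended to the message, so calls must either concatenate or use {} placeholders. A minimal sketch (the variable name is illustrative):

```python
from loguru import logger

device = "cuda:0"
logger.info("Using device: {}", device)   # placeholder filled -> "Using device: cuda:0"
logger.info("Using device: " + device)    # plain concatenation also works
logger.info("Using device: ", device)     # no placeholder: the extra arg is silently dropped
```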
23 changes: 12 additions & 11 deletions preprocess_hubert_f0.py
@@ -10,6 +10,7 @@
import numpy as np
import torch
import torch.multiprocessing as mp
from loguru import logger
from tqdm import tqdm

import diffusion.logger.utils as du
@@ -27,13 +28,11 @@
speech_encoder = hps["model"]["speech_encoder"]


def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
# print(filename)
def process_one(filename, hmodel,f0p,rank,diff=False,mel_extractor=None):
wav, sr = librosa.load(filename, sr=sampling_rate)
audio_norm = torch.FloatTensor(wav)
audio_norm = audio_norm.unsqueeze(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = torch.device(f"cuda:{rank}")
soft_path = filename + ".soft.pt"
if not os.path.exists(soft_path):
wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@@ -106,17 +105,17 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):


def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
print("Loading speech encoder for content...")
logger.info("Loading speech encoder for content...")
rank = mp.current_process()._identity
rank = rank[0] if len(rank) > 0 else 0
gpu_id = 0
if torch.cuda.is_available():
gpu_id = rank % torch.cuda.device_count()
device = torch.device(f"cuda:{gpu_id}")
print("Rank {rank} uses device {device}")
logger.info(f"Rank {rank} uses device {device}")
hmodel = utils.get_speech_encoder(speech_encoder, device=device)
print("Loaded speech encoder.")
logger.info(f"Loaded speech encoder for rank {rank}")
for filename in tqdm(file_chunk):
process_one(filename, hmodel, f0p, diff, mel_extractor)
process_one(filename, hmodel, f0p, gpu_id, diff, mel_extractor)

def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
with ProcessPoolExecutor(max_workers=num_processes) as executor:
@@ -151,9 +150,11 @@ def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(speech_encoder)
print(f0p)
print("use_diff: ", args.use_diff)
print("device: ", device)
logger.info("Using device: ", device)
logger.info("Using SpeechEncoder: " + speech_encoder)
logger.info("Using extractor: " + f0p)
logger.info("Using diff Mode: " + str( args.use_diff))

if args.use_diff:
print("use_diff")
print("Loading Mel Extractor...")
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,6 +10,8 @@ torch
torchaudio
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx
2 changes: 2 additions & 0 deletions requirements_onnx_encoder.txt
@@ -9,6 +9,8 @@ torch==1.13.1
torchaudio==0.13.1
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx
2 changes: 2 additions & 0 deletions requirements_win.txt
@@ -15,6 +15,8 @@ sounddevice==0.4.5
SoundFile==0.10.3.post1
starlette==0.19.1
tqdm==4.63.0
rich
loguru
torchcrepe
scikit-maad
praat-parselmouth
4 changes: 2 additions & 2 deletions resample.py
@@ -6,8 +6,8 @@

import librosa
import numpy as np
from rich.progress import track
from scipy.io import wavfile
from tqdm import tqdm


def load_wav(wav_path):
@@ -81,7 +81,7 @@ def process_all_speakers():
if os.path.isdir(spk_dir):
print(spk_dir)
futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
pass
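
rich.progress.track is used here as a drop-in replacement for tqdm: it wraps any iterable and renders a live progress bar. A minimal standalone sketch:

```python
import time
from rich.progress import track

# track() accepts any iterable plus an optional description (and total),
# mirroring the call in process_all_speakers() above.
for _ in track(range(100), description="resampling:"):
    time.sleep(0.01)  # stand-in for real work
```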


7 changes: 4 additions & 3 deletions train_diff.py
@@ -1,6 +1,7 @@
import argparse

import torch
from loguru import logger
from torch.optim import lr_scheduler

from diffusion.data_loaders import get_data_loaders
@@ -28,8 +29,8 @@

# load config
args = utils.load_config(cmd.config)
print(' > config:', cmd.config)
print(' > exp:', args.env.expdir)
logger.info(' > config: ' + cmd.config)
logger.info(' > exp: ' + args.env.expdir)

# load vocoder
vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@@ -47,7 +48,7 @@ def parse_args(args=None, namespace=None):
args.model.k_step_max
)

print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
logger.info(f' > Now the model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')

# load parameters
optimizer = torch.optim.AdamW(model.parameters())
