Commit

Merge branch '4.1-Stable' into 4.1-Latest

magic-akari committed Jul 23, 2023
2 parents 0ed2fb2 + 691486f commit 317cde2
Showing 10 changed files with 51 additions and 40 deletions.
19 changes: 10 additions & 9 deletions README.md
@@ -1,4 +1,5 @@
<div align="center">
<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />

# SoftVC VITS Singing Voice Conversion

@@ -265,15 +266,6 @@ Add `--vol_aug` if you want to enable loudness embedding:
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```

**Speed up preprocessing**

If your dataset is quite large, you can increase the `--num_processes` parameter, for example:

```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be automatically assigned to different GPUs if you have more than one GPU.

After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.

#### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -332,6 +324,15 @@ If you want shallow diffusion (optional), you need to add the `--use_diff` param
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```

**Speed up preprocessing**

If your dataset is quite large, you can increase the `--num_processes` parameter, for example:

```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be automatically assigned to different GPUs if you have more than one GPU.

After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.

## 🏋️‍ Training
17 changes: 9 additions & 8 deletions README_zh_CN.md
@@ -1,5 +1,6 @@
<div align="center">

<img alt="LOGO" src="https://avatars.githubusercontent.com/u/127122328?s=400&u=5395a98a4f945a3a50cb0cc96c2747505d190dbc&v=4" width="300" height="300" />

# SoftVC VITS Singing Voice Conversion

[**English**](./README.md) | [**中文简体**](./README_zh_CN.md)
@@ -268,13 +269,6 @@ wavlmbase+
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
```

**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug --num_processes 8
```
All workers will be automatically assigned across multiple GPUs (if you have more than one GPU).
After enabling loudness embedding, the trained model will match the loudness of the input source; otherwise, it will match the loudness of the training set.

#### You can modify some parameters in the generated config.json and diffusion.yaml
@@ -335,6 +329,13 @@ fcpe
python preprocess_hubert_f0.py --f0_predictor dio --use_diff
```

**Speed up preprocessing**
If your dataset is quite large, you can try adding the `--num_processes` parameter:
```shell
python preprocess_hubert_f0.py --f0_predictor dio --use_diff --num_processes 8
```
All workers will be automatically distributed across multiple threads.

After completing the above steps, the dataset directory will contain the preprocessed data, and the dataset_raw folder can be deleted.

## 🏋️‍ Training
Expand Down
2 changes: 1 addition & 1 deletion modules/F0Predictor/rmvpe/inference.py
@@ -28,7 +28,7 @@ def __init__(self, model_path, device=None, dtype = torch.float32, hop_length=16
def mel2hidden(self, mel):
with torch.no_grad():
n_frames = mel.shape[-1]
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
hidden = self.model(mel)
return hidden[:, :n_frames]
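
The switch from reflect to constant padding matters for very short inputs: PyTorch's reflect mode requires the pad width to be smaller than the padded dimension, so a mel segment shorter than the 32-frame alignment target would raise a RuntimeError, while constant (zero) padding has no such limit. A minimal sketch illustrating the difference (tensor shapes are illustrative, not taken from the repository):

```python
import torch
import torch.nn.functional as F

# Illustrative shape: a (batch, mels, frames) tensor with only 5 frames.
mel = torch.randn(1, 128, 5)
n_frames = mel.shape[-1]
pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # pad up to a multiple of 32 -> 27

padded = F.pad(mel, (0, pad), mode='constant')    # OK: shape (1, 128, 32)
# F.pad(mel, (0, pad), mode='reflect')            # RuntimeError: pad width must be
#                                                 # smaller than the input dimension
print(padded.shape)
```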

13 changes: 7 additions & 6 deletions preprocess_flist_config.py
@@ -5,6 +5,7 @@
import wave
from random import shuffle

from loguru import logger
from tqdm import tqdm

import diffusion.logger.utils as du
@@ -47,9 +48,9 @@ def get_wav_duration(file_path):
if not file.endswith("wav"):
continue
if not pattern.match(file):
print(f"warning:文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
logger.warning(f"文件名{file}中包含非字母数字下划线,可能会导致错误。(也可能不会)")
if get_wav_duration(file) < 0.3:
print("skip too short audio:", file)
logger.info("Skip too short audio:" + file)
continue
new_wavs.append(file)
wavs = new_wavs
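
The duration check above compares get_wav_duration(file) against 0.3 seconds; with the stdlib wave module imported at the top of this file, the helper presumably looks something like the sketch below (not the repository's exact code):

```python
import wave

def get_wav_duration(file_path):
    # Duration in seconds = number of frames / sample rate.
    with wave.open(file_path, "rb") as wav_file:
        return wav_file.getnframes() / wav_file.getframerate()
```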
@@ -60,13 +61,13 @@ def get_wav_duration(file_path):
shuffle(train)
shuffle(val)

print("Writing", args.train_list)
logger.info("Writing" + args.train_list)
with open(args.train_list, "w") as f:
for fname in tqdm(train):
wavpath = fname
f.write(wavpath + "\n")

print("Writing", args.val_list)
logger.info("Writing" + args.val_list)
with open(args.val_list, "w") as f:
for fname in tqdm(val):
wavpath = fname
@@ -101,8 +102,8 @@ def get_wav_duration(file_path):
if args.tiny:
config_template["model"]["filter_channels"] = 512

print("Writing configs/config.json")
logger.info("Writing to configs/config.json")
with open("configs/config.json", "w") as f:
json.dump(config_template, f, indent=2)
print("Writing configs/diffusion.yaml")
logger.info("Writing to configs/diffusion.yaml")
du.save_config("configs/diffusion.yaml", d_config_template)
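
Since this commit swaps print for loguru throughout, one behavioral difference is worth noting: unlike print, extra positional arguments to logger.info are treated as str.format-style arguments rather than appended to the message, so calls must either concatenate or use {} placeholders. A minimal sketch (the variable name is illustrative):

```python
from loguru import logger

device = "cuda:0"
logger.info("Using device: {}", device)   # placeholder filled -> "Using device: cuda:0"
logger.info("Using device: " + device)    # plain concatenation also works
logger.info("Using device: ", device)     # no placeholder: the extra arg is silently dropped
```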
23 changes: 12 additions & 11 deletions preprocess_hubert_f0.py
@@ -10,6 +10,7 @@
import numpy as np
import torch
import torch.multiprocessing as mp
from loguru import logger
from tqdm import tqdm

import diffusion.logger.utils as du
@@ -27,13 +28,11 @@
speech_encoder = hps["model"]["speech_encoder"]


def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):
# print(filename)
def process_one(filename, hmodel,f0p,rank,diff=False,mel_extractor=None):
wav, sr = librosa.load(filename, sr=sampling_rate)
audio_norm = torch.FloatTensor(wav)
audio_norm = audio_norm.unsqueeze(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = torch.device(f"cuda:{rank}")
soft_path = filename + ".soft.pt"
if not os.path.exists(soft_path):
wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
@@ -106,17 +105,17 @@ def process_one(filename, hmodel,f0p,diff=False,mel_extractor=None):


def process_batch(file_chunk, f0p, diff=False, mel_extractor=None, device="cpu"):
print("Loading speech encoder for content...")
logger.info("Loading speech encoder for content...")
rank = mp.current_process()._identity
rank = rank[0] if len(rank) > 0 else 0
gpu_id = 0
if torch.cuda.is_available():
gpu_id = rank % torch.cuda.device_count()
device = torch.device(f"cuda:{gpu_id}")
print("Rank {rank} uses device {device}")
logger.info(f"Rank {rank} uses device {device}")
hmodel = utils.get_speech_encoder(speech_encoder, device=device)
print("Loaded speech encoder.")
logger.info(f"Loaded speech encoder for rank {rank}")
for filename in tqdm(file_chunk):
process_one(filename, hmodel, f0p, diff, mel_extractor)
process_one(filename, hmodel, f0p, gpu_id, diff, mel_extractor)

def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device):
with ProcessPoolExecutor(max_workers=num_processes) as executor:
@@ -151,9 +150,11 @@ def parallel_process(filenames, num_processes, f0p, diff, mel_extractor, device)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(speech_encoder)
print(f0p)
print("use_diff: ", args.use_diff)
print("device: ", device)
logger.info("Using device: ", device)
logger.info("Using SpeechEncoder: " + speech_encoder)
logger.info("Using extractor: " + f0p)
logger.info("Using diff Mode: " + str( args.use_diff))

if args.use_diff:
print("use_diff")
print("Loading Mel Extractor...")
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,6 +10,8 @@ torch
torchaudio
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx
2 changes: 2 additions & 0 deletions requirements_onnx_encoder.txt
@@ -9,6 +9,8 @@ torch==1.13.1
torchaudio==0.13.1
torchcrepe
tqdm
rich
loguru
scikit-maad
praat-parselmouth
onnx
2 changes: 2 additions & 0 deletions requirements_win.txt
@@ -15,6 +15,8 @@ sounddevice==0.4.5
SoundFile==0.10.3.post1
starlette==0.19.1
tqdm==4.63.0
rich
loguru
torchcrepe
scikit-maad
praat-parselmouth
4 changes: 2 additions & 2 deletions resample.py
@@ -6,8 +6,8 @@

import librosa
import numpy as np
from rich.progress import track
from scipy.io import wavfile
from tqdm import tqdm


def load_wav(wav_path):
@@ -81,7 +81,7 @@ def process_all_speakers():
if os.path.isdir(spk_dir):
print(spk_dir)
futures = [executor.submit(process, (spk_dir, i, args)) for i in os.listdir(spk_dir) if i.endswith("wav")]
for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
for _ in track(concurrent.futures.as_completed(futures), total=len(futures), description="resampling:"):
pass
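
rich.progress.track is used here as a drop-in replacement for tqdm: it wraps any iterable and renders a live progress bar. A minimal standalone sketch:

```python
import time
from rich.progress import track

# track() accepts any iterable plus an optional description (and total),
# mirroring the call in process_all_speakers() above.
for _ in track(range(100), description="resampling:"):
    time.sleep(0.01)  # stand-in for real work
```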


7 changes: 4 additions & 3 deletions train_diff.py
@@ -1,6 +1,7 @@
import argparse

import torch
from loguru import logger
from torch.optim import lr_scheduler

from diffusion.data_loaders import get_data_loaders
@@ -28,8 +29,8 @@

# load config
args = utils.load_config(cmd.config)
print(' > config:', cmd.config)
print(' > exp:', args.env.expdir)
logger.info(' > config: ' + cmd.config)
logger.info(' > exp: ' + args.env.expdir)

# load vocoder
vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=args.device)
@@ -47,7 +48,7 @@ def parse_args(args=None, namespace=None):
args.model.k_step_max
)

print(f' > INFO: now model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')
logger.info(f' > Now the model timesteps is {model.timesteps}, and k_step_max is {model.k_step_max}')

# load parameters
optimizer = torch.optim.AdamW(model.parameters())
