NsfHifigan在DML中上采样出现错误以及SourceModuleHnNSF这两个BUG的修复

K0sh1R1zumu · Jul 17, 2023 · 90c9ccc · 90c9ccc
1 parent 72deb15
commit 90c9ccc
Show file tree

Hide file tree

Showing 5 changed files with 276 additions and 220 deletions.
diff --git a/onnx_export.py b/onnx_export.py
@@ -1,56 +1,138 @@
+import json
 import torch
-
 import utils
-from onnxexport.model_onnx import SynthesizerTrn
-
-
-def main(NetExport):
-    path = "SoVits4.0"
-    if NetExport:
-        device = torch.device("cpu")
-        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
-        SVCVITS = SynthesizerTrn(
-            hps.data.filter_length // 2 + 1,
-            hps.train.segment_size // hps.data.hop_length,
-            **hps.model)
-        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)
-        _ = SVCVITS.eval().to(device)
-        for i in SVCVITS.parameters():
-            i.requires_grad = False
-
-        n_frame = 10
-        test_hidden_unit = torch.rand(1, n_frame, 256)
-        test_pitch = torch.rand(1, n_frame)
-        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
-        test_uv = torch.ones(1, n_frame, dtype=torch.float32)
-        test_noise = torch.randn(1, 192, n_frame)
-        test_sid = torch.LongTensor([0])
-        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
-        output_names = ["audio", ]
-
-        torch.onnx.export(SVCVITS,
-                          (
-                              test_hidden_unit.to(device),
-                              test_pitch.to(device),
-                              test_mel2ph.to(device),
-                              test_uv.to(device),
-                              test_noise.to(device),
-                              test_sid.to(device)
-                          ),
-                          f"checkpoints/{path}/model.onnx",
-                          dynamic_axes={
-                              "c": [0, 1],
-                              "f0": [1],
-                              "mel2ph": [1],
-                              "uv": [1],
-                              "noise": [2],
-                          },
-                          do_constant_folding=False,
-                          opset_version=16,
-                          verbose=False,
-                          input_names=input_names,
-                          output_names=output_names)
+from onnxexport.model_onnx_speaker_mix import SynthesizerTrn
+
+
+def main():  # Export the SoVits checkpoint in checkpoints/{path}/ to ONNX and write a companion JSON config (MoeVSConf).
+    path = "crs"  # hard-coded model folder name under checkpoints/; edit this to export a different model
+
+    device = torch.device("cpu")  # the whole export runs on CPU
+    hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")  # hyperparameters come from the model folder's config.json
+    SVCVITS = SynthesizerTrn(
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **hps.model)
+    _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)  # load weights from model.pth; third argument is left as None
+    _ = SVCVITS.eval().to(device)
+    for i in SVCVITS.parameters():
+        i.requires_grad = False  # freeze every parameter; no gradients are needed for export
+
+    num_frames = 200  # length of the dummy inputs; the frame axes are declared dynamic in daxes below
+
+    test_hidden_unit = torch.rand(1, num_frames, SVCVITS.gin_channels)  # dummy "c" input: (1, num_frames, gin_channels)
+    test_pitch = torch.rand(1, num_frames)  # dummy "f0" input
+    test_vol = torch.rand(1, num_frames)  # dummy "vol" input; only exported when SVCVITS.vol_embedding is truthy
+    test_mel2ph = torch.LongTensor(torch.arange(0, num_frames)).unsqueeze(0)  # frame indices 0..num_frames-1 with a leading batch axis
+    test_uv = torch.ones(1, num_frames, dtype=torch.float32)  # dummy "uv" input, all ones
+    test_noise = torch.randn(1, 192, num_frames)  # dummy "noise" input with 192 channels
+    test_sid = torch.LongTensor([0])  # single speaker id 0; replaced by a mix matrix below when export_mix is True
+    export_mix = True
+    if len(hps.spk) < 2:  # speaker mixing is exported only when the config lists at least two speakers
+        export_mix = False
+
+    if export_mix:
+        spk_mix = []
+        n_spk = len(hps.spk)
+        for i in range(n_spk):
+            spk_mix.append(1.0/float(n_spk))  # equal weight for every speaker
+        test_sid = torch.tensor(spk_mix)
+        SVCVITS.export_chara_mix(hps.spk)  # presumably switches the model to speaker-mix mode; confirm in model_onnx_speaker_mix
+        test_sid = test_sid.unsqueeze(0)
+        test_sid = test_sid.repeat(num_frames, 1)  # one row of mix weights per frame: (num_frames, n_spk)
+
+    SVCVITS.eval()  # NOTE(review): eval() was already called right after loading; this repeat looks redundant, confirm
+
+    if export_mix:  # in mix mode axis 0 of "sid" (one row per frame) is dynamic as well
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2],
+            "sid":[0]
+        }
+    else:
+        daxes = {
+            "c": [0, 1],
+            "f0": [1],
+            "mel2ph": [1],
+            "uv": [1],
+            "noise": [2]
+        }
+
+    input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]  # listed in the same order as the tensors in test_inputs
+    output_names = ["audio", ]
+
+    if SVCVITS.vol_embedding:  # add the optional "vol" input (name, dynamic axis, dummy tensor) only when the model uses it
+        input_names.append("vol")
+        vol_dadict = {"vol" : [1]}
+        daxes.update(vol_dadict)
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device)
+        )
+    else:
+        test_inputs = (
+            test_hidden_unit.to(device),
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device)
+        )
+
+    # SVCVITS = torch.jit.script(SVCVITS)
+    SVCVITS(test_hidden_unit.to(device),  # dry-run forward before export; the result is discarded
+            test_pitch.to(device),
+            test_mel2ph.to(device),
+            test_uv.to(device),
+            test_noise.to(device),
+            test_sid.to(device),
+            test_vol.to(device))  # NOTE(review): test_vol is passed even when vol_embedding is falsy; confirm forward accepts it
+
+    SVCVITS.dec.OnnxExport()  # presumably puts the decoder into its ONNX-export mode; confirm in the decoder class
+
+    torch.onnx.export(
+        SVCVITS,
+        test_inputs,
+        f"checkpoints/{path}/{path}_SoVits.onnx",  # the ONNX file is written inside the model folder
+        dynamic_axes=daxes,
+        do_constant_folding=False,
+        opset_version=16,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names
+    )
+
+    vec_lay = "layer-12" if SVCVITS.gin_channels == 768 else "layer-9"  # label used in the "Hubert" field below
+    spklist = []
+    for key in hps.spk.keys():
+        spklist.append(key)  # collect the speaker names (keys of hps.spk)
+
+    MoeVSConf = {  # metadata describing the exported model, dumped as JSON below
+        "Folder" : f"{path}",
+        "Name" : f"{path}",
+        "Type" : "SoVits",
+        "Rate" : hps.data.sampling_rate,
+        "Hop" : hps.data.hop_length,
+        "Hubert": f"vec-{SVCVITS.gin_channels}-{vec_lay}",
+        "SoVits4": True,
+        "SoVits3": False,
+        "CharaMix": export_mix,
+        "Volume": SVCVITS.vol_embedding,
+        "HiddenSize": SVCVITS.gin_channels,
+        "Characters": spklist
+    }
+
+    with open(f"checkpoints/{path}.json", 'w') as MoeVsConfFile:  # NOTE: written beside the model folder (checkpoints/{path}.json), not inside it
+        json.dump(MoeVSConf, MoeVsConfFile, indent = 4)
 
 
 if __name__ == '__main__':
-    main(True)
+    main()
diff --git a/onnx_export_old.py b/onnx_export_old.py
@@ -0,0 +1,56 @@
+import torch
+
+import utils
+from onnxexport.model_onnx import SynthesizerTrn
+
+
+def main(NetExport):  # Legacy exporter: writes checkpoints/{path}/model.onnx when NetExport is truthy, otherwise does nothing.
+    path = "SoVits4.0"  # hard-coded model folder name under checkpoints/
+    if NetExport:
+        device = torch.device("cpu")  # the whole export runs on CPU
+        hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")  # hyperparameters come from the model folder's config.json
+        SVCVITS = SynthesizerTrn(
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            **hps.model)
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", SVCVITS, None)  # load weights from model.pth; third argument is left as None
+        _ = SVCVITS.eval().to(device)
+        for i in SVCVITS.parameters():
+            i.requires_grad = False  # freeze every parameter; no gradients are needed for export
+
+        n_frame = 10  # length of the dummy inputs; the frame axes are declared dynamic below
+        test_hidden_unit = torch.rand(1, n_frame, 256)  # dummy "c" input with a fixed last dimension of 256
+        test_pitch = torch.rand(1, n_frame)  # dummy "f0" input
+        test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None] # torch.LongTensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).unsqueeze(0)
+        test_uv = torch.ones(1, n_frame, dtype=torch.float32)  # dummy "uv" input, all ones
+        test_noise = torch.randn(1, 192, n_frame)  # dummy "noise" input with 192 channels
+        test_sid = torch.LongTensor([0])  # single speaker id 0
+        input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]  # listed in the same order as the tensors passed below
+        output_names = ["audio", ]
+
+        torch.onnx.export(SVCVITS,
+                          (
+                              test_hidden_unit.to(device),
+                              test_pitch.to(device),
+                              test_mel2ph.to(device),
+                              test_uv.to(device),
+                              test_noise.to(device),
+                              test_sid.to(device)
+                          ),
+                          f"checkpoints/{path}/model.onnx",  # the ONNX file is written inside the model folder
+                          dynamic_axes={  # "sid" is absent here, so it keeps its traced shape
+                              "c": [0, 1],
+                              "f0": [1],
+                              "mel2ph": [1],
+                              "uv": [1],
+                              "noise": [2],
+                          },
+                          do_constant_folding=False,
+                          opset_version=16,
+                          verbose=False,
+                          input_names=input_names,
+                          output_names=output_names)
+
+
+if __name__ == '__main__':  # script entry point
+    main(True)  # always run the export when executed directly
diff --git a/onnx_export_speaker_mix.py b/onnx_export_speaker_mix.py