reha committed on
Commit 0af542d
1 Parent(s): 5d50ffe

Upload 17 files

configs/config.json ADDED
@@ -0,0 +1,90 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 1000,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 0.0001,
+     "betas": [
+       0.8,
+       0.99
+     ],
+     "eps": 1e-09,
+     "batch_size": 12,
+     "fp16_run": false,
+     "lr_decay": 0.999875,
+     "segment_size": 17920,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "use_sr": true,
+     "max_speclen": 384,
+     "port": "8001"
+   },
+   "data": {
+     "training_files": "filelists/train.txt",
+     "validation_files": "filelists/val.txt",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 32000,
+     "filter_length": 1280,
+     "hop_length": 320,
+     "win_length": 1280,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": null
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [
+       3,
+       7,
+       11
+     ],
+     "resblock_dilation_sizes": [
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ],
+       [
+         1,
+         3,
+         5
+       ]
+     ],
+     "upsample_rates": [
+       10,
+       8,
+       2,
+       2
+     ],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [
+       16,
+       16,
+       4,
+       4
+     ],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "ssl_dim": 256,
+     "n_speakers": 2
+   },
+   "spk": {
+     "Ztech": 0
+   }
+ }
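For orientation, here is a minimal sketch of how this config can be inspected outside the training scripts. The repo itself loads it through utils.get_hparams_from_file (see inference/infer_tool.py below); plain json is used here only to illustrate the structure, and the printed values are taken from the file above.

import json

# load the hyperparameter file added in this commit
with open("configs/config.json") as f:
    hps = json.load(f)

# the top-level groups mirror how the values are consumed:
# "train" -> optimizer/schedule, "data" -> audio front end, "model" -> network sizes, "spk" -> speaker ids
print(hps["train"]["learning_rate"])        # 0.0001
print(hps["data"]["sampling_rate"])         # 32000
print(hps["model"]["ssl_dim"], hps["spk"])  # 256 {'Ztech': 0}

# one training segment is segment_size / hop_length = 17920 / 320 = 56 frames of audio
assert hps["train"]["segment_size"] % hps["data"]["hop_length"] == 0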
filelists/test.txt ADDED
@@ -0,0 +1,7 @@
+ ./dataset/32k/yunhao/001829.wav
+ ./dataset/32k/yunhao/001827.wav
+ ./dataset/32k/jishuang/000104.wav
+ ./dataset/32k/nen/kne110_005.wav
+ ./dataset/32k/nen/kne110_004.wav
+ ./dataset/32k/jishuang/000223.wav
+ ./dataset/32k/yunhao/001828.wav
filelists/train.txt ADDED
File without changes
filelists/val.txt ADDED
@@ -0,0 +1,6 @@
+ ./dataset/32k/nen/kne110_005.wav
+ ./dataset/32k/yunhao/001827.wav
+ ./dataset/32k/jishuang/000104.wav
+ ./dataset/32k/jishuang/000223.wav
+ ./dataset/32k/nen/kne110_004.wav
+ ./dataset/32k/yunhao/001828.wav
hubert/__init__.py ADDED
File without changes
hubert/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes).
hubert/__pycache__/hubert_model.cpython-310.pyc ADDED
Binary file (7.52 kB).
hubert/app.py ADDED
@@ -0,0 +1,70 @@
+ import io
+
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import soundfile
+ import torch
+ from inference.infer_tool import Svc
+ import logging
+
+ logging.getLogger('numba').setLevel(logging.WARNING)
+
+ model_name = "logs/32k/G_98000.pth"
+ config_name = "configs/config.json"
+
+ svc_model = Svc(model_name, config_name)
+ sid_map = {
+     "taffy": "Ztech"  # map the UI speaker name to the speaker key in configs/config.json
+ }
+
+
+ def vc_fn(sid, input_audio, vc_transform):
+     if input_audio is None:
+         return "You need to upload an audio", None
+     sampling_rate, audio = input_audio
+     # print(audio.shape, sampling_rate)
+     duration = audio.shape[0] / sampling_rate
+     if duration > 45:
+         return "Please upload audio shorter than 45 s; run the conversion locally for longer audio", None
+     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+     if len(audio.shape) > 1:
+         audio = librosa.to_mono(audio.transpose(1, 0))
+     if sampling_rate != 16000:
+         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+     print(audio.shape)
+     out_wav_path = io.BytesIO()
+     soundfile.write(out_wav_path, audio, 16000, format="wav")
+     out_wav_path.seek(0)
+
+     sid = sid_map[sid]
+     out_audio, out_sr = svc_model.infer(sid, vc_transform, out_wav_path)
+     _audio = out_audio.cpu().numpy()
+     return "Success", (32000, _audio)
+
+
+ app = gr.Blocks()
+ with app:
+     with gr.Tabs():
+         with gr.TabItem("Basic"):
+             gr.Markdown(value="""
+                 This is the online demo of the sovits 3.0 32 kHz model for AI 草莓猫taffy.
+
+                 Please read the [AI 粘连科技 model usage agreement](https://huggingface.co/spaces/reha/Stick_Tech/blob/main/terms.md) before using this model.
+
+                 粘连科技Official@bilibili: [click to follow](https://space.bilibili.com/248582596)
+
+                 To use this demo locally, git lfs clone this repo, install requirements.txt, then run app.py.
+
+                 This project is adapted from https://huggingface.co/spaces/innnky/nyaru-svc-3.0
+
+                 For local synthesis, you can remove the 45 s duration check in vc_fn to lift the length limit.""")
+             sid = gr.Dropdown(label="Speaker", choices=["taffy"], value="taffy")
+             vc_input3 = gr.Audio(label="Upload audio (shorter than 45 s)")
+             vc_transform = gr.Number(label="Pitch shift (integer, positive or negative, in semitones; +12 is one octave up)", value=0)
+             vc_submit = gr.Button("Convert", variant="primary")
+             vc_output1 = gr.Textbox(label="Output Message")
+             vc_output2 = gr.Audio(label="Output Audio")
+             vc_submit.click(vc_fn, [sid, vc_input3, vc_transform], [vc_output1, vc_output2])
+
+     app.launch()
hubert/hubert-soft-0d54a1f4.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
+ size 378435957
hubert/hubert_model.py ADDED
@@ -0,0 +1,222 @@
+ import copy
+ import random
+ from typing import Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as t_func
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+
+
+ class Hubert(nn.Module):
+     def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
+         super().__init__()
+         self._mask = mask
+         self.feature_extractor = FeatureExtractor()
+         self.feature_projection = FeatureProjection()
+         self.positional_embedding = PositionalConvEmbedding()
+         self.norm = nn.LayerNorm(768)
+         self.dropout = nn.Dropout(0.1)
+         self.encoder = TransformerEncoder(
+             nn.TransformerEncoderLayer(
+                 768, 12, 3072, activation="gelu", batch_first=True
+             ),
+             12,
+         )
+         self.proj = nn.Linear(768, 256)
+
+         self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
+         self.label_embedding = nn.Embedding(num_label_embeddings, 256)
+
+     def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         mask = None
+         if self.training and self._mask:
+             mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
+             x[mask] = self.masked_spec_embed.to(x.dtype)
+         return x, mask
+
+     def encode(
+         self, x: torch.Tensor, layer: Optional[int] = None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         x = self.feature_extractor(x)
+         x = self.feature_projection(x.transpose(1, 2))
+         x, mask = self.mask(x)
+         x = x + self.positional_embedding(x)
+         x = self.dropout(self.norm(x))
+         x = self.encoder(x, output_layer=layer)
+         return x, mask
+
+     def logits(self, x: torch.Tensor) -> torch.Tensor:
+         logits = torch.cosine_similarity(
+             x.unsqueeze(2),
+             self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
+             dim=-1,
+         )
+         return logits / 0.1
+
+     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         x, mask = self.encode(x)
+         x = self.proj(x)
+         logits = self.logits(x)
+         return logits, mask
+
+
+ class HubertSoft(Hubert):
+     def __init__(self):
+         super().__init__()
+
+     @torch.inference_mode()
+     def units(self, wav: torch.Tensor) -> torch.Tensor:
+         wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+         x, _ = self.encode(wav)
+         return self.proj(x)
+
+
+ class FeatureExtractor(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
+         self.norm0 = nn.GroupNorm(512, 512)
+         self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
+         self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
+         self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = t_func.gelu(self.norm0(self.conv0(x)))
+         x = t_func.gelu(self.conv1(x))
+         x = t_func.gelu(self.conv2(x))
+         x = t_func.gelu(self.conv3(x))
+         x = t_func.gelu(self.conv4(x))
+         x = t_func.gelu(self.conv5(x))
+         x = t_func.gelu(self.conv6(x))
+         return x
+
+
+ class FeatureProjection(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.norm = nn.LayerNorm(512)
+         self.projection = nn.Linear(512, 768)
+         self.dropout = nn.Dropout(0.1)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.norm(x)
+         x = self.projection(x)
+         x = self.dropout(x)
+         return x
+
+
+ class PositionalConvEmbedding(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.conv = nn.Conv1d(
+             768,
+             768,
+             kernel_size=128,
+             padding=128 // 2,
+             groups=16,
+         )
+         self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.conv(x.transpose(1, 2))
+         x = t_func.gelu(x[:, :, :-1])
+         return x.transpose(1, 2)
+
+
+ class TransformerEncoder(nn.Module):
+     def __init__(
+         self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
+     ) -> None:
+         super(TransformerEncoder, self).__init__()
+         self.layers = nn.ModuleList(
+             [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
+         )
+         self.num_layers = num_layers
+
+     def forward(
+         self,
+         src: torch.Tensor,
+         mask: torch.Tensor = None,
+         src_key_padding_mask: torch.Tensor = None,
+         output_layer: Optional[int] = None,
+     ) -> torch.Tensor:
+         output = src
+         for layer in self.layers[:output_layer]:
+             output = layer(
+                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
+             )
+         return output
+
+
+ def _compute_mask(
+     shape: Tuple[int, int],
+     mask_prob: float,
+     mask_length: int,
+     device: torch.device,
+     min_masks: int = 0,
+ ) -> torch.Tensor:
+     batch_size, sequence_length = shape
+
+     if mask_length < 1:
+         raise ValueError("`mask_length` has to be bigger than 0.")
+
+     if mask_length > sequence_length:
+         raise ValueError(
+             f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
+         )
+
+     # compute number of masked spans in batch
+     num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
+     num_masked_spans = max(num_masked_spans, min_masks)
+
+     # make sure num masked indices <= sequence_length
+     if num_masked_spans * mask_length > sequence_length:
+         num_masked_spans = sequence_length // mask_length
+
+     # SpecAugment mask to fill
+     mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
+
+     # uniform distribution to sample from, make sure that offset samples are < sequence_length
+     uniform_dist = torch.ones(
+         (batch_size, sequence_length - (mask_length - 1)), device=device
+     )
+
+     # get random indices to mask
+     mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
+
+     # expand masked indices to masked spans
+     mask_indices = (
+         mask_indices.unsqueeze(dim=-1)
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     offsets = (
+         torch.arange(mask_length, device=device)[None, None, :]
+         .expand((batch_size, num_masked_spans, mask_length))
+         .reshape(batch_size, num_masked_spans * mask_length)
+     )
+     mask_idxs = mask_indices + offsets
+
+     # scatter indices to mask
+     mask = mask.scatter(1, mask_idxs, True)
+
+     return mask
+
+
+ def hubert_soft(
+     path: str,
+ ) -> HubertSoft:
+     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
+     Args:
+         path (str): path of a pretrained model
+     """
+     hubert = HubertSoft()
+     checkpoint = torch.load(path)
+     consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+     hubert.load_state_dict(checkpoint)
+     hubert.eval()
+     return hubert
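A short usage sketch for the encoder defined above, assuming the LFS checkpoint added in this commit and 16 kHz mono input shaped (batch, 1, samples), as in inference/infer_tool.py. The wav path is taken from filelists/test.txt and is only illustrative.

import torchaudio
from hubert.hubert_model import hubert_soft

# load the pretrained soft content encoder shipped in this commit
hubert = hubert_soft("hubert/hubert-soft-0d54a1f4.pt")

# HuBERT-Soft operates on 16 kHz mono audio
wav, sr = torchaudio.load("./dataset/32k/yunhao/001829.wav")
wav = torchaudio.functional.resample(wav, sr, 16000)

units = hubert.units(wav.unsqueeze(0))  # -> (1, n_frames, 256) soft units
print(units.shape)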
hubert/put_hubert_ckpt_here ADDED
File without changes
inference/__init__.py ADDED
File without changes
inference/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (130 Bytes).
inference/__pycache__/infer_tool.cpython-310.pyc ADDED
Binary file (8.6 kB).
inference/chunks_temp.json ADDED
@@ -0,0 +1 @@
+ {"info": "temp_dict", "cd65e3ce661250b7aea16ea398d13925": {"chunks": {"0": {"slice": false, "split_time": "0,556685"}}, "time": 1670798600}, "2959d9aae0e4172f27b54e452bf3b77c": {"chunks": {"0": {"slice": false, "split_time": "0,298726"}, "1": {"slice": true, "split_time": "298726,303854"}, "2": {"slice": false, "split_time": "303854,631469"}}, "time": 1670427122}, "c617b9cc74eeed7940d9e6f47c0c5bb6": {"chunks": {"0": {"slice": true, "split_time": "0,294190"}, "1": {"slice": false, "split_time": "294190,2014632"}, "2": {"slice": true, "split_time": "2014632,2021279"}, "3": {"slice": false, "split_time": "2021279,2800739"}, "4": {"slice": true, "split_time": "2800739,2816932"}, "5": {"slice": false, "split_time": "2816932,4554807"}, "6": {"slice": true, "split_time": "4554807,4572392"}, "7": {"slice": false, "split_time": "4572392,5337074"}, "8": {"slice": true, "split_time": "5337074,6197945"}, "9": {"slice": false, "split_time": "6197945,7043120"}, "10": {"slice": true, "split_time": "7043120,7195239"}}, "time": 1670428761}, "e7bba7ef02e7bba00520d7171a529b02": {"chunks": {"0": {"slice": false, "split_time": "0,435739"}}, "time": 1670472899}, "3517fa06b9fe06618393107005ce145f": {"chunks": {"0": {"slice": false, "split_time": "0,245893"}}, "time": 1670475779}, "2250dc696d4c0025d766e5234912e446": {"chunks": {"0": {"slice": true, "split_time": "0,223394"}, "1": {"slice": false, "split_time": "223394,546311"}, "2": {"slice": true, "split_time": "546311,572526"}, "3": {"slice": false, "split_time": "572526,1004349"}, "4": {"slice": true, "split_time": "1004349,1090615"}, "5": {"slice": false, "split_time": "1090615,1415280"}, "6": {"slice": true, "split_time": "1415280,1418069"}, "7": {"slice": false, "split_time": "1418069,1659131"}, "8": {"slice": true, "split_time": "1659131,1661453"}, "9": {"slice": false, "split_time": "1661453,1888827"}, "10": {"slice": true, "split_time": "1888827,1960051"}, "11": {"slice": false, "split_time": "1960051,2230836"}, "12": {"slice": true, "split_time": "2230836,2306854"}, "13": {"slice": false, "split_time": "2306854,2583422"}, "14": {"slice": true, "split_time": "2583422,2649271"}, "15": {"slice": false, "split_time": "2649271,2929916"}, "16": {"slice": true, "split_time": "2929916,2977116"}, "17": {"slice": false, "split_time": "2977116,3431901"}, "18": {"slice": true, "split_time": "3431901,3504853"}}, "time": 1670476034}, "7030c457119b4710d0091ce67f58a125": {"chunks": {"0": {"slice": true, "split_time": "0,9640"}, "1": {"slice": false, "split_time": "9640,209081"}, "2": {"slice": true, "split_time": "209081,210126"}, "3": {"slice": false, "split_time": "210126,504084"}, "4": {"slice": true, "split_time": "504084,505625"}, "5": {"slice": false, "split_time": "505625,768061"}, "6": {"slice": true, "split_time": "768061,795550"}}, "time": 1670627906}, "7d9274b960035df4dacfdc95b492cf7c": {"chunks": {"0": {"slice": false, "split_time": "0,337196"}, "1": {"slice": true, "split_time": "337196,347378"}, "2": {"slice": false, "split_time": "347378,1022501"}, "3": {"slice": true, "split_time": "1022501,1034918"}, "4": {"slice": false, "split_time": "1034918,2070080"}}, "time": 1670487808}, "0f3c73ebbda2101325eb6453551514af": {"chunks": {"0": {"slice": true, "split_time": "0,475043"}, "1": {"slice": false, "split_time": "475043,1288182"}, "2": {"slice": true, "split_time": "1288182,1303033"}, "3": {"slice": false, "split_time": "1303033,2101474"}, "4": {"slice": true, "split_time": "2101474,2106811"}, "5": {"slice": false, "split_time": "2106811,3055223"}, "6": 
{"slice": true, "split_time": "3055223,3516745"}, "7": {"slice": false, "split_time": "3516745,4348812"}, "8": {"slice": true, "split_time": "4348812,4354034"}, "9": {"slice": false, "split_time": "4354034,4756434"}, "10": {"slice": true, "split_time": "4756434,4757558"}, "11": {"slice": false, "split_time": "4757558,7830503"}, "12": {"slice": true, "split_time": "7830503,7839320"}, "13": {"slice": false, "split_time": "7839320,8051918"}, "14": {"slice": true, "split_time": "8051918,8184993"}}, "time": 1670713815}, "c544d7842bb2fa325a0aa9a21b7f9503": {"chunks": {"0": {"slice": true, "split_time": "0,134375"}, "1": {"slice": false, "split_time": "134375,947514"}, "2": {"slice": true, "split_time": "947514,962365"}, "3": {"slice": false, "split_time": "962365,1760806"}, "4": {"slice": true, "split_time": "1760806,1766143"}, "5": {"slice": false, "split_time": "1766143,2714555"}, "6": {"slice": true, "split_time": "2714555,2888274"}}, "time": 1670586817}, "b663f58fd3b1da710febfa0a45f447f7": {"chunks": {"0": {"slice": false, "split_time": "0,333120"}}, "time": 1670628277}, "dccf823870a0c278d53690469e26ce5e": {"chunks": {"0": {"slice": false, "split_time": "0,174322"}, "1": {"slice": true, "split_time": "174322,178984"}, "2": {"slice": false, "split_time": "178984,592293"}}, "time": 1670750467}, "1ea41e9dc88c7c28ac2ac7fc637f929f": {"chunks": {"0": {"slice": false, "split_time": "0,177025"}, "1": {"slice": true, "split_time": "177025,180753"}, "2": {"slice": false, "split_time": "180753,307984"}, "3": {"slice": true, "split_time": "307984,309679"}, "4": {"slice": false, "split_time": "309679,602116"}, "5": {"slice": true, "split_time": "602116,604118"}, "6": {"slice": false, "split_time": "604118,608816"}}, "time": 1670750647}, "70bfe94e1cfe1b9eb8e575d15a7e8bb2": {"chunks": {"0": {"slice": false, "split_time": "0,240584"}}, "time": 1670751256}, "a4296c8efc33bd7857cb3b2f6f9b912b": {"chunks": {"0": {"slice": false, "split_time": "0,314870"}}, "time": 1670751262}, "4f0e18a681b221edfb66f601cb0257dc": {"chunks": {"0": {"slice": true, "split_time": "0,13985"}, "1": {"slice": false, "split_time": "13985,114094"}, "2": {"slice": true, "split_time": "114094,117459"}, "3": {"slice": false, "split_time": "117459,326432"}, "4": {"slice": true, "split_time": "326432,328157"}, "5": {"slice": false, "split_time": "328157,462942"}, "6": {"slice": true, "split_time": "462942,480279"}, "7": {"slice": false, "split_time": "480279,615909"}, "8": {"slice": true, "split_time": "615909,620472"}}, "time": 1670751397}}
inference/infer_tool.py ADDED
@@ -0,0 +1,326 @@
+ import hashlib
+ import io
+ import json
+ import logging
+ import os
+ import time
+ from pathlib import Path
+
+ import librosa
+ import maad
+ import numpy as np
+ # import onnxruntime
+ import parselmouth
+ import soundfile
+ import torch
+ import torchaudio
+
+ from hubert import hubert_model
+ import utils
+ from models import SynthesizerTrn
+
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
+
+
+ def read_temp(file_name):
+     if not os.path.exists(file_name):
+         with open(file_name, "w") as f:
+             f.write(json.dumps({"info": "temp_dict"}))
+         return {}
+     else:
+         try:
+             with open(file_name, "r") as f:
+                 data = f.read()
+             data_dict = json.loads(data)
+             if os.path.getsize(file_name) > 50 * 1024 * 1024:
+                 f_name = file_name.split("/")[-1]
+                 print(f"clean {f_name}")
+                 for wav_hash in list(data_dict.keys()):
+                     if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
+                         del data_dict[wav_hash]
+         except Exception as e:
+             print(e)
+             print(f"{file_name} error, rebuilding file automatically")
+             data_dict = {"info": "temp_dict"}
+         return data_dict
+
+
+ def write_temp(file_name, data):
+     with open(file_name, "w") as f:
+         f.write(json.dumps(data))
+
+
+ def timeit(func):
+     def run(*args, **kwargs):
+         t = time.time()
+         res = func(*args, **kwargs)
+         print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
+         return res
+
+     return run
+
+
+ def format_wav(audio_path):
+     if Path(audio_path).suffix == '.wav':
+         return
+     raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
+     soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
+
+
+ def get_end_file(dir_path, end):
+     file_lists = []
+     for root, dirs, files in os.walk(dir_path):
+         files = [f for f in files if f[0] != '.']
+         dirs[:] = [d for d in dirs if d[0] != '.']
+         for f_file in files:
+             if f_file.endswith(end):
+                 file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
+     return file_lists
+
+
+ def get_md5(content):
+     return hashlib.new("md5", content).hexdigest()
+
+
+ def resize2d_f0(x, target_len):
+     source = np.array(x)
+     source[source < 0.001] = np.nan
+     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
+                        source)
+     res = np.nan_to_num(target)
+     return res
+
+
+ def get_f0(x, p_len, f0_up_key=0):
+
+     time_step = 160 / 16000 * 1000
+     f0_min = 50
+     f0_max = 1100
+     f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+     f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+     f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
+         time_step=time_step / 1000, voicing_threshold=0.6,
+         pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
+
+     pad_size = (p_len - len(f0) + 1) // 2
+     if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+         f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
+
+     f0 *= pow(2, f0_up_key / 12)
+     f0_mel = 1127 * np.log(1 + f0 / 700)
+     f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
+     f0_mel[f0_mel <= 1] = 1
+     f0_mel[f0_mel > 255] = 255
+     f0_coarse = np.rint(f0_mel).astype(int)  # np.int is removed in recent numpy; plain int keeps the behaviour
+     return f0_coarse, f0
+
+
+ def clean_pitch(input_pitch):
+     num_nan = np.sum(input_pitch == 1)
+     if num_nan / len(input_pitch) > 0.9:
+         input_pitch[input_pitch != 1] = 1
+     return input_pitch
+
+
+ def plt_pitch(input_pitch):
+     input_pitch = input_pitch.astype(float)
+     input_pitch[input_pitch == 1] = np.nan
+     return input_pitch
+
+
+ def f0_to_pitch(ff):
+     f0_pitch = 69 + 12 * np.log2(ff / 440)
+     return f0_pitch
+
+
+ def fill_a_to_b(a, b):
+     if len(a) < len(b):
+         for _ in range(0, len(b) - len(a)):
+             a.append(a[0])
+
+
+ def mkdir(paths: list):
+     for path in paths:
+         if not os.path.exists(path):
+             os.mkdir(path)
+
+
+ class Svc(object):
+     def __init__(self, net_g_path, config_path, hubert_path="hubert/hubert-soft-0d54a1f4.pt",
+                  onnx=False):
+         self.onnx = onnx
+         self.net_g_path = net_g_path
+         self.hubert_path = hubert_path
+         self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.net_g_ms = None
+         self.hps_ms = utils.get_hparams_from_file(config_path)
+         self.target_sample = self.hps_ms.data.sampling_rate
+         self.hop_size = self.hps_ms.data.hop_length
+         self.speakers = {}
+         for spk, sid in self.hps_ms.spk.items():
+             self.speakers[sid] = spk
+         self.spk2id = self.hps_ms.spk
+         # load the HuBERT soft content encoder
+         self.hubert_soft = hubert_model.hubert_soft(hubert_path)
+         if torch.cuda.is_available():
+             self.hubert_soft = self.hubert_soft.cuda()
+         self.load_model()
+
+     def load_model(self):
+         # build the synthesizer from the model config
+         if self.onnx:
+             raise NotImplementedError
+             # self.net_g_ms = SynthesizerTrnForONNX(
+             #     178,
+             #     self.hps_ms.data.filter_length // 2 + 1,
+             #     self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
+             #     n_speakers=self.hps_ms.data.n_speakers,
+             #     **self.hps_ms.model)
+             # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
+         else:
+             self.net_g_ms = SynthesizerTrn(
+                 self.hps_ms.data.filter_length // 2 + 1,
+                 self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
+                 **self.hps_ms.model)
+             _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
+         if "half" in self.net_g_path and torch.cuda.is_available():
+             _ = self.net_g_ms.half().eval().to(self.dev)
+         else:
+             _ = self.net_g_ms.eval().to(self.dev)
+
+     def get_units(self, source, sr):
+
+         source = source.unsqueeze(0).to(self.dev)
+         with torch.inference_mode():
+             start = time.time()
+             units = self.hubert_soft.units(source)
+             use_time = time.time() - start
+             print("hubert use time:{}".format(use_time))
+             return units
+
+     def get_unit_pitch(self, in_path, tran):
+         source, sr = torchaudio.load(in_path)
+         source = torchaudio.functional.resample(source, sr, 16000)
+         if len(source.shape) == 2 and source.shape[1] >= 2:
+             source = torch.mean(source, dim=0).unsqueeze(0)
+         soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
+         f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
+         return soft, f0
+
+     def infer(self, speaker_id, tran, raw_path):
+         if type(speaker_id) == str:
+             speaker_id = self.spk2id[speaker_id]
+         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
+         soft, pitch = self.get_unit_pitch(raw_path, tran)
+         f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
+         if "half" in self.net_g_path and torch.cuda.is_available():
+             stn_tst = torch.HalfTensor(soft)
+         else:
+             stn_tst = torch.FloatTensor(soft)
+         with torch.no_grad():
+             x_tst = stn_tst.unsqueeze(0).to(self.dev)
+             start = time.time()
+             x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
+             audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0, 0].data.float()
+             use_time = time.time() - start
+             print("vits use time:{}".format(use_time))
+         return audio, audio.shape[-1]
+
+
+ # class SvcONNXInferModel(object):
+ #     def __init__(self, hubert_onnx, vits_onnx, config_path):
+ #         self.config_path = config_path
+ #         self.vits_onnx = vits_onnx
+ #         self.hubert_onnx = hubert_onnx
+ #         self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ])
+ #         self.inspect_onnx(self.hubert_onnx_session)
+ #         self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ])
+ #         self.inspect_onnx(self.vits_onnx_session)
+ #         self.hps_ms = utils.get_hparams_from_file(self.config_path)
+ #         self.target_sample = self.hps_ms.data.sampling_rate
+ #         self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length)
+ #
+ #     @staticmethod
+ #     def inspect_onnx(session):
+ #         for i in session.get_inputs():
+ #             print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
+ #         for i in session.get_outputs():
+ #             print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
+ #
+ #     def infer(self, speaker_id, tran, raw_path):
+ #         sid = np.array([int(speaker_id)], dtype=np.int64)
+ #         soft, pitch = self.get_unit_pitch(raw_path, tran)
+ #         pitch = np.expand_dims(pitch, axis=0).astype(np.int64)
+ #         stn_tst = soft
+ #         x_tst = np.expand_dims(stn_tst, axis=0)
+ #         x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64)
+ #         # run inference with ONNX Runtime
+ #         start = time.time()
+ #         audio = self.vits_onnx_session.run(output_names=["audio"],
+ #                                            input_feed={
+ #                                                "hidden_unit": x_tst,
+ #                                                "lengths": x_tst_lengths,
+ #                                                "pitch": pitch,
+ #                                                "sid": sid,
+ #                                            })[0][0, 0]
+ #         use_time = time.time() - start
+ #         print("vits_onnx_session.run time:{}".format(use_time))
+ #         audio = torch.from_numpy(audio)
+ #         return audio, audio.shape[-1]
+ #
+ #     def get_units(self, source, sr):
+ #         source = torchaudio.functional.resample(source, sr, 16000)
+ #         if len(source.shape) == 2 and source.shape[1] >= 2:
+ #             source = torch.mean(source, dim=0).unsqueeze(0)
+ #         source = source.unsqueeze(0)
+ #         # run inference with ONNX Runtime
+ #         start = time.time()
+ #         units = self.hubert_onnx_session.run(output_names=["embed"],
+ #                                              input_feed={"source": source.numpy()})[0]
+ #         use_time = time.time() - start
+ #         print("hubert_onnx_session.run time:{}".format(use_time))
+ #         return units
+ #
+ #     def transcribe(self, source, sr, length, transform):
+ #         feature_pit = self.feature_input.compute_f0(source, sr)
+ #         feature_pit = feature_pit * 2 ** (transform / 12)
+ #         feature_pit = resize2d_f0(feature_pit, length)
+ #         coarse_pit = self.feature_input.coarse_f0(feature_pit)
+ #         return coarse_pit
+ #
+ #     def get_unit_pitch(self, in_path, tran):
+ #         source, sr = torchaudio.load(in_path)
+ #         soft = self.get_units(source, sr).squeeze(0)
+ #         input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran)
+ #         return soft, input_pitch
+
+
+ class RealTimeVC:
+     def __init__(self):
+         self.last_chunk = None
+         self.last_o = None
+         self.chunk_len = 16000  # chunk length
+         self.pre_len = 3840  # crossfade length, a multiple of 640
+
+     """Both input and output are 1-D numpy waveform arrays."""
+
+     def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
+         audio, sr = torchaudio.load(input_wav_path)
+         audio = audio.cpu().numpy()[0]
+         temp_wav = io.BytesIO()
+         if self.last_chunk is None:
+             input_wav_path.seek(0)
+             audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+             audio = audio.cpu().numpy()
+             self.last_chunk = audio[-self.pre_len:]
+             self.last_o = audio
+             return audio[-self.chunk_len:]
+         else:
+             audio = np.concatenate([self.last_chunk, audio])
+             soundfile.write(temp_wav, audio, sr, format="wav")
+             temp_wav.seek(0)
+             audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
+             audio = audio.cpu().numpy()
+             ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
+             self.last_chunk = audio[-self.pre_len:]
+             self.last_o = audio
+             return ret[self.chunk_len:2 * self.chunk_len]
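The Gradio demo in hubert/app.py is the main consumer of Svc; below is a stripped-down file-to-file sketch. The checkpoint and config paths are the ones app.py uses, while the input and output wav names are only illustrative.

import soundfile
from inference.infer_tool import Svc

# build the converter from the generator checkpoint and the config added above
svc = Svc("logs/32k/G_98000.pth", "configs/config.json")

# speaker name from the "spk" block of configs/config.json, pitch shift in semitones, source wav path
audio, length = svc.infer("Ztech", tran=0, raw_path="source.wav")
soundfile.write("converted.wav", audio.cpu().numpy(), svc.target_sample)  # 32000 Hz output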
inference/slicer.py ADDED
@@ -0,0 +1,158 @@
+ import time
+
+ import numpy as np
+ import torch
+ import torchaudio
+ from scipy.ndimage import maximum_filter1d, uniform_filter1d
+
+
+ def timeit(func):
+     def run(*args, **kwargs):
+         t = time.time()
+         res = func(*args, **kwargs)
+         print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
+         return res
+
+     return run
+
+
+ # @timeit
+ def _window_maximum(arr, win_sz):
+     return maximum_filter1d(arr, size=win_sz)[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+
+
+ # @timeit
+ def _window_rms(arr, win_sz):
+     filtered = np.sqrt(uniform_filter1d(np.power(arr, 2), win_sz) - np.power(uniform_filter1d(arr, win_sz), 2))
+     return filtered[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
+
+
+ def level2db(levels, eps=1e-12):
+     return 20 * np.log10(np.clip(levels, a_min=eps, a_max=1))
+
+
+ def _apply_slice(audio, begin, end):
+     if len(audio.shape) > 1:
+         return audio[:, begin: end]
+     else:
+         return audio[begin: end]
+
+
+ class Slicer:
+     def __init__(self,
+                  sr: int,
+                  db_threshold: float = -40,
+                  min_length: int = 5000,
+                  win_l: int = 300,
+                  win_s: int = 20,
+                  max_silence_kept: int = 500):
+         self.db_threshold = db_threshold
+         self.min_samples = round(sr * min_length / 1000)
+         self.win_ln = round(sr * win_l / 1000)
+         self.win_sn = round(sr * win_s / 1000)
+         self.max_silence = round(sr * max_silence_kept / 1000)
+         if not self.min_samples >= self.win_ln >= self.win_sn:
+             raise ValueError('The following condition must be satisfied: min_length >= win_l >= win_s')
+         if not self.max_silence >= self.win_sn:
+             raise ValueError('The following condition must be satisfied: max_silence_kept >= win_s')
+
+     @timeit
+     def slice(self, audio):
+         samples = audio
+         if samples.shape[0] <= self.min_samples:
+             return {"0": {"slice": False, "split_time": f"0,{len(audio)}"}}
+         # get absolute amplitudes
+         abs_amp = np.abs(samples - np.mean(samples))
+         # calculate local maximum with large window
+         win_max_db = level2db(_window_maximum(abs_amp, win_sz=self.win_ln))
+         sil_tags = []
+         left = right = 0
+         while right < win_max_db.shape[0]:
+             if win_max_db[right] < self.db_threshold:
+                 right += 1
+             elif left == right:
+                 left += 1
+                 right += 1
+             else:
+                 if left == 0:
+                     split_loc_l = left
+                 else:
+                     sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                     rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+                     split_win_l = left + np.argmin(rms_db_left)
+                     split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+                 if len(sil_tags) != 0 and split_loc_l - sil_tags[-1][1] < self.min_samples and right < win_max_db.shape[
+                         0] - 1:
+                     right += 1
+                     left = right
+                     continue
+                 if right == win_max_db.shape[0] - 1:
+                     split_loc_r = right + self.win_ln
+                 else:
+                     sil_right_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+                     rms_db_right = level2db(_window_rms(samples[right + self.win_ln - sil_right_n: right + self.win_ln],
+                                                         win_sz=self.win_sn))
+                     split_win_r = right + self.win_ln - sil_right_n + np.argmin(rms_db_right)
+                     split_loc_r = split_win_r + np.argmin(abs_amp[split_win_r: split_win_r + self.win_sn])
+                 sil_tags.append((split_loc_l, split_loc_r))
+                 right += 1
+                 left = right
+         if left != right:
+             sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
+             rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
+             split_win_l = left + np.argmin(rms_db_left)
+             split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
+             sil_tags.append((split_loc_l, samples.shape[0]))
+         if len(sil_tags) == 0:
+             return {"0": {"slice": False, "split_time": f"0,{len(audio)}"}}
+         else:
+             chunks = []
+             # the first silence does not start at sample 0, so add the leading voiced segment
+             if sil_tags[0][0]:
+                 chunks.append({"slice": False, "split_time": f"0,{sil_tags[0][0]}"})
+             for i in range(0, len(sil_tags)):
+                 # mark the voiced segments (skipping the first one)
+                 if i:
+                     chunks.append({"slice": False, "split_time": f"{sil_tags[i - 1][1]},{sil_tags[i][0]}"})
+                 # mark all silence segments
+                 chunks.append({"slice": True, "split_time": f"{sil_tags[i][0]},{sil_tags[i][1]}"})
+             # the last silence does not reach the end of the audio, so add the trailing segment
+             if sil_tags[-1][1] != len(audio):
+                 chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1]},{len(audio)}"})
+             chunk_dict = {}
+             for i in range(len(chunks)):
+                 chunk_dict[str(i)] = chunks[i]
+             return chunk_dict
+
+
+ def cut(audio_path, db_thresh=-30, min_len=5000, win_l=300, win_s=20, max_sil_kept=500):
+     audio, sr = torchaudio.load(audio_path)
+     if len(audio.shape) == 2 and audio.shape[1] >= 2:
+         audio = torch.mean(audio, dim=0).unsqueeze(0)
+     audio = audio.cpu().numpy()[0]
+
+     slicer = Slicer(
+         sr=sr,
+         db_threshold=db_thresh,
+         min_length=min_len,
+         win_l=win_l,
+         win_s=win_s,
+         max_silence_kept=max_sil_kept
+     )
+     chunks = slicer.slice(audio)
+     return chunks
+
+
+ def chunks2audio(audio_path, chunks):
+     chunks = dict(chunks)
+     audio, sr = torchaudio.load(audio_path)
+     if len(audio.shape) == 2 and audio.shape[1] >= 2:
+         audio = torch.mean(audio, dim=0).unsqueeze(0)
+     audio = audio.cpu().numpy()[0]
+     result = []
+     for k, v in chunks.items():
+         tag = v["split_time"].split(",")
+         result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
+     return result, sr
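A brief sketch of how the slicer is intended to be driven. The wav path is illustrative only; the chunk dictionary returned by cut() has the same {"slice": ..., "split_time": "start,end"} layout as the entries cached in inference/chunks_temp.json above.

import soundfile
from inference.slicer import cut, chunks2audio

# detect silence-based split points, then materialise the corresponding audio segments
chunks = cut("./dataset/32k/yunhao/001829.wav", db_thresh=-30)
segments, sr = chunks2audio("./dataset/32k/yunhao/001829.wav", chunks)

for i, (is_silence, data) in enumerate(segments):
    if not is_silence:  # keep only the voiced parts
        soundfile.write(f"segment_{i:02d}.wav", data, sr)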