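"""Gradio demo for so-vits-svc singing voice conversion (猫雷 / Nyaru model).

Pipeline: uploaded audio -> HuBERT-Soft content units plus an F0 track
(transposed by a user-chosen number of semitones) -> VITS synthesizer
resynthesis in the target voice.
"""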
import gradio as gr
import os

# Build the monotonic_align Cython extension in place; the VITS model code depends on it.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import logging

# Silence numba's verbose logging (pulled in via librosa).
numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)
import librosa
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
import soundfile as sf
from preprocess_wave import FeatureInput

def resize2d(x, target_len):
    """Linearly resample a 1-D F0 track to target_len frames.

    Values below 0.001 are treated as unvoiced: they become NaN so the
    interpolation does not smear them into voiced regions, and any frame
    that is still NaN afterwards is written out as 0.
    """
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(
        np.arange(0, len(source) * target_len, len(source)) / target_len,
        np.arange(0, len(source)),
        source)
    return np.nan_to_num(target)
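# Illustrative example: resize2d([100.0, 0.0, 200.0], 6) evaluates the query
# points [0, 0.5, 1, 1.5, 2, 2.5] against xp=[0, 1, 2] with the 0.0 frame as
# NaN, yielding [100, 0, 0, 0, 200, 200] after nan_to_num.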

def transcribe(path, length, transform):
    """Extract an F0 track from a wav file, transpose it by `transform`
    semitones, resample it to `length` frames, and quantize it to the
    coarse pitch bins the synthesizer expects."""
    feature_pitch = featureInput.compute_f0(path)
    feature_pitch = feature_pitch * 2 ** (transform / 12)
    feature_pitch = resize2d(feature_pitch, length)
    return featureInput.coarse_f0(feature_pitch)

def get_text(text, hps):
    # Left over from the TTS pipeline; unused by the voice-conversion path below.
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    print(text_norm.shape)
    return text_norm

# Global conversion counter; a one-element list so vc_fn can mutate it in place.
convert_cnt = [0]

hps_ms = utils.get_hparams_from_file("configs/nyarumul.json")

# Multi-speaker VITS synthesizer configured from the hyperparameter file.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)

# F0 extractor matched to the synthesizer's sampling rate and hop length.
featureInput = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)


# HuBERT-Soft content encoder from soft-vc; expects 16 kHz mono audio.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")

# Load the pretrained 猫雷 (Nyaru) synthesizer weights.
_ = utils.load_checkpoint("nyarumodel.pth", net_g_ms, None)

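# End-to-end conversion: resample the upload to 16 kHz mono, encode it to
# HuBERT-Soft content units, extract and transpose the F0 track, then run the
# synthesizer conditioned on the units, the coarse pitch, and the speaker id.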
def vc_fn(sid, random1, input_audio, vc_transform):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 45:
        return "Please upload audio shorter than 45 seconds; use the Colab notebook to convert longer audio", None
    # Normalize integer PCM to float32 in [-1, 1] and downmix to mono.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    # HuBERT expects a (batch, channel, samples) tensor.
    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
    print(source.shape)
    with torch.inference_mode():
        units = hubert.units(source)
        soft = units.squeeze(0).numpy()
    # Write a 22050 Hz temp file for the F0 extractor.
    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
    sf.write("temp.wav", audio22050, 22050)
    # One coarse pitch value per content unit.
    pitch = transcribe("temp.wav", soft.shape[0], vc_transform)
    pitch = torch.LongTensor(pitch).unsqueeze(0)
    sid = torch.LongTensor([0]) if sid == "猫雷" else torch.LongTensor([1])
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid,
                               noise_scale=float(random1),
                               noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().numpy()
    convert_cnt[0] += 1
    print(convert_cnt[0])
    return "Success", (hps_ms.data.sampling_rate, audio)
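
# Hypothetical local smoke test, bypassing the UI (file names are placeholders):
#   wav, sr = sf.read("sample.wav")
#   msg, (out_sr, out) = vc_fn("猫雷", 0.4, (sr, (wav * 32767).astype(np.int16)), 0)
#   sf.write("converted.wav", out, out_sr)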


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""Compared with the previous model, this one improves audio quality and pitch accuracy, but the low register still has serious problems.

            The lowest note the current 猫雷 model can sing is G#3 (207 Hz). Below that it falls apart outright (the previous model merely went off pitch),

            so do not ask this model to sing at male-voice pitches; use the transpose option to shift the range above 207 Hz.

            [GitHub repository for this model](https://github.com/innnky/so-vits-svc)

            To build and train a model yourself, see this [GitHub repository](https://github.com/IceKyrin/sovits_guide)

            P.S. The model has been updated, so it is no longer the one in the video; the model from the Bilibili video is in the git history (the old dataset apparently contained some stray material that pulled the timbre away from 猫雷's voice).

            """)
            sid = gr.Dropdown(label="Voice", choices=['猫雷'], value="猫雷")
            vc_input3 = gr.Audio(label="Upload audio (under 45 seconds)")
            vc_transform = gr.Number(label="Transpose (integer semitones, positive or negative; +12 is up one octave)", value=0)
            random1 = gr.Number(label="Noise scale (randomness; seems to affect audio quality, keep the default if unsure)", value=0.4)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
        vc_submit.click(vc_fn, [sid, random1, vc_input3, vc_transform], [vc_output1, vc_output2])

app.launch()