import gradio as gr
import os
# Build the monotonic_align Cython extension in place so that `models` can import it.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import logging

numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)
import librosa
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
import soundfile as sf
from preprocess_wave import FeatureInput

def resize2d(x, target_len):
    # Stretch/compress a 1-D F0 contour to target_len frames via linear interpolation.
    # Near-zero (unvoiced) frames are masked as NaN first so they come back as 0.
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len,
                       np.arange(0, len(source)), source)
    res = np.nan_to_num(target)
    return res
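# For instance (hypothetical contour), resize2d([100.0, 110.0, 120.0, 130.0], 8)
# interpolates the 4 original frames across 8 output frames; any frame that was
# unvoiced (< 0.001) would round-trip through NaN and come back as 0.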

def transcribe(path, length, transform):
    # Extract F0 from the wav at `path`, transpose it by `transform` semitones,
    # resample the contour to `length` frames, and quantize to coarse pitch bins.
    featur_pit = featureInput.compute_f0(path)
    featur_pit = featur_pit * 2 ** (transform / 12)
    featur_pit = resize2d(featur_pit, length)
    coarse_pit = featureInput.coarse_f0(featur_pit)
    return coarse_pit
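# The 2**(transform/12) factor is the standard equal-temperament ratio: each
# semitone scales frequency by 2**(1/12), so transform=+12 doubles the F0
# (220 Hz -> 440 Hz, one octave up) and transform=-12 halves it.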

def get_text(text, hps):
    # VITS-style text frontend: map text to symbol IDs, optionally interspersing blank tokens.
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    print(text_norm.shape)
    return text_norm
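# Note: get_text appears unused by the voice-conversion flow below; it is the
# usual VITS text frontend and is kept here as-is.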

convert_cnt = [0]  # mutable counter of completed conversions, shared across requests

hps_ms = utils.get_hparams_from_file("configs/nyarumul.json")
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)

featureInput = FeatureInput(hps_ms.data.sampling_rate, hps_ms.data.hop_length)  # F0 extractor matched to the model's frame rate


hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
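# hubert_soft encodes 16 kHz mono audio into a sequence of "soft" speech units
# (roughly one unit per 20 ms frame) that carry the linguistic content to convert.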

_ = utils.load_checkpoint("nyarumodel.pth", net_g_ms, None)  # load pretrained weights

def vc_fn(sid, random1, input_audio, vc_transform):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    # print(audio.shape,sampling_rate)
    duration = audio.shape[0] / sampling_rate
    if duration > 45:
        return "请上传小于45s的音频,需要转换长音频请使用colab", None
    # Gradio delivers integer PCM; normalize to float32 in [-1, 1].
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))  # downmix to mono
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)  # HuBERT expects 16 kHz

    source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
    print(source.shape)
    with torch.inference_mode():
        units = hubert.units(source)
        soft = units.squeeze(0).numpy()
    # Extract F0 at the synthesizer's 22,050 Hz rate so the pitch contour lines up with the soft units.
    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
    sf.write("temp.wav", audio22050, 22050)
    pitch = transcribe("temp.wav", soft.shape[0], vc_transform)
    pitch = torch.LongTensor(pitch).unsqueeze(0)
    sid = torch.LongTensor([0]) if sid == "猫雷" else torch.LongTensor([1])  # map voice name to speaker index
    stn_tst = torch.FloatTensor(soft)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g_ms.infer(x_tst, x_tst_lengths, pitch=pitch, sid=sid, noise_scale=float(random1),
                               noise_scale_w=0.1, length_scale=1)[0][0, 0].data.float().numpy()
    convert_cnt[0] += 1
    print(convert_cnt[0])
    return "Success", (hps_ms.data.sampling_rate, audio)


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""Compared with the previous model, this one improves audio quality and pitch accuracy somewhat, but the low register still has serious problems.

            The lowest note the 猫雷 model can currently sing is G#3 (207 Hz); below that it breaks down outright (the previous model merely drifted off pitch).

            Please do not ask this model to sing at male vocal pitch; use the transpose option to shift the range above 207 Hz (transposing +12 semitones doubles the pitch, e.g. lifting a ~110 Hz male voice to ~220 Hz).

            The model's [GitHub repository](https://github.com/innnky/so-vits-svc)

            If you want to build and train a model yourself, see this [GitHub repository](https://github.com/IceKyrin/sovits_guide)

            """)
            sid = gr.Dropdown(label="Voice", choices=['猫雷'], value="猫雷")
            vc_input3 = gr.Audio(label="Upload audio (shorter than 45 seconds)")
            vc_transform = gr.Number(label="Transpose (integer semitones, positive or negative; +12 is one octave up)", value=0)
            random1 = gr.Number(label="Randomness (noise scale; seems to affect audio quality, the default is recommended)", value=0.4)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
        vc_submit.click(vc_fn, [sid, random1, vc_input3, vc_transform], [vc_output1, vc_output2])

    app.launch()