import os

# One-time setup (commented out): fetch the HuBERT content-encoder checkpoint.
# os.system("wget -P hubert/ https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so/resolve/main/hubert/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import time
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
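
# Load the so-vits-svc model: generator checkpoint, config file, and an optional
# k-means cluster model (used only when a cluster mix ratio > 0 is requested).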
model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config-65.json", cluster_model_path="logs/44k/kmeans_10000.pt")
#model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config.json")

from matplotlib import pyplot as plt

def f0_to_pitch(ff):
    # Hz -> MIDI-style pitch. The standard reference is 440 Hz = MIDI 69; the
    # 160 Hz reference used here cancels out when two pitches are subtracted.
    f0_pitch = 69 + 12 * np.log2(ff / 160)
    return f0_pitch
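
# Example of the semitone math above: comparing 200 Hz with 100 Hz gives
# 12 * log2(200 / 100) = 12 semitones, i.e. exactly one octave.
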
def compute_f0(wav_file1, wav_file2, tran):
    y1, sr1 = librosa.load(wav_file1, sr=16000)
    y2, sr2 = librosa.load(wav_file2, sr=16000)

    # Compute the f0 of both files with the YIN pitch estimator. Passing sr
    # matters: YIN converts lag to Hz as sr / period, so the default 22050
    # would mis-scale frequencies for this 16 kHz audio.
    f0_1 = librosa.core.yin(y1, fmin=70, fmax=600, sr=sr1)
    f0_2 = librosa.core.yin(y2, fmin=70, fmax=600, sr=sr2)
    # Semitone deviation
    sum_y = []
    if np.sum(y1 == 0) / len(y1) > 0.9:
        # Mostly-silent input: skip the comparison.
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(f0_1), len(f0_2))):
            if f0_1[i] > 0 and f0_2[i] > 0:
                sum_y.append(
                    abs(f0_to_pitch(f0_2[i]) - (f0_to_pitch(f0_1[i]) + tran)))
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(sum(sum_y) / len_y), 2)
        var_take = round(float(np.std(sum_y, ddof=1)), 2)
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)


def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, F0_mean_pooling):
    # Full conversion pipeline: validate the upload, normalize and resample it,
    # run so-vits-svc inference, then measure and plot the pitch deviation.
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio", None, None
    sampling_rate, audio = input_audio
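    # Measure the clip length; the hosted demo only accepts short uploads.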
    duration = audio.shape[0] / sampling_rate
    if duration > 70:
        return "Please upload audio shorter than 70 s; convert longer audio locally", None, None
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    # Write the preprocessed audio to a temp file for the slicer/inference.
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 16000, format="wav")

    print(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, F0_mean_pooling)
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,F0_mean_pooling=F0_mean_pooling)
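    # slice_inference returns the converted waveform at the model's 44.1 kHz output rate.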

    soundfile.write("output.wav", _audio, 44100, format="wav")
    f01, f02, sr1, sr2, mistake, var = compute_f0('temp.wav', 'output.wav', vc_transform)

    # Plot the input and output f0 curves for visual comparison.
    time_step_1 = np.arange(0, len(f01))
    time_step_2 = np.arange(0, len(f02))
    plt.figure(figsize=[8, 3])
    plt.plot(time_step_1, f01, label='Input')
    plt.plot(time_step_2, f02, label='Output')

    # Derive x-tick labels from the clip duration (guard the step so clips
    # under one second don't crash np.arange with step=0).
    step = max(1, int(duration))
    length = np.arange(0, int(duration * 10), step)
    plt.xticks(np.linspace(0, len(f01), len(length)), length)
    plt.legend()
    plt.savefig('temp.svg')
    plt.close()
        
    used_time = round(time.time() - start_time, 2)
    out_str = ("Success! Total time: {}s\nMean semitone deviation: {}\nSemitone deviation std: {}".format(
        used_time, mistake, var))
    return out_str, (44100, _audio), gr.Image.update(value="temp.svg")


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
                    # 前言
                    * 此模型为sovits4.0原版(抗混响强),如果音色不像可以试试另一个模型:[https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev](https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev)
                    * 23-05-29修复池化功能,有bug记得反馈下。模型更新日期23-04-26.新模型使用65小时语音训练63位成员。仅供个人娱乐和非商业用途,禁止用于血腥、暴力、性相关、政治相关内容,转换长音频请本地进行转换
                    * 扒干声教程:[BV1sb411o7xF](https://www.bilibili.com/video/BV1sb411o7xF) [cv23095265](https://www.bilibili.com/read/cv23095265) b站传播的Ai翻唱大多数是他人翻唱或原曲混响和声少的,不是所有歌都能扒干净的,如果声音不像都是因为混响与和声扒不干净,结合自己的时间学会放弃。更多相关教程,翻唱,本地整合包在Tg群:[t.me/+vP8NK1NMLiYzMDJl](https://t.me/+vP8NK1NMLiYzMDJl)
                    * [Ripx,Au,UVR工具下载](https://pan.baidu.com/s/1Ne55iKqoacjKE-moK_YtGg?pwd=qsfd)  总有问制作流程,这说一下。。以冬之花为例,1.用UVR-4_HP-Vocal模型提取人声 或 vocalremover.org(这个网站处理不会损伤人声,方便二次处理,推荐),UVR-5_HP-Karaoke去除和声,2.合成,对比干声听听有几处哑音 如果有,使用RipX去除干声里造成哑音的和声 4.合成再听听,再不行就使用池化 5.使用Au调音,按喜好,添加混响,和声,回声等,这步可以增强音色,效果是很明显的。通过冬之花的练习,你已经具备处理干声的能力,轻松一天量产10首。
                    
                    # 声明
                    * 如用此模型制作音频请标注来源:github.com/3110asuka/Nogizaka46-so 或 huggingface.co/spaces/Nogizaka46/Nogizaka46-so""")
            gr.Markdown(value="""秋元真夏 AKIMOTO_MANATSU| 生田絵梨花 IKUTA_ERIKA| 生駒里奈 IKOMA_RINA| 伊藤純奈 ITO_JUNNA| 井上小百合 INOUE_SAYURI| 衛藤美彩 ETO_MISA| 川後陽菜 KAWAGO_HINA|北野日奈子 KITANO_HINAKO|齋藤飛鳥 SAITO_ASUKA|斉藤優里 SATO_YUURI|相楽伊織 SAGARA_IORI|桜井玲香 SAKURAI_REIKA|佐々木琴子 SASAKI_KOTOKO|白石麻衣 SHIRAISHI_MAI|新内眞衣 SHINUCHI_MAI|鈴木絢音 SUZUKI_AYANE|高山一実 TAKAYAMA_KAZUMI|寺田蘭世 TERADA_RANZE|西野七瀬 NISHINO_NANASE|能條愛未 NOUJO_AMI|樋口日奈 HIGUCHI_HINA|星野みなみ HOSHINO_MINAMI|堀未央奈 HORI_MIONA|松村沙友理 MATSUMURA_SAYURI|山崎怜奈 YAMAZAKI_RENA|若月佑美 WAKATSUKI_YUMI|渡辺みり愛 WATANABE_MIRIA|和田まあや WADA_MAAYA|伊藤理々杏 ITO_RIRIA|岩本蓮加 IWAMOTO_RENKA|梅澤美波 UMEZAWA_MINAMI|大園桃子 OZONO_MOMOKO|久保史緒里 KUBO_SHIORI|阪口珠美 SAKAGUCHI_TAMAMI|佐藤楓 SATO_KAEDE|中村麗乃 NAKAMURA_RENO|向井葉月 MUKAI_HAZUKI|山下美月 YAMASHITA_MIZUKI|与田祐希 YODA_YUUKI|遠藤さくら ENDO_SAKURA|賀喜遥香 KAKI_HARUKA|掛橋沙耶香 KAKEHASHI_SAYAKA|金川紗耶 KANAGAWA_SAYA|北川悠理 KITAGAWA_YURI|柴田柚菜 SHIBATA_YUNA|田村真佑 TAMURA_MAYU|筒井あやめ TSUTSUI_AYAME|早川聖来 HAYAKAWA_SEIRA|矢久保美緒 YAKUBO_MIO|黒見明香 HARUKA_KUROMI|佐藤璃果 RIKA_SATO|林瑠奈 RUNA_HAYASHI|松尾美佑 MIYU_MATSUO|弓木奈於 NAO_YUMIKI|五百城茉央 IOKI_MAO|池田瑛紗 IKEDA_TERESA|一ノ瀬美空 ICHINOSE_MIKU|井上和 INOUE_NAGI|小川彩 OGAWA_AYA|奥田いろは OKUDA_IROHA|川﨑桜 KAWASAKI_SAKURA|菅原咲月 SUGAWARA_SATSUKI|冨里奈央 TOMISATO_NAO|中西アルノ NAKANISHI_ARUNO""")
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="Voice", choices=spks, value="IKUTA_ERIKA")
            vc_input3 = gr.Audio(label="Upload audio: a dry vocal under 70 s, no BGM, no reverb", value="没什么「你的名字。」干声素材12s.mp3")
            vc_transform = gr.Slider(label="Transpose (integer, positive or negative, in semitones; +12 is one octave up). Usually within +/-6",
                                     maximum=16, minimum=-16, step=1, value=0)
            cluster_ratio = gr.Number(label="Cluster model mix ratio, 0-1. Default 0 disables clustering. Raises timbre similarity but hurts articulation (about 0.5 suggested if used)", value=0)
            auto_f0 = gr.Checkbox(label="Automatic f0 prediction; works better together with the cluster model, and disables transposing (speech conversion only: do not check this for singing or the pitch will drift wildly)", value=False)
            slice_db = gr.Slider(label="Slicing threshold (-30 for noisy audio, -50 to keep breaths, default -40)",
                                 maximum=-30, minimum=-70, step=1, value=-40)
            noise_scale = gr.Number(label="noise_scale: best left alone; affects audio quality (a black-box parameter)", value=0.4)
            F0_mean_pooling = gr.Checkbox(label="Apply a mean filter (pooling) to F0; helps with some muted notes (not those caused by harmonies/reverb). Note: slows down inference. Off by default", value=False)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Mean pitch deviation in semitones, showing how off-pitch the converted audio is (usually below 0.5)")
            vc_output2 = gr.Audio(label="Output Audio")
            f0_image = gr.Image(label="f0 curves: blue is the input pitch, orange is the synthesized pitch (the measurement has some error)")
        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, F0_mean_pooling],
                        [vc_output1, vc_output2, f0_image])

app.launch()
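
# A minimal local-conversion sketch for audio longer than the demo's 70 s limit.
# It reuses the same Svc API as above; the input path and parameter values here
# are illustrative assumptions, not part of the hosted app.
#
#   from inference.infer_tool import Svc
#   import soundfile
#
#   model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config-65.json",
#               cluster_model_path="logs/44k/kmeans_10000.pt")
#   audio = model.slice_inference("input.wav", "IKUTA_ERIKA", 0, -40, 0, False, 0.4,
#                                 F0_mean_pooling=False)
#   soundfile.write("output.wav", audio, 44100)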