Delete app-full.py

#11
opened by KimaruRai
Files changed (1)
  1. app-full.py +0 -254
app-full.py DELETED
@@ -1,254 +0,0 @@
-import os
-import json
-import argparse
-import traceback
-import logging
-import gradio as gr
-import numpy as np
-import librosa
-import torch
-import asyncio
-import edge_tts
-import yt_dlp
-import ffmpeg
-import subprocess
-import sys
-import io
-import wave
-from datetime import datetime
-from fairseq import checkpoint_utils
-from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
-from vc_infer_pipeline import VC
-from config import (
-    is_half,
-    device
-)
-logging.getLogger("numba").setLevel(logging.WARNING)
-limitation = os.getenv("SYSTEM") == "spaces"  # limit audio length in huggingface spaces
-
-def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index, file_big_npy):
-    def vc_fn(
-        input_audio,
-        upload_audio,
-        upload_mode,
-        f0_up_key,
-        f0_method,
-        index_rate,
-        tts_mode,
-        tts_text,
-        tts_voice
-    ):
-        try:
-            if tts_mode:
-                if len(tts_text) > 100 and limitation:
-                    return "Text is too long", None
-                if tts_text is None or tts_voice is None:
-                    return "You need to enter text and select a voice", None
-                asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
-                audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
-            else:
-                if upload_mode:
-                    if input_audio is None:
-                        return "You need to upload an audio", None
-                    sampling_rate, audio = upload_audio
-                    duration = audio.shape[0] / sampling_rate
-                    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-                    if len(audio.shape) > 1:
-                        audio = librosa.to_mono(audio.transpose(1, 0))
-                    if sampling_rate != 16000:
-                        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-                else:
-                    audio, sr = librosa.load(input_audio, sr=16000, mono=True)
-            times = [0, 0, 0]
-            f0_up_key = int(f0_up_key)
-            audio_opt = vc.pipeline(
-                hubert_model,
-                net_g,
-                0,
-                audio,
-                times,
-                f0_up_key,
-                f0_method,
-                file_index,
-                file_big_npy,
-                index_rate,
-                if_f0,
-            )
-            print(
-                f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-            )
-            return "Success", (tgt_sr, audio_opt)
-        except:
-            info = traceback.format_exc()
-            print(info)
-            return info, (None, None)
-    return vc_fn
-
-def cut_vocal_and_inst(yt_url):
-    if yt_url != "":
-        if not os.path.exists("youtube_audio"):
-            os.mkdir("youtube_audio")
-        ydl_opts = {
-            'format': 'bestaudio/best',
-            'postprocessors': [{
-                'key': 'FFmpegExtractAudio',
-                'preferredcodec': 'wav',
-            }],
-            "outtmpl": 'youtube_audio/audio',
-        }
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([yt_url])
-        yt_audio_path = "youtube_audio/audio.wav"
-        command = f"demucs --two-stems=vocals {yt_audio_path}"
-        result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-        print(result.stdout.decode())
-        return ("separated/htdemucs/audio/vocals.wav", "separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "separated/htdemucs/audio/vocals.wav")
-
-def combine_vocal_and_inst(audio_data, audio_volume):
-    print(audio_data)
-    if not os.path.exists("result"):
-        os.mkdir("result")
-    vocal_path = "result/output.wav"
-    inst_path = "separated/htdemucs/audio/no_vocals.wav"
-    output_path = "result/combine.mp3"
-    with wave.open(vocal_path, "w") as wave_file:
-        wave_file.setnchannels(1)
-        wave_file.setsampwidth(2)
-        wave_file.setframerate(audio_data[0])
-        wave_file.writeframes(audio_data[1].tobytes())
-    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
-    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-    return output_path
-
-def load_hubert():
-    global hubert_model
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(device)
-    if is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
-
-def change_to_tts_mode(tts_mode, upload_mode):
-    if tts_mode:
-        return gr.Textbox.update(visible=False), gr.Audio.update(visible=False), gr.Checkbox.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
-    else:
-        if upload_mode:
-            return gr.Textbox.update(visible=False), gr.Audio.update(visible=True), gr.Checkbox.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
-        else:
-            return gr.Textbox.update(visible=True), gr.Audio.update(visible=False), gr.Checkbox.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
-
-def change_to_upload_mode(upload_mode):
-    if upload_mode:
-        return gr.Textbox().update(visible=False), gr.Audio().update(visible=True)
-    else:
-        return gr.Textbox().update(visible=True), gr.Audio().update(visible=False)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--api', action="store_true", default=False)
-    parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
-    args, unknown = parser.parse_known_args()
-    load_hubert()
-    models = []
-    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
-    voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
-    with open("weights/model_info.json", "r", encoding="utf-8") as f:
-        models_info = json.load(f)
-    for name, info in models_info.items():
-        if not info['enable']:
-            continue
-        title = info['title']
-        author = info.get("author", None)
-        cover = f"weights/{name}/{info['cover']}"
-        index = f"weights/{name}/{info['feature_retrieval_library']}"
-        npy = f"weights/{name}/{info['feature_file']}"
-        cpt = torch.load(f"weights/{name}/{name}.pth", map_location="cpu")
-        tgt_sr = cpt["config"][-1]
-        cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-        if_f0 = cpt.get("f0", 1)
-        if if_f0 == 1:
-            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
-        else:
-            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
-        del net_g.enc_q
-        print(net_g.load_state_dict(cpt["weight"], strict=False))  # without this line the weights are not cleaned up properly, oddly enough
-        net_g.eval().to(device)
-        if is_half:
-            net_g = net_g.half()
-        else:
-            net_g = net_g.float()
-        vc = VC(tgt_sr, device, is_half)
-        models.append((name, title, author, cover, create_vc_fn(tgt_sr, net_g, vc, if_f0, index, npy)))
-    with gr.Blocks() as app:
-        gr.Markdown(
-            "# <center> RVC Models\n"
-            "## <center> The input audio should be clean and pure voice without background music.\n"
-            "### <center> More features will be added soon... \n"
-            "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1hx6kKvIuv5XNY1Gai2PEuZhpO5z6xpVh?usp=sharing)\n\n"
-            "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
-        )
-        with gr.Tabs():
-            for (name, title, author, cover, vc_fn) in models:
-                with gr.TabItem(name):
-                    with gr.Row():
-                        gr.Markdown(
-                            '<div align="center">'
-                            f'<div>{title}</div>\n' +
-                            (f'<div>Model author: {author}</div>' if author else "") +
-                            (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "") +
-                            '</div>'
-                        )
-                    with gr.Row():
-                        with gr.Column():
-                            vc_youtube = gr.Textbox(label="Youtube URL")
-                            vc_convert = gr.Button("Convert", variant="primary")
-                            vc_vocal_preview = gr.Audio(label="Vocal Preview")
-                            vc_inst_preview = gr.Audio(label="Instrumental Preview")
-                            vc_audio_preview = gr.Audio(label="Audio Preview")
-                        with gr.Column():
-                            vc_input = gr.Textbox(label="Input audio path")
-                            vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
-                            upload_mode = gr.Checkbox(label="Upload mode", value=False)
-                            vc_transpose = gr.Number(label="Transpose", value=0)
-                            vc_f0method = gr.Radio(
-                                label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
-                                choices=["pm", "harvest"],
-                                value="pm",
-                                interactive=True,
-                            )
-                            vc_index_ratio = gr.Slider(
-                                minimum=0,
-                                maximum=1,
-                                label="Retrieval feature ratio",
-                                value=0.6,
-                                interactive=True,
-                            )
-                            tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
-                            tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
-                            tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
-                            vc_output1 = gr.Textbox(label="Output Message")
-                            vc_output2 = gr.Audio(label="Output Audio")
-                            vc_submit = gr.Button("Generate", variant="primary")
-                        with gr.Column():
-                            vc_volume = gr.Slider(
-                                minimum=0,
-                                maximum=10,
-                                label="Vocal volume",
-                                value=4,
-                                interactive=True,
-                                step=1
-                            )
-                            vc_outputCombine = gr.Audio(label="Output Combined Audio")
-                            vc_combine = gr.Button("Combine", variant="primary")
-                    vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
-                    vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
-                    vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
-                    tts_mode.change(change_to_tts_mode, [tts_mode, upload_mode], [vc_input, vc_upload, upload_mode, tts_text, tts_voice])
-                    upload_mode.change(change_to_upload_mode, [upload_mode], [vc_input, vc_upload])
-    app.queue(concurrency_count=1, max_size=20, api_open=args.api).launch(share=args.colab)