ArkanDash committed on
Commit
50600ce
·
1 Parent(s): edf08a9

feat(app): minor update

Browse files
.gitignore CHANGED
@@ -10,6 +10,7 @@
10
  *.userosscache
11
  *.sln.docstates
12
  load.py
 
13
 
14
  # User-specific files (MonoDevelop/Xamarin Studio)
15
  *.userprefs
 
10
  *.userosscache
11
  *.sln.docstates
12
  load.py
13
+ button.py
14
 
15
  # User-specific files (MonoDevelop/Xamarin Studio)
16
  *.userprefs
app-full.py CHANGED
@@ -27,6 +27,8 @@ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingfac
27
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
28
  def vc_fn(
29
  input_audio,
 
 
30
  f0_up_key,
31
  f0_method,
32
  index_rate,
@@ -43,20 +45,18 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
43
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
44
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
45
  else:
46
- if config.files:
47
- audio, sr = librosa.load(input_audio, sr=16000, mono=True)
48
- else:
49
  if input_audio is None:
50
  return "You need to upload an audio", None
51
- sampling_rate, audio = input_audio
52
  duration = audio.shape[0] / sampling_rate
53
- if duration > 20 and limitation:
54
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
55
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
56
  if len(audio.shape) > 1:
57
  audio = librosa.to_mono(audio.transpose(1, 0))
58
  if sampling_rate != 16000:
59
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
 
 
60
  times = [0, 0, 0]
61
  f0_up_key = int(f0_up_key)
62
  audio_opt = vc.pipeline(
@@ -84,31 +84,31 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
84
 
85
  def cut_vocal_and_inst(yt_url):
86
  if yt_url != "":
87
- if not os.path.exists("/content/youtube_audio"):
88
- os.mkdir("/content/youtube_audio")
89
  ydl_opts = {
90
  'format': 'bestaudio/best',
91
  'postprocessors': [{
92
  'key': 'FFmpegExtractAudio',
93
  'preferredcodec': 'wav',
94
  }],
95
- "outtmpl": '/content/youtube_audio/audio',
96
  }
97
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
98
  ydl.download([yt_url])
99
- yt_audio_path = "/content/youtube_audio/audio.wav"
100
  command = f"demucs --two-stems=vocals {yt_audio_path}"
101
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
102
  print(result.stdout.decode())
103
- return ("/content/rvc-models-new/separated/htdemucs/audio/vocals.wav", "/content/rvc-models-new/separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "/content/rvc-models-new/separated/htdemucs/audio/vocals.wav")
104
 
105
  def combine_vocal_and_inst(audio_data, audio_volume):
106
  print(audio_data)
107
- if not os.path.exists("/content/result"):
108
- os.mkdir("/content/result")
109
- vocal_path = "/content/result/output.wav"
110
- inst_path = "/content/rvc-models-new/separated/htdemucs/audio/no_vocals.wav"
111
- output_path = "/content/result/combine.mp3"
112
  with wave.open(vocal_path, "w") as wave_file:
113
  wave_file.setnchannels(1)
114
  wave_file.setsampwidth(2)
@@ -132,19 +132,20 @@ def load_hubert():
132
  hubert_model = hubert_model.float()
133
  hubert_model.eval()
134
 
135
- def change_to_tts_mode(tts_mode):
136
  if tts_mode:
137
- return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
138
  else:
139
- return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
 
 
 
140
 
141
- '''
142
- def change_audio_to_path_mode(audio_mode):
143
- if audio_mode:
144
- return gr.Audio.update(visible=False), gr.Textbox.update(visible=True)
145
  else:
146
- return gr.Audio.update(visible=True), gr.Textbox.update(visible=False)
147
- '''
148
 
149
  if __name__ == '__main__':
150
  load_hubert()
@@ -215,7 +216,7 @@ if __name__ == '__main__':
215
  "# <center> RVC Models (Latest Update)\n"
216
  "## <center> The input audio should be clean and pure voice without background music.\n"
217
  "### <center> More feature will be added soon... \n"
218
- "##### <center> Please regenerate your model to latest RVC to fully applied this new rvc.\n"
219
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
220
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
221
  )
@@ -231,19 +232,16 @@ if __name__ == '__main__':
231
  '</div>'
232
  )
233
  with gr.Row():
234
- if config.files:
235
- with gr.Column():
236
- vc_youtube = gr.Textbox(label="Youtube URL")
237
- vc_convert = gr.Button("Convert", variant="primary")
238
- vc_vocal_preview = gr.Audio(label="Vocal Preview")
239
- vc_inst_preview = gr.Audio(label="Instrumental Preview")
240
- vc_audio_preview = gr.Audio(label="Audio Preview")
241
  with gr.Column():
242
- # vc_audio_mode = gr.Checkbox(label="Use audio path", value=False)
243
- if config.files:
244
- vc_input = gr.Textbox(label="Input audio path")
245
- else:
246
- vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
 
 
 
 
247
  vc_transpose = gr.Number(label="Transpose", value=0)
248
  vc_f0method = gr.Radio(
249
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
@@ -261,28 +259,27 @@ if __name__ == '__main__':
261
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
262
  tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
263
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
264
- vc_submit = gr.Button("Generate", variant="primary")
265
  vc_output1 = gr.Textbox(label="Output Message")
266
  vc_output2 = gr.Audio(label="Output Audio")
267
- if config.files:
268
- with gr.Column():
269
- vc_volume = gr.Slider(
270
- minimum=0,
271
- maximum=10,
272
- label="Vocal volume",
273
- value=5,
274
- interactive=True,
275
- step=1
276
- )
277
- vc_outputCombine = gr.Audio(label="Output Combined Audio")
278
- vc_combine = gr.Button("Combine",variant="primary")
279
- vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
280
- tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
281
- # vc_audio_input(change_audio_to_path_mode, [vc_audio_mode])
282
- if config.files:
283
- vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
284
- vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
285
- gr.Markdown('# <center>Changelog 2023.05.14')
286
- gr.Markdown('- Remove nilou-jp due to outdated model')
287
- gr.Markdown('- Added ganyu-jp from latest rvc train')
288
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
 
27
  def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
28
  def vc_fn(
29
  input_audio,
30
+ upload_audio,
31
+ upload_mode,
32
  f0_up_key,
33
  f0_method,
34
  index_rate,
 
45
  asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
46
  audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
47
  else:
48
+ if upload_mode:
 
 
49
  if input_audio is None:
50
  return "You need to upload an audio", None
51
+ sampling_rate, audio = upload_audio
52
  duration = audio.shape[0] / sampling_rate
 
 
53
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
54
  if len(audio.shape) > 1:
55
  audio = librosa.to_mono(audio.transpose(1, 0))
56
  if sampling_rate != 16000:
57
  audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
58
+ else:
59
+ audio, sr = librosa.load(input_audio, sr=16000, mono=True)
60
  times = [0, 0, 0]
61
  f0_up_key = int(f0_up_key)
62
  audio_opt = vc.pipeline(
 
84
 
85
  def cut_vocal_and_inst(yt_url):
86
  if yt_url != "":
87
+ if not os.path.exists("youtube_audio"):
88
+ os.mkdir("youtube_audio")
89
  ydl_opts = {
90
  'format': 'bestaudio/best',
91
  'postprocessors': [{
92
  'key': 'FFmpegExtractAudio',
93
  'preferredcodec': 'wav',
94
  }],
95
+ "outtmpl": 'youtube_audio/audio',
96
  }
97
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
98
  ydl.download([yt_url])
99
+ yt_audio_path = "youtube_audio/audio.wav"
100
  command = f"demucs --two-stems=vocals {yt_audio_path}"
101
  result = subprocess.run(command.split(), stdout=subprocess.PIPE)
102
  print(result.stdout.decode())
103
+ return ("separated/htdemucs/audio/vocals.wav", "separated/htdemucs/audio/no_vocals.wav", yt_audio_path, "separated/htdemucs/audio/vocals.wav")
104
 
105
  def combine_vocal_and_inst(audio_data, audio_volume):
106
  print(audio_data)
107
+ if not os.path.exists("result"):
108
+ os.mkdir("result")
109
+ vocal_path = "result/output.wav"
110
+ inst_path = "separated/htdemucs/audio/no_vocals.wav"
111
+ output_path = "result/combine.mp3"
112
  with wave.open(vocal_path, "w") as wave_file:
113
  wave_file.setnchannels(1)
114
  wave_file.setsampwidth(2)
 
132
  hubert_model = hubert_model.float()
133
  hubert_model.eval()
134
 
135
def change_to_tts_mode(tts_mode, upload_mode):
    """Build the visibility updates fired when the TTS checkbox changes.

    The five updates are returned in this order: audio-path textbox,
    upload-audio widget, upload-mode checkbox, TTS text box and TTS voice
    dropdown.

    Args:
        tts_mode: Current value of the TTS checkbox.
        upload_mode: Current value of the upload-mode checkbox.

    Returns:
        A 5-tuple of Gradio update payloads in the order listed above.
    """
    tts_on = bool(tts_mode)
    # Outside TTS mode exactly one of the two audio inputs is shown, chosen
    # by the upload-mode flag; in TTS mode both are hidden.
    show_upload = (not tts_on) and bool(upload_mode)
    show_path = (not tts_on) and not upload_mode
    return (
        gr.Textbox.update(visible=show_path),
        gr.Audio.update(visible=show_upload),
        gr.Checkbox.update(visible=not tts_on),
        gr.Textbox.update(visible=tts_on),
        gr.Dropdown.update(visible=tts_on),
    )
143
 
144
def change_to_upload_mode(upload_mode):
    """Toggle between the audio-path textbox and the upload-audio widget.

    Args:
        upload_mode: Current value of the upload-mode checkbox.

    Returns:
        A 2-tuple of Gradio update payloads: first for the audio-path
        textbox, then for the upload-audio widget. Exactly one of the two
        is made visible.
    """
    show_upload = bool(upload_mode)
    # Call ``update`` on the component classes rather than on freshly built
    # instances: no throwaway Textbox/Audio component is constructed on each
    # toggle, and the call style matches change_to_tts_mode.
    return (
        gr.Textbox.update(visible=not show_upload),
        gr.Audio.update(visible=show_upload),
    )
 
149
 
150
  if __name__ == '__main__':
151
  load_hubert()
 
216
  "# <center> RVC Models (Latest Update)\n"
217
  "## <center> The input audio should be clean and pure voice without background music.\n"
218
  "### <center> More feature will be added soon... \n"
219
+ "#### <center> Please regenerate your model to latest RVC to fully applied this new rvc.\n"
220
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
221
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
222
  )
 
232
  '</div>'
233
  )
234
  with gr.Row():
 
 
 
 
 
 
 
235
  with gr.Column():
236
+ vc_youtube = gr.Textbox(label="Youtube URL")
237
+ vc_convert = gr.Button("Convert", variant="primary")
238
+ vc_vocal_preview = gr.Audio(label="Vocal Preview")
239
+ vc_inst_preview = gr.Audio(label="Instrumental Preview")
240
+ vc_audio_preview = gr.Audio(label="Audio Preview")
241
+ with gr.Column():
242
+ vc_input = gr.Textbox(label="Input audio path")
243
+ vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
244
+ upload_mode = gr.Checkbox(label="Upload mode", value=False)
245
  vc_transpose = gr.Number(label="Transpose", value=0)
246
  vc_f0method = gr.Radio(
247
  label="Pitch extraction algorithm, PM is fast but Harvest is better for low frequencies",
 
259
  tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
260
  tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
261
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
 
262
  vc_output1 = gr.Textbox(label="Output Message")
263
  vc_output2 = gr.Audio(label="Output Audio")
264
+ vc_submit = gr.Button("Generate", variant="primary")
265
+ with gr.Column():
266
+ vc_volume = gr.Slider(
267
+ minimum=0,
268
+ maximum=10,
269
+ label="Vocal volume",
270
+ value=3,
271
+ interactive=True,
272
+ step=1
273
+ )
274
+ vc_outputCombine = gr.Audio(label="Output Combined Audio")
275
+ vc_combine = gr.Button("Combine",variant="primary")
276
+ vc_submit.click(vc_fn, [vc_input, vc_upload, upload_mode, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
277
+ vc_convert.click(cut_vocal_and_inst, vc_youtube, [vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input])
278
+ vc_combine.click(combine_vocal_and_inst, [vc_output2, vc_volume], vc_outputCombine)
279
+ tts_mode.change(change_to_tts_mode, [tts_mode, upload_mode], [vc_input, vc_upload, upload_mode, tts_text, tts_voice])
280
+ upload_mode.change(change_to_upload_mode, [upload_mode], [vc_input, vc_upload])
281
+ gr.Markdown('# <center>Changelog 2023.05.15')
282
+ gr.Markdown('- Added support for direct upload to gradio')
283
+ gr.Markdown('- Added ayato-jp')
284
+ gr.Markdown('- Minor fix and adjustment')
285
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
app.py CHANGED
@@ -166,7 +166,7 @@ if __name__ == '__main__':
166
  "# <center> RVC Models (Latest Update)\n"
167
  "## <center> The input audio should be clean and pure voice without background music.\n"
168
  "### <center> [Recommended to use google colab for more features](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing) \n"
169
- "##### <center> Please regenerate your model to latest RVC to fully applied this new rvc.\n"
170
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
171
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
172
  )
@@ -207,7 +207,8 @@ if __name__ == '__main__':
207
  vc_output2 = gr.Audio(label="Output Audio")
208
  vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
209
  tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
210
- gr.Markdown('# <center>Changelog 2023.05.14')
211
- gr.Markdown('- Disable nilou-jp due to an outdated model')
212
- gr.Markdown('- Added ganyu-jp from latest rvc train')
 
213
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
 
166
  "# <center> RVC Models (Latest Update)\n"
167
  "## <center> The input audio should be clean and pure voice without background music.\n"
168
  "### <center> [Recommended to use google colab for more features](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing) \n"
169
+ "#### <center> Please regenerate your model to latest RVC to fully applied this new rvc.\n"
170
  "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n\n"
171
  "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)"
172
  )
 
207
  vc_output2 = gr.Audio(label="Output Audio")
208
  vc_submit.click(vc_fn, [vc_input, vc_transpose, vc_f0method, vc_index_ratio, tts_mode, tts_text, tts_voice], [vc_output1, vc_output2])
209
  tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice])
210
+ gr.Markdown('# <center>Changelog 2023.05.15')
211
+ gr.Markdown('- Added support for direct upload to gradio')
212
+ gr.Markdown('- Added ayato-jp')
213
+ gr.Markdown('- Minor fix and adjustment')
214
  app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
config.py CHANGED
@@ -17,7 +17,6 @@ class Config:
17
  self.noparallel,
18
  self.noautoopen,
19
  self.api,
20
- self.files,
21
  self.json
22
  ) = self.arg_parse()
23
  self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
@@ -39,7 +38,6 @@ class Config:
39
  help="Do not open in browser automatically",
40
  )
41
  parser.add_argument('--api', action="store_true", default=False)
42
- parser.add_argument("--files", action="store_true", default=False, help="load audio from path")
43
  parser.add_argument("--json", action="store_true", default=False, help="use model_info.json")
44
  cmd_opts = parser.parse_args()
45
 
@@ -52,7 +50,6 @@ class Config:
52
  cmd_opts.noparallel,
53
  cmd_opts.noautoopen,
54
  cmd_opts.api,
55
- cmd_opts.files,
56
  cmd_opts.json
57
  )
58
 
 
17
  self.noparallel,
18
  self.noautoopen,
19
  self.api,
 
20
  self.json
21
  ) = self.arg_parse()
22
  self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
 
38
  help="Do not open in browser automatically",
39
  )
40
  parser.add_argument('--api', action="store_true", default=False)
 
41
  parser.add_argument("--json", action="store_true", default=False, help="use model_info.json")
42
  cmd_opts = parser.parse_args()
43
 
 
50
  cmd_opts.noparallel,
51
  cmd_opts.noautoopen,
52
  cmd_opts.api,
 
53
  cmd_opts.json
54
  )
55
 
weights/ayato-jp/added_IVF1304_Flat_nprobe_1.index ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fd600f40f3d8ef0c490eb66d73f25f89056d982edd4f467bda6b699d13e1fca
3
+ size 53868475
weights/ayato-jp/ayato-jp.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6139fd4482942ab0075f08d5960d8b9b576945e6204ef2965dd3bb96ccccca4
3
+ size 55027130
weights/ayato-jp/cover.png ADDED
weights/model_info.json CHANGED
@@ -14,5 +14,13 @@
14
  "cover": "cover.png",
15
  "feature_retrieval_library": "added_IVF1636_Flat_nprobe_1.index",
16
  "author":"ArkanDash"
 
 
 
 
 
 
 
 
17
  }
18
  }
 
14
  "cover": "cover.png",
15
  "feature_retrieval_library": "added_IVF1636_Flat_nprobe_1.index",
16
  "author":"ArkanDash"
17
+ },
18
+ "ayato-jp": {
19
+ "enable": true,
20
+ "name": "ayato-jp",
21
+ "title": "Genshin Impact - Kamisato Ayato",
22
+ "cover": "cover.png",
23
+ "feature_retrieval_library": "added_IVF1304_Flat_nprobe_1.index",
24
+ "author":"ArkanDash"
25
  }
26
  }