ajayarora1235 committed
Commit d602592 · 1 Parent(s): 005bb33

get rid of voicebox tab to move python versions

Files changed (3)
  1. README.md +0 -1
  2. app.py +646 -646
  3. requirements.txt +0 -8
README.md CHANGED
@@ -5,7 +5,6 @@ colorFrom: pink
  colorTo: pink
  sdk: gradio
  sdk_version: 3.42.0
- python_version: 3.9.16
  app_file: app.py
  pinned: true
  ---
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
2
 
3
  import pandas as pd
4
- import torchaudio
5
- from lib.voicecraft.data.tokenizer import (
6
- AudioTokenizer,
7
- TextTokenizer,
8
- )
9
- import whisperx
10
  import os
11
  import time
12
  import gc
@@ -1472,252 +1472,252 @@ def stoptraining(mim):
1472
 
1473
 
1474
 
1475
- def transcribe_btn_click(audio_choice):
1476
- global transcript_fn
1477
- global audio_fn
1478
 
1479
- temp_folder = "./demo/temp"
1480
- orig_audio = audio_choice
1481
- filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1482
- audio_fn = f"{temp_folder}/{filename}.wav"
1483
- transcript_fn = f"{temp_folder}/{filename}.txt"
1484
- if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
1485
- print("Audio and transcript already exist, skipping transcript")
1486
- transcript = open(transcript_fn, "r").read()
1487
- return transcript
1488
 
1489
- batch_size = 1 # Adjust based on your GPU memory availability
1490
- compute_type = "float16"
1491
- device = "cuda" if torch.cuda.is_available() else "cpu"
1492
 
1493
- model = whisperx.load_model("large-v2", device, compute_type=compute_type)
1494
- pre_result = model.transcribe(audio_choice, batch_size=batch_size)
1495
 
1496
- # Correctly handle the transcription result based on its structure
1497
- if 'segments' in pre_result:
1498
- result = " ".join([segment['text'] for segment in pre_result['segments']])
1499
- else:
1500
- result = pre_result.get('text', '')
1501
 
1502
- print("Transcribe text: " + result) # Directly print the result as it is now a string
1503
 
1504
- # remove model to save VRAM
1505
- gc.collect(); torch.cuda.empty_cache(); del model
1506
 
1507
- # point to the original file or record the file
1508
- # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
1509
- orig_audio = audio_choice
1510
- orig_transcript = result
1511
- # move the audio and transcript to temp folder
1512
- os.makedirs(temp_folder, exist_ok=True)
1513
- os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
1514
- filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1515
- with open(f"{temp_folder}/{filename}.txt", "w") as f:
1516
- f.write(orig_transcript)
1517
- # run MFA to get the alignment
1518
- align_temp = f"{temp_folder}/mfa_alignments"
1519
- os.makedirs(align_temp, exist_ok=True)
1520
 
1521
- audio_fn = f"{temp_folder}/{filename}.wav"
1522
- transcript_fn = f"{temp_folder}/{filename}.txt"
1523
 
1524
- return result
1525
 
1526
 
1527
- def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1528
- temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
1529
- global voicecraft_model, voicecraft_config, phn2num
1530
 
1531
- print("Transcribing the input audio")
1532
- transcribed_text = transcribe_btn_click(input_audio_fn)
1533
- print("Transcription complete")
1534
 
1535
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1536
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1537
- os.environ["USER"] = "USER"
1538
- # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1539
- cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1540
- target_transcript = transcribed_text + target_transcript
1541
- print(target_transcript)
1542
- info = torchaudio.info(audio_fn)
1543
- audio_dur = info.num_frames / info.sample_rate
1544
- print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
1545
-
1546
- print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
1547
- assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1548
- prompt_end_frame = int(cut_off_sec * info.sample_rate)
1549
-
1550
- # # load model, tokenizer, and other necessary files
1551
- # # original file loaded it each time. here we load it only once
1552
- # global model_loaded
1553
- # f model_loaded==False:
1554
-
1555
- if voicecraft_model is None:
1556
- load_voicecraft()
1557
-
1558
- encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1559
- text_tokenizer = TextTokenizer(backend="espeak")
1560
- audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1561
-
1562
-
1563
- # # run the model to get the output
1564
- decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1565
- 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1566
- "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1567
- from lib.voicecraft.inference_tts_scale import inference_one_sample
1568
- concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1569
- audio_fn, target_transcript, config.device, decode_config,
1570
- prompt_end_frame)
1571
-
1572
- # save segments for comparison
1573
- concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1574
- # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1575
-
1576
- output_dir = "./demo/generated_tts"
1577
- os.makedirs(output_dir, exist_ok=True)
1578
- seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1579
- seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1580
-
1581
-
1582
- torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1583
- torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1584
-
1585
- return [seg_save_fn_concat, seg_save_fn_gen]
1586
-
1587
- def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1588
- temperature, kvcache, target_transcript, silence_tokens,
1589
- sid,
1590
- f0_up_key,
1591
- f0_file,
1592
- f0_method,
1593
- file_index,
1594
- #file_index2,
1595
- # file_big_npy,
1596
- index_rate,
1597
- filter_radius,
1598
- resample_sr,
1599
- rms_mix_rate,
1600
- protect,
1601
- crepe_hop_length):
1602
- global voicecraft_model, voicecraft_config, phn2num
1603
-
1604
- print("Transcribing the input audio")
1605
- transcribed_text = transcribe_btn_click(input_audio_fn)
1606
- print("Transcription complete", transcribed_text)
1607
 
1608
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1609
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1610
- os.environ["USER"] = "USER"
1611
- # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1612
- # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1613
-
1614
- target_transcript = transcribed_text + ' ' + target_transcript
1615
- print(target_transcript)
1616
- info = torchaudio.info(audio_fn)
1617
- audio_dur = info.num_frames / info.sample_rate
1618
- cut_off_sec = audio_dur - 0.1
1619
-
1620
- assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1621
- prompt_end_frame = int(cut_off_sec * info.sample_rate)
1622
-
1623
- if voicecraft_model is None:
1624
- load_voicecraft()
1625
-
1626
- encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1627
- text_tokenizer = TextTokenizer(backend="espeak")
1628
- audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1629
-
1630
-
1631
- # # run the model to get the output
1632
- decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1633
- 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1634
- "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1635
- from lib.voicecraft.inference_tts_scale import inference_one_sample
1636
- concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1637
- audio_fn, target_transcript, config.device, decode_config,
1638
- prompt_end_frame)
1639
- print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
1640
-
1641
- # save segments for comparison
1642
- concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1643
- # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1644
-
1645
- output_dir = "./demo/generated_tts"
1646
- os.makedirs(output_dir, exist_ok=True)
1647
- seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1648
- seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1649
-
1650
-
1651
- torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1652
- torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1653
-
1654
-
1655
- global tgt_sr, net_g, vc, hubert_model, version
1656
-
1657
- f0_up_key = int(f0_up_key)
1658
- try:
1659
- # audio = gen_audio.squeeze()
1660
- audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
1661
- audio_max = np.abs(audio).max() / 0.95
1662
- if audio_max > 1:
1663
- audio /= audio_max
1664
- times = [0, 0, 0]
1665
- if hubert_model == None:
1666
- load_hubert()
1667
- if_f0 = cpt.get("f0", 1)
1668
- file_index = (
1669
- (
1670
- file_index.strip(" ")
1671
- .strip('"')
1672
- .strip("\n")
1673
- .strip('"')
1674
- .strip(" ")
1675
- .replace("trained", "added")
1676
- )
1677
- ) # 防止小白写错,自动帮他替换掉
1678
- # file_big_npy = (
1679
- # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
1680
- # )
1681
- print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
1682
- audio_opt = vc.pipeline(
1683
- hubert_model,
1684
- net_g,
1685
- sid,
1686
- audio,
1687
- seg_save_fn_gen,
1688
- times,
1689
- f0_up_key,
1690
- f0_method,
1691
- file_index,
1692
- # file_big_npy,
1693
- index_rate,
1694
- if_f0,
1695
- filter_radius,
1696
- tgt_sr,
1697
- resample_sr,
1698
- rms_mix_rate,
1699
- version,
1700
- protect,
1701
- crepe_hop_length,
1702
- f0_file=f0_file,
1703
- )
1704
- if resample_sr >= 16000 and tgt_sr != resample_sr:
1705
- tgt_sr = resample_sr
1706
- index_info = (
1707
- "Using index:%s." % file_index
1708
- if os.path.exists(file_index)
1709
- else "Index not used."
1710
- )
1711
- return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
1712
- index_info,
1713
- times[0],
1714
- times[1],
1715
- times[2],
1716
- ), seg_save_fn_gen, (tgt_sr, audio_opt)
1717
- except:
1718
- info = traceback.format_exc()
1719
- print(info)
1720
- return info, (None, None)
1721
 
1722
 
1723
 
@@ -2136,433 +2136,433 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
2136
  [vc_output3],
2137
  )
2138
  but1.click(fn=lambda: easy_uploader.clear())
2139
- with gr.TabItem("TTS"):
2140
- app.load(update_message)
2141
-
2142
- # Other RVC stuff
2143
- with gr.Row():
2144
- sid0 = gr.Dropdown(label="1. Choose your model", choices=sorted(names), value=check_for_name())
2145
- refresh_button = gr.Button("Refresh", variant="primary")
2146
- if check_for_name() != '':
2147
- get_vc(sorted(names)[0])
2148
- vc_transform0 = gr.Number(label="Key Shift: 0 for no key shifted output; 12 f for output an octave higher and -12 for output an octave lower.", value=0)
2149
- #clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
2150
- spk_item = gr.Slider(
2151
- minimum=0,
2152
- maximum=2333,
2153
- step=1,
2154
- label="speaker id",
2155
- value=0,
2156
- visible=False,
2157
- interactive=True,
2158
- )
2159
- #clean_button.click(fn=clean, inputs=[], outputs=[sid0])
2160
- sid0.change(
2161
- fn=get_vc,
2162
- inputs=[sid0],
2163
- outputs=[spk_item],
2164
- )
2165
- but0 = gr.Button("Convert", variant="primary")
2166
- with gr.Row():
2167
- with gr.Column():
2168
- # with gr.Row():
2169
- # dropbox = gr.File(label="Drag your audio file and click refresh.")
2170
- with gr.Row():
2171
- record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
2172
- with gr.Row():
2173
- input_audio0 = gr.Dropdown(
2174
- label="2.Choose the audio file.",
2175
- value="./audios/calm.wav",
2176
- choices=audio_files
2177
- )
2178
- audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
2179
- # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
2180
- # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
2181
- refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
2182
- # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2183
- # value="The dogs sat at the door.",
2184
- # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2185
- record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
2186
- record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
2187
- # update audio_display
2188
- input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
2189
 
2190
- with gr.Row():
2191
- # with gr.Column():
2192
- # input_audio = gr.Audio(label="Input Audio", type="filepath")
2193
- # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
2194
- # # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
2195
- # # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
2196
- # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2197
- # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2198
- # transcribe_info_text = gr.TextArea(label="How to use",
2199
- # value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
2200
- # transcribe_btn = gr.Button(value="transcribe and create mfa")
2201
 
2202
 
2203
- with gr.Column():
2204
- target_transcript = gr.Textbox(label="target transcript")
2205
 
2206
- # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
2207
- # outputs=[transcribed_text])
2208
 
2209
 
2210
 
2211
- with gr.Column():
2212
 
2213
- output_audio_gen = gr.Audio(
2214
- label="Output Audio generated",
2215
- type='filepath',
2216
- interactive=False
2217
- )
2218
 
2219
 
2220
- vc_output2 = gr.Audio(
2221
- label="Voice converted! (Click on the three dots to download the audio)",
2222
- type='filepath',
2223
- interactive=False,
2224
- )
2225
 
2226
- #with gr.Column():
2227
- with gr.Accordion("Advanced TTS Settings", open=False):
2228
- seed = gr.Number(label='seed', interactive=True, value=1)
2229
- stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
2230
- info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
2231
- sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
2232
- info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
2233
- left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
2234
- info=" not used for TTS, only for speech editing")
2235
- right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
2236
- info=" not used for TTS, only for speech editing")
2237
- codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
2238
- codec_sr = gr.Number(label='codec', interactive=True, value=50)
2239
- top_k = gr.Number(label='top_k', interactive=True, value=0)
2240
- top_p = gr.Number(label='top_p', interactive=True, value=0.8)
2241
- temperature = gr.Number(label='temperature', interactive=True, value=1)
2242
- kvcache = gr.Number(label='kvcache', interactive=True, value=1,
2243
- info='set to 0 to use less VRAM, results may be worse and slower inference')
2244
- silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
2245
- with gr.Accordion("Index Settings", open=False):
2246
- #with gr.Row():
2247
 
2248
- file_index1 = gr.Dropdown(
2249
- label="3. Choose the index file (in case it wasn't automatically found.)",
2250
- choices=get_indexes(),
2251
- value=get_index(),
2252
- interactive=True,
2253
- )
2254
- sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
2255
- refresh_button.click(
2256
- fn=change_choices, inputs=[], outputs=[sid0, file_index1]
2257
- )
2258
- # file_big_npy1 = gr.Textbox(
2259
- # label=i18n("特征文件路径"),
2260
- # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2261
- # interactive=True,
2262
- # )
2263
- index_rate1 = gr.Slider(
2264
- minimum=0,
2265
- maximum=1,
2266
- label="index rate",
2267
- value=0,
2268
- interactive=True,
2269
- )
2270
-
2271
- # animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
2272
-
2273
- with gr.Accordion("Advanced Options", open=False):
2274
- f0method0 = gr.Radio(
2275
- label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
2276
- choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
2277
- value="rmvpe",
2278
- interactive=True,
2279
- )
2280
 
2281
- crepe_hop_length = gr.Slider(
2282
- minimum=1,
2283
- maximum=512,
2284
- step=1,
2285
- label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
2286
- value=120,
2287
- interactive=True,
2288
- visible=False,
2289
- )
2290
- f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
2291
- filter_radius0 = gr.Slider(
2292
- minimum=0,
2293
- maximum=7,
2294
- label="label",
2295
- value=3,
2296
- step=1,
2297
- interactive=True,
2298
- )
2299
- resample_sr0 = gr.Slider(
2300
- minimum=0,
2301
- maximum=48000,
2302
- label="label",
2303
- value=0,
2304
- step=1,
2305
- interactive=True,
2306
- visible=False
2307
- )
2308
- rms_mix_rate0 = gr.Slider(
2309
- minimum=0,
2310
- maximum=1,
2311
- label="label",
2312
- value=0.21,
2313
- interactive=True,
2314
- )
2315
- protect0 = gr.Slider(
2316
- minimum=0,
2317
- maximum=0.5,
2318
- label="label",
2319
- value=0,
2320
- step=0.01,
2321
- interactive=True,
2322
- )
2323
- formanting = gr.Checkbox(
2324
- value=bool(DoFormant),
2325
- label="[EXPERIMENTAL] Formant shift inference audio",
2326
- info="Used for male to female and vice-versa conversions",
2327
- interactive=True,
2328
- visible=True,
2329
- )
2330
 
2331
- formant_preset = gr.Dropdown(
2332
- value='',
2333
- choices=get_fshift_presets(),
2334
- label="browse presets for formanting",
2335
- visible=bool(DoFormant),
2336
- )
2337
- formant_refresh_button = gr.Button(
2338
- value='\U0001f504',
2339
- visible=bool(DoFormant),
2340
- variant='primary',
2341
- )
2342
- #formant_refresh_button = ToolButton( elem_id='1')
2343
- #create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
2344
 
2345
- qfrency = gr.Slider(
2346
- value=Quefrency,
2347
- info="Default value is 1.0",
2348
- label="Frequency for formant shifting",
2349
- minimum=0.0,
2350
- maximum=16.0,
2351
- step=0.1,
2352
- visible=bool(DoFormant),
2353
- interactive=True,
2354
- )
2355
- tmbre = gr.Slider(
2356
- value=Timbre,
2357
- info="Default value is 1.0",
2358
- label="Timbre for formant shifting",
2359
- minimum=0.0,
2360
- maximum=16.0,
2361
- step=0.1,
2362
- visible=bool(DoFormant),
2363
- interactive=True,
2364
- )
2365
 
2366
- formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
2367
- frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
2368
- formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
2369
- frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
2370
- formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
2371
 
2372
- with gr.Row():
2373
- vc_output1 = gr.Textbox("")
2374
- f0_file = gr.File(label="f0 file", visible=False)
2375
-
2376
- # run_btn.click(fn=run,
2377
- # inputs=[
2378
- # input_audio0,
2379
- # seed,
2380
- # stop_repitition,
2381
- # sample_batch_size,
2382
- # left_margin,
2383
- # right_margin,
2384
- # codecaudio_sr,
2385
- # codec_sr,
2386
- # top_k,
2387
- # top_p,
2388
- # temperature,
2389
- # kvcache,
2390
- # cutoff_value,
2391
- # target_transcript,
2392
- # silence_tokens,
2393
- # transcribed_text],
2394
- # outputs=[
2395
- # output_audio_con,
2396
- # output_audio_gen
2397
- # ])
2398
 
2399
- # but0.click(
2400
- # vc_single,
2401
- # [
2402
- # spk_item,
2403
- # input_audio0,
2404
- # vc_transform0,
2405
- # f0_file,
2406
- # f0method0,
2407
- # file_index1,
2408
- # # file_index2,
2409
- # # file_big_npy1,
2410
- # index_rate1,
2411
- # filter_radius0,
2412
- # resample_sr0,
2413
- # rms_mix_rate0,
2414
- # protect0,
2415
- # crepe_hop_length
2416
- # ],
2417
- # [vc_output1, vc_output2],
2418
- # )
2419
-
2420
- but0.click(
2421
- fn=run_joint,
2422
- inputs=[
2423
- input_audio0,
2424
- seed,
2425
- stop_repitition,
2426
- sample_batch_size,
2427
- left_margin,
2428
- right_margin,
2429
- codecaudio_sr,
2430
- codec_sr,
2431
- top_k,
2432
- top_p,
2433
- temperature,
2434
- kvcache,
2435
- target_transcript,
2436
- silence_tokens,
2437
- spk_item,
2438
- vc_transform0,
2439
- f0_file,
2440
- f0method0,
2441
- file_index1,
2442
- # file_index2,
2443
- # file_big_npy1,
2444
- index_rate1,
2445
- filter_radius0,
2446
- resample_sr0,
2447
- rms_mix_rate0,
2448
- protect0,
2449
- crepe_hop_length
2450
- ],
2451
- outputs=[vc_output1, output_audio_gen, vc_output2])
2452
 
2453
- with gr.Accordion("Batch Conversion",open=False, visible=False):
2454
- with gr.Row():
2455
- with gr.Column():
2456
- vc_transform1 = gr.Number(
2457
- label="speaker id", value=0
2458
- )
2459
- opt_input = gr.Textbox(label="opt", value="opt")
2460
- f0method1 = gr.Radio(
2461
- label="f0 method",
2462
- choices=["pm", "harvest", "crepe", "rmvpe"],
2463
- value="rmvpe",
2464
- interactive=True,
2465
- )
2466
- filter_radius1 = gr.Slider(
2467
- minimum=0,
2468
- maximum=7,
2469
- label="harvest",
2470
- value=3,
2471
- step=1,
2472
- interactive=True,
2473
- )
2474
- with gr.Column():
2475
- file_index3 = gr.Textbox(
2476
- label="file index",
2477
- value="",
2478
- interactive=True,
2479
- )
2480
- file_index4 = gr.Dropdown(
2481
- label="index path (dropdown)",
2482
- choices=sorted(index_paths),
2483
- interactive=True,
2484
- )
2485
- refresh_button.click(
2486
- fn=lambda username: change_choices(username)[1],
2487
- inputs=[gr.State('username')],
2488
- outputs=file_index4,
2489
- )
2490
- # file_big_npy2 = gr.Textbox(
2491
- # label=i18n("特征文件路径"),
2492
- # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2493
- # interactive=True,
2494
- # )
2495
- index_rate2 = gr.Slider(
2496
- minimum=0,
2497
- maximum=1,
2498
- label="index rate 2",
2499
- value=1,
2500
- interactive=True,
2501
- )
2502
- with gr.Column():
2503
- resample_sr1 = gr.Slider(
2504
- minimum=0,
2505
- maximum=48000,
2506
- label="resample rate",
2507
- value=0,
2508
- step=1,
2509
- interactive=True,
2510
- )
2511
- rms_mix_rate1 = gr.Slider(
2512
- minimum=0,
2513
- maximum=1,
2514
- label="rms mix rate",
2515
- value=1,
2516
- interactive=True,
2517
- )
2518
- protect1 = gr.Slider(
2519
- minimum=0,
2520
- maximum=0.5,
2521
- label="protection rate",
2522
- value=0.33,
2523
- step=0.01,
2524
- interactive=True,
2525
- )
2526
- with gr.Column():
2527
- dir_input = gr.Textbox(
2528
- label="directory input",
2529
- value="E:\codes\py39\\test-20230416b\\todo-songs",
2530
- )
2531
- inputs = gr.File(
2532
- file_count="multiple", label="input"
2533
- )
2534
- with gr.Row():
2535
- format1 = gr.Radio(
2536
- label="output format",
2537
- choices=["wav", "flac", "mp3", "m4a"],
2538
- value="flac",
2539
- interactive=True,
2540
- )
2541
- but1 = gr.Button("primary", variant="primary")
2542
- vc_output3 = gr.Textbox(label="label")
2543
- but1.click(
2544
- vc_multi,
2545
- [
2546
- spk_item,
2547
- dir_input,
2548
- opt_input,
2549
- inputs,
2550
- vc_transform1,
2551
- f0method1,
2552
- file_index3,
2553
- file_index4,
2554
- # file_big_npy2,
2555
- index_rate2,
2556
- filter_radius1,
2557
- resample_sr1,
2558
- rms_mix_rate1,
2559
- protect1,
2560
- format1,
2561
- crepe_hop_length,
2562
- ],
2563
- [vc_output3],
2564
- )
2565
- but1.click(fn=lambda: easy_uploader.clear())
2566
  with gr.TabItem("Download Voice Models"):
2567
  with gr.Row():
2568
  url=gr.Textbox(label="Huggingface Link:")
 
1
  import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
2
 
3
  import pandas as pd
4
+ # import torchaudio
5
+ # from lib.voicecraft.data.tokenizer import (
6
+ # AudioTokenizer,
7
+ # TextTokenizer,
8
+ # )
9
+ # import whisperx
10
  import os
11
  import time
12
  import gc
 
1472
 
1473
 
1474
 
1475
+ # def transcribe_btn_click(audio_choice):
1476
+ # global transcript_fn
1477
+ # global audio_fn
1478
 
1479
+ # temp_folder = "./demo/temp"
1480
+ # orig_audio = audio_choice
1481
+ # filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1482
+ # audio_fn = f"{temp_folder}/{filename}.wav"
1483
+ # transcript_fn = f"{temp_folder}/{filename}.txt"
1484
+ # if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
1485
+ # print("Audio and transcript already exist, skipping transcript")
1486
+ # transcript = open(transcript_fn, "r").read()
1487
+ # return transcript
1488
 
1489
+ # batch_size = 1 # Adjust based on your GPU memory availability
1490
+ # compute_type = "float16"
1491
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
1492
 
1493
+ # model = whisperx.load_model("large-v2", device, compute_type=compute_type)
1494
+ # pre_result = model.transcribe(audio_choice, batch_size=batch_size)
1495
 
1496
+ # # Correctly handle the transcription result based on its structure
1497
+ # if 'segments' in pre_result:
1498
+ # result = " ".join([segment['text'] for segment in pre_result['segments']])
1499
+ # else:
1500
+ # result = pre_result.get('text', '')
1501
 
1502
+ # print("Transcribe text: " + result) # Directly print the result as it is now a string
1503
 
1504
+ # # remove model to save VRAM
1505
+ # gc.collect(); torch.cuda.empty_cache(); del model
1506
 
1507
+ # # point to the original file or record the file
1508
+ # # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
1509
+ # orig_audio = audio_choice
1510
+ # orig_transcript = result
1511
+ # # move the audio and transcript to temp folder
1512
+ # os.makedirs(temp_folder, exist_ok=True)
1513
+ # os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
1514
+ # filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1515
+ # with open(f"{temp_folder}/{filename}.txt", "w") as f:
1516
+ # f.write(orig_transcript)
1517
+ # # run MFA to get the alignment
1518
+ # align_temp = f"{temp_folder}/mfa_alignments"
1519
+ # os.makedirs(align_temp, exist_ok=True)
1520
 
1521
+ # audio_fn = f"{temp_folder}/{filename}.wav"
1522
+ # transcript_fn = f"{temp_folder}/{filename}.txt"
1523
 
1524
+ # return result
1525
 
1526
 
1527
+ # def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1528
+ # temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
1529
+ # global voicecraft_model, voicecraft_config, phn2num
1530
 
1531
+ # print("Transcribing the input audio")
1532
+ # transcribed_text = transcribe_btn_click(input_audio_fn)
1533
+ # print("Transcription complete")
1534
 
1535
+ # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1536
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1537
+ # os.environ["USER"] = "USER"
1538
+ # # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1539
+ # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1540
+ # target_transcript = transcribed_text + target_transcript
1541
+ # print(target_transcript)
1542
+ # info = torchaudio.info(audio_fn)
1543
+ # audio_dur = info.num_frames / info.sample_rate
1544
+ # print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
1545
+
1546
+ # print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
1547
+ # assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1548
+ # prompt_end_frame = int(cut_off_sec * info.sample_rate)
1549
+
1550
+ # # # load model, tokenizer, and other necessary files
1551
+ # # # original file loaded it each time. here we load it only once
1552
+ # # global model_loaded
1553
+ # # f model_loaded==False:
1554
+
1555
+ # if voicecraft_model is None:
1556
+ # load_voicecraft()
1557
+
1558
+ # encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1559
+ # text_tokenizer = TextTokenizer(backend="espeak")
1560
+ # audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1561
+
1562
+
1563
+ # # # run the model to get the output
1564
+ # decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1565
+ # 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1566
+ # "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1567
+ # from lib.voicecraft.inference_tts_scale import inference_one_sample
1568
+ # concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1569
+ # audio_fn, target_transcript, config.device, decode_config,
1570
+ # prompt_end_frame)
1571
+
1572
+ # # save segments for comparison
1573
+ # concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1574
+ # # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1575
+
1576
+ # output_dir = "./demo/generated_tts"
1577
+ # os.makedirs(output_dir, exist_ok=True)
1578
+ # seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1579
+ # seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1580
+
1581
+
1582
+ # torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1583
+ # torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1584
+
1585
+ # return [seg_save_fn_concat, seg_save_fn_gen]
1586
+
1587
+ # def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1588
+ # temperature, kvcache, target_transcript, silence_tokens,
1589
+ # sid,
1590
+ # f0_up_key,
1591
+ # f0_file,
1592
+ # f0_method,
1593
+ # file_index,
1594
+ # #file_index2,
1595
+ # # file_big_npy,
1596
+ # index_rate,
1597
+ # filter_radius,
1598
+ # resample_sr,
1599
+ # rms_mix_rate,
1600
+ # protect,
1601
+ # crepe_hop_length):
1602
+ # global voicecraft_model, voicecraft_config, phn2num
1603
+
1604
+ # print("Transcribing the input audio")
1605
+ # transcribed_text = transcribe_btn_click(input_audio_fn)
1606
+ # print("Transcription complete", transcribed_text)
1607
 
1608
+ # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1609
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1610
+ # os.environ["USER"] = "USER"
1611
+ # # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1612
+ # # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1613
+
1614
+ # target_transcript = transcribed_text + ' ' + target_transcript
1615
+ # print(target_transcript)
1616
+ # info = torchaudio.info(audio_fn)
1617
+ # audio_dur = info.num_frames / info.sample_rate
1618
+ # cut_off_sec = audio_dur - 0.1
1619
+
1620
+ # assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1621
+ # prompt_end_frame = int(cut_off_sec * info.sample_rate)
1622
+
1623
+ # if voicecraft_model is None:
1624
+ # load_voicecraft()
1625
+
1626
+ # encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1627
+ # text_tokenizer = TextTokenizer(backend="espeak")
1628
+ # audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1629
+
1630
+
1631
+ # # # run the model to get the output
1632
+ # decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1633
+ # 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1634
+ # "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1635
+ # from lib.voicecraft.inference_tts_scale import inference_one_sample
1636
+ # concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1637
+ # audio_fn, target_transcript, config.device, decode_config,
1638
+ # prompt_end_frame)
1639
+ # print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
1640
+
1641
+ # # save segments for comparison
1642
+ # concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1643
+ # # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1644
+
1645
+ # output_dir = "./demo/generated_tts"
1646
+ # os.makedirs(output_dir, exist_ok=True)
1647
+ # seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1648
+ # seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1649
+
1650
+
1651
+ # torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1652
+ # torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1653
+
1654
+
1655
+ # global tgt_sr, net_g, vc, hubert_model, version
1656
+
1657
+ # f0_up_key = int(f0_up_key)
1658
+ # try:
1659
+ # # audio = gen_audio.squeeze()
1660
+ # audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
1661
+ # audio_max = np.abs(audio).max() / 0.95
1662
+ # if audio_max > 1:
1663
+ # audio /= audio_max
1664
+ # times = [0, 0, 0]
1665
+ # if hubert_model == None:
1666
+ # load_hubert()
1667
+ # if_f0 = cpt.get("f0", 1)
1668
+ # file_index = (
1669
+ # (
1670
+ # file_index.strip(" ")
1671
+ # .strip('"')
1672
+ # .strip("\n")
1673
+ # .strip('"')
1674
+ # .strip(" ")
1675
+ # .replace("trained", "added")
1676
+ # )
1677
+ # ) # 防止小白写错,自动帮他替换掉
1678
+ # # file_big_npy = (
1679
+ # # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
1680
+ # # )
1681
+ # print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
1682
+ # audio_opt = vc.pipeline(
1683
+ # hubert_model,
1684
+ # net_g,
1685
+ # sid,
1686
+ # audio,
1687
+ # seg_save_fn_gen,
1688
+ # times,
1689
+ # f0_up_key,
1690
+ # f0_method,
1691
+ # file_index,
1692
+ # # file_big_npy,
1693
+ # index_rate,
1694
+ # if_f0,
1695
+ # filter_radius,
1696
+ # tgt_sr,
1697
+ # resample_sr,
1698
+ # rms_mix_rate,
1699
+ # version,
1700
+ # protect,
1701
+ # crepe_hop_length,
1702
+ # f0_file=f0_file,
1703
+ # )
1704
+ # if resample_sr >= 16000 and tgt_sr != resample_sr:
1705
+ # tgt_sr = resample_sr
1706
+ # index_info = (
1707
+ # "Using index:%s." % file_index
1708
+ # if os.path.exists(file_index)
1709
+ # else "Index not used."
1710
+ # )
1711
+ # return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
1712
+ # index_info,
1713
+ # times[0],
1714
+ # times[1],
1715
+ # times[2],
1716
+ # ), seg_save_fn_gen, (tgt_sr, audio_opt)
1717
+ # except:
1718
+ # info = traceback.format_exc()
1719
+ # print(info)
1720
+ # return info, (None, None)
1721
 
1722
 
1723
 
 
2136
  [vc_output3],
2137
  )
2138
  but1.click(fn=lambda: easy_uploader.clear())
2139
+ # with gr.TabItem("TTS"):
2140
+ # app.load(update_message)
2141
+
2142
+ # # Other RVC stuff
2143
+ # with gr.Row():
2144
+ # sid0 = gr.Dropdown(label="1. Choose your model", choices=sorted(names), value=check_for_name())
2145
+ # refresh_button = gr.Button("Refresh", variant="primary")
2146
+ # if check_for_name() != '':
2147
+ # get_vc(sorted(names)[0])
2148
+ # vc_transform0 = gr.Number(label="Key Shift: 0 for no key shifted output; 12 f for output an octave higher and -12 for output an octave lower.", value=0)
2149
+ # #clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
2150
+ # spk_item = gr.Slider(
2151
+ # minimum=0,
2152
+ # maximum=2333,
2153
+ # step=1,
2154
+ # label="speaker id",
2155
+ # value=0,
2156
+ # visible=False,
2157
+ # interactive=True,
2158
+ # )
2159
+ # #clean_button.click(fn=clean, inputs=[], outputs=[sid0])
2160
+ # sid0.change(
2161
+ # fn=get_vc,
2162
+ # inputs=[sid0],
2163
+ # outputs=[spk_item],
2164
+ # )
2165
+ # but0 = gr.Button("Convert", variant="primary")
2166
+ # with gr.Row():
2167
+ # with gr.Column():
2168
+ # # with gr.Row():
2169
+ # # dropbox = gr.File(label="Drag your audio file and click refresh.")
2170
+ # with gr.Row():
2171
+ # record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
2172
+ # with gr.Row():
2173
+ # input_audio0 = gr.Dropdown(
2174
+ # label="2.Choose the audio file.",
2175
+ # value="./audios/calm.wav",
2176
+ # choices=audio_files
2177
+ # )
2178
+ # audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
2179
+ # # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
2180
+ # # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
2181
+ # refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
2182
+ # # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2183
+ # # value="The dogs sat at the door.",
2184
+ # # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2185
+ # record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
2186
+ # record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
2187
+ # # update audio_display
2188
+ # input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
2189
 
2190
+ # with gr.Row():
2191
+ # # with gr.Column():
2192
+ # # input_audio = gr.Audio(label="Input Audio", type="filepath")
2193
+ # # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
2194
+ # # # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
2195
+ # # # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
2196
+ # # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2197
+ # # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2198
+ # # transcribe_info_text = gr.TextArea(label="How to use",
2199
+ # # value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
2200
+ # # transcribe_btn = gr.Button(value="transcribe and create mfa")
2201
 
2202
 
2203
+ # with gr.Column():
2204
+ # target_transcript = gr.Textbox(label="target transcript")
2205
 
2206
+ # # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
2207
+ # # outputs=[transcribed_text])
2208
 
2209
 
2210
 
2211
+ # with gr.Column():
2212
 
2213
+ # output_audio_gen = gr.Audio(
2214
+ # label="Output Audio generated",
2215
+ # type='filepath',
2216
+ # interactive=False
2217
+ # )
2218
 
2219
 
2220
+ # vc_output2 = gr.Audio(
2221
+ # label="Voice converted! (Click on the three dots to download the audio)",
2222
+ # type='filepath',
2223
+ # interactive=False,
2224
+ # )
2225
 
2226
+ # #with gr.Column():
2227
+ # with gr.Accordion("Advanced TTS Settings", open=False):
2228
+ # seed = gr.Number(label='seed', interactive=True, value=1)
2229
+ # stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
2230
+ # info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
2231
+ # sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
2232
+ # info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
2233
+ # left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
2234
+ # info=" not used for TTS, only for speech editing")
2235
+ # right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
2236
+ # info=" not used for TTS, only for speech editing")
2237
+ # codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
2238
+ # codec_sr = gr.Number(label='codec', interactive=True, value=50)
2239
+ # top_k = gr.Number(label='top_k', interactive=True, value=0)
2240
+ # top_p = gr.Number(label='top_p', interactive=True, value=0.8)
2241
+ # temperature = gr.Number(label='temperature', interactive=True, value=1)
2242
+ # kvcache = gr.Number(label='kvcache', interactive=True, value=1,
2243
+ # info='set to 0 to use less VRAM, results may be worse and slower inference')
2244
+ # silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
2245
+ # with gr.Accordion("Index Settings", open=False):
2246
+ # #with gr.Row():
2247
 
2248
+ # file_index1 = gr.Dropdown(
2249
+ # label="3. Choose the index file (in case it wasn't automatically found.)",
2250
+ # choices=get_indexes(),
2251
+ # value=get_index(),
2252
+ # interactive=True,
2253
+ # )
2254
+ # sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
2255
+ # refresh_button.click(
2256
+ # fn=change_choices, inputs=[], outputs=[sid0, file_index1]
2257
+ # )
2258
+ # # file_big_npy1 = gr.Textbox(
2259
+ # # label=i18n("特征文件路径"),
2260
+ # # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2261
+ # # interactive=True,
2262
+ # # )
2263
+ # index_rate1 = gr.Slider(
2264
+ # minimum=0,
2265
+ # maximum=1,
2266
+ # label="index rate",
2267
+ # value=0,
2268
+ # interactive=True,
2269
+ # )
2270
+
2271
+ # # animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
2272
+
2273
+ # with gr.Accordion("Advanced Options", open=False):
2274
+ # f0method0 = gr.Radio(
2275
+ # label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
2276
+ # choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
2277
+ # value="rmvpe",
2278
+ # interactive=True,
2279
+ # )
2280
 
2281
+ # crepe_hop_length = gr.Slider(
2282
+ # minimum=1,
2283
+ # maximum=512,
2284
+ # step=1,
2285
+ # label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
2286
+ # value=120,
2287
+ # interactive=True,
2288
+ # visible=False,
2289
+ # )
2290
+ # f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
2291
+ # filter_radius0 = gr.Slider(
2292
+ # minimum=0,
2293
+ # maximum=7,
2294
+ # label="label",
2295
+ # value=3,
2296
+ # step=1,
2297
+ # interactive=True,
2298
+ # )
2299
+ # resample_sr0 = gr.Slider(
2300
+ # minimum=0,
2301
+ # maximum=48000,
2302
+ # label="label",
2303
+ # value=0,
2304
+ # step=1,
2305
+ # interactive=True,
2306
+ # visible=False
2307
+ # )
2308
+ # rms_mix_rate0 = gr.Slider(
2309
+ # minimum=0,
2310
+ # maximum=1,
2311
+ # label="label",
2312
+ # value=0.21,
2313
+ # interactive=True,
2314
+ # )
2315
+ # protect0 = gr.Slider(
2316
+ # minimum=0,
2317
+ # maximum=0.5,
2318
+ # label="label",
2319
+ # value=0,
2320
+ # step=0.01,
2321
+ # interactive=True,
2322
+ # )
2323
+ # formanting = gr.Checkbox(
2324
+ # value=bool(DoFormant),
2325
+ # label="[EXPERIMENTAL] Formant shift inference audio",
2326
+ # info="Used for male to female and vice-versa conversions",
2327
+ # interactive=True,
2328
+ # visible=True,
2329
+ # )
2330
 
2331
+ # formant_preset = gr.Dropdown(
2332
+ # value='',
2333
+ # choices=get_fshift_presets(),
2334
+ # label="browse presets for formanting",
2335
+ # visible=bool(DoFormant),
2336
+ # )
2337
+ # formant_refresh_button = gr.Button(
2338
+ # value='\U0001f504',
2339
+ # visible=bool(DoFormant),
2340
+ # variant='primary',
2341
+ # )
2342
+ # #formant_refresh_button = ToolButton( elem_id='1')
2343
+ # #create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
2344
 
2345
+ # qfrency = gr.Slider(
2346
+ # value=Quefrency,
2347
+ # info="Default value is 1.0",
2348
+ # label="Frequency for formant shifting",
2349
+ # minimum=0.0,
2350
+ # maximum=16.0,
2351
+ # step=0.1,
2352
+ # visible=bool(DoFormant),
2353
+ # interactive=True,
2354
+ # )
2355
+ # tmbre = gr.Slider(
2356
+ # value=Timbre,
2357
+ # info="Default value is 1.0",
2358
+ # label="Timbre for formant shifting",
2359
+ # minimum=0.0,
2360
+ # maximum=16.0,
2361
+ # step=0.1,
2362
+ # visible=bool(DoFormant),
2363
+ # interactive=True,
2364
+ # )
2365
 
2366
+ # formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
2367
+ # frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
2368
+ # formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
2369
+ # frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
2370
+ # formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
2371
 
2372
+ # with gr.Row():
2373
+ # vc_output1 = gr.Textbox("")
2374
+ # f0_file = gr.File(label="f0 file", visible=False)
2375
+
2376
+ # # run_btn.click(fn=run,
2377
+ # # inputs=[
2378
+ # # input_audio0,
2379
+ # # seed,
2380
+ # # stop_repitition,
2381
+ # # sample_batch_size,
2382
+ # # left_margin,
2383
+ # # right_margin,
2384
+ # # codecaudio_sr,
2385
+ # # codec_sr,
2386
+ # # top_k,
2387
+ # # top_p,
2388
+ # # temperature,
2389
+ # # kvcache,
2390
+ # # cutoff_value,
2391
+ # # target_transcript,
2392
+ # # silence_tokens,
2393
+ # # transcribed_text],
2394
+ # # outputs=[
2395
+ # # output_audio_con,
2396
+ # # output_audio_gen
2397
+ # # ])
2398
 
2399
+ # # but0.click(
2400
+ # # vc_single,
2401
+ # # [
2402
+ # # spk_item,
2403
+ # # input_audio0,
2404
+ # # vc_transform0,
2405
+ # # f0_file,
2406
+ # # f0method0,
2407
+ # # file_index1,
2408
+ # # # file_index2,
2409
+ # # # file_big_npy1,
2410
+ # # index_rate1,
2411
+ # # filter_radius0,
2412
+ # # resample_sr0,
2413
+ # # rms_mix_rate0,
2414
+ # # protect0,
2415
+ # # crepe_hop_length
2416
+ # # ],
2417
+ # # [vc_output1, vc_output2],
2418
+ # # )
2419
+
2420
+ # but0.click(
2421
+ # fn=run_joint,
2422
+ # inputs=[
2423
+ # input_audio0,
2424
+ # seed,
2425
+ # stop_repitition,
2426
+ # sample_batch_size,
2427
+ # left_margin,
2428
+ # right_margin,
2429
+ # codecaudio_sr,
2430
+ # codec_sr,
2431
+ # top_k,
2432
+ # top_p,
2433
+ # temperature,
2434
+ # kvcache,
2435
+ # target_transcript,
2436
+ # silence_tokens,
2437
+ # spk_item,
2438
+ # vc_transform0,
2439
+ # f0_file,
2440
+ # f0method0,
2441
+ # file_index1,
2442
+ # # file_index2,
2443
+ # # file_big_npy1,
2444
+ # index_rate1,
2445
+ # filter_radius0,
2446
+ # resample_sr0,
2447
+ # rms_mix_rate0,
2448
+ # protect0,
2449
+ # crepe_hop_length
2450
+ # ],
2451
+ # outputs=[vc_output1, output_audio_gen, vc_output2])
2452
 
2453
+ # with gr.Accordion("Batch Conversion",open=False, visible=False):
2454
+ # with gr.Row():
2455
+ # with gr.Column():
2456
+ # vc_transform1 = gr.Number(
2457
+ # label="speaker id", value=0
2458
+ # )
2459
+ # opt_input = gr.Textbox(label="opt", value="opt")
2460
+ # f0method1 = gr.Radio(
2461
+ # label="f0 method",
2462
+ # choices=["pm", "harvest", "crepe", "rmvpe"],
2463
+ # value="rmvpe",
2464
+ # interactive=True,
2465
+ # )
2466
+ # filter_radius1 = gr.Slider(
2467
+ # minimum=0,
2468
+ # maximum=7,
2469
+ # label="harvest",
2470
+ # value=3,
2471
+ # step=1,
2472
+ # interactive=True,
2473
+ # )
2474
+ # with gr.Column():
2475
+ # file_index3 = gr.Textbox(
2476
+ # label="file index",
2477
+ # value="",
2478
+ # interactive=True,
2479
+ # )
2480
+ # file_index4 = gr.Dropdown(
2481
+ # label="index path (dropdown)",
2482
+ # choices=sorted(index_paths),
2483
+ # interactive=True,
2484
+ # )
2485
+ # refresh_button.click(
2486
+ # fn=lambda username: change_choices(username)[1],
2487
+ # inputs=[gr.State('username')],
2488
+ # outputs=file_index4,
2489
+ # )
2490
+ # # file_big_npy2 = gr.Textbox(
2491
+ # # label=i18n("特征文件路径"),
2492
+ # # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2493
+ # # interactive=True,
2494
+ # # )
2495
+ # index_rate2 = gr.Slider(
2496
+ # minimum=0,
2497
+ # maximum=1,
2498
+ # label="index rate 2",
2499
+ # value=1,
2500
+ # interactive=True,
2501
+ # )
2502
+ # with gr.Column():
2503
+ # resample_sr1 = gr.Slider(
2504
+ # minimum=0,
2505
+ # maximum=48000,
2506
+ # label="resample rate",
2507
+ # value=0,
2508
+ # step=1,
2509
+ # interactive=True,
2510
+ # )
2511
+ # rms_mix_rate1 = gr.Slider(
2512
+ # minimum=0,
2513
+ # maximum=1,
2514
+ # label="rms mix rate",
2515
+ # value=1,
2516
+ # interactive=True,
2517
+ # )
2518
+ # protect1 = gr.Slider(
2519
+ # minimum=0,
2520
+ # maximum=0.5,
2521
+ # label="protection rate",
2522
+ # value=0.33,
2523
+ # step=0.01,
2524
+ # interactive=True,
2525
+ # )
2526
+ # with gr.Column():
2527
+ # dir_input = gr.Textbox(
2528
+ # label="directory input",
2529
+ # value="E:\codes\py39\\test-20230416b\\todo-songs",
2530
+ # )
2531
+ # inputs = gr.File(
2532
+ # file_count="multiple", label="input"
2533
+ # )
2534
+ # with gr.Row():
2535
+ # format1 = gr.Radio(
2536
+ # label="output format",
2537
+ # choices=["wav", "flac", "mp3", "m4a"],
2538
+ # value="flac",
2539
+ # interactive=True,
2540
+ # )
2541
+ # but1 = gr.Button("primary", variant="primary")
2542
+ # vc_output3 = gr.Textbox(label="label")
2543
+ # but1.click(
2544
+ # vc_multi,
2545
+ # [
2546
+ # spk_item,
2547
+ # dir_input,
2548
+ # opt_input,
2549
+ # inputs,
2550
+ # vc_transform1,
2551
+ # f0method1,
2552
+ # file_index3,
2553
+ # file_index4,
2554
+ # # file_big_npy2,
2555
+ # index_rate2,
2556
+ # filter_radius1,
2557
+ # resample_sr1,
2558
+ # rms_mix_rate1,
2559
+ # protect1,
2560
+ # format1,
2561
+ # crepe_hop_length,
2562
+ # ],
2563
+ # [vc_output3],
2564
+ # )
2565
+ # but1.click(fn=lambda: easy_uploader.clear())
2566
  with gr.TabItem("Download Voice Models"):
2567
  with gr.Row():
2568
  url=gr.Textbox(label="Huggingface Link:")
requirements.txt CHANGED
@@ -17,12 +17,4 @@ mega.py
  gdown==5.1.0
  onnxruntime
  pyngrok==4.1.12
- xformers==0.0.22
- torchaudio==2.0.2
- torch==2.0.1 # this assumes your system is compatible with CUDA 11.7, otherwise checkout https://pytorch.org/get-started/previous-versions/#v201
- tensorboard==2.16.2
- phonemizer==3.2.1
- datasets==2.16.0
- torchmetrics==0.11.1
- whisperx @ git+https://github.com/m-bain/whisperx.git
  # install MFA for getting forced-alignment, this could take a few minutes