ajayarora1235 committed
Commit d602592 · 1 Parent(s): 005bb33

get rid of voicebox tab to move python versions

Files changed (3)
  1. README.md +0 -1
  2. app.py +646 -646
  3. requirements.txt +0 -8
README.md CHANGED
@@ -5,7 +5,6 @@ colorFrom: pink
  colorTo: pink
  sdk: gradio
  sdk_version: 3.42.0
- python_version: 3.9.16
  app_file: app.py
  pinned: true
  ---
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
2
 
3
  import pandas as pd
4
- import torchaudio
5
- from lib.voicecraft.data.tokenizer import (
6
- AudioTokenizer,
7
- TextTokenizer,
8
- )
9
- import whisperx
10
  import os
11
  import time
12
  import gc
@@ -1472,252 +1472,252 @@ def stoptraining(mim):
1472
 
1473
 
1474
 
1475
- def transcribe_btn_click(audio_choice):
1476
- global transcript_fn
1477
- global audio_fn
1478
 
1479
- temp_folder = "./demo/temp"
1480
- orig_audio = audio_choice
1481
- filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1482
- audio_fn = f"{temp_folder}/{filename}.wav"
1483
- transcript_fn = f"{temp_folder}/{filename}.txt"
1484
- if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
1485
- print("Audio and transcript already exist, skipping transcript")
1486
- transcript = open(transcript_fn, "r").read()
1487
- return transcript
1488
 
1489
- batch_size = 1 # Adjust based on your GPU memory availability
1490
- compute_type = "float16"
1491
- device = "cuda" if torch.cuda.is_available() else "cpu"
1492
 
1493
- model = whisperx.load_model("large-v2", device, compute_type=compute_type)
1494
- pre_result = model.transcribe(audio_choice, batch_size=batch_size)
1495
 
1496
- # Correctly handle the transcription result based on its structure
1497
- if 'segments' in pre_result:
1498
- result = " ".join([segment['text'] for segment in pre_result['segments']])
1499
- else:
1500
- result = pre_result.get('text', '')
1501
 
1502
- print("Transcribe text: " + result) # Directly print the result as it is now a string
1503
 
1504
- # remove model to save VRAM
1505
- gc.collect(); torch.cuda.empty_cache(); del model
1506
 
1507
- # point to the original file or record the file
1508
- # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
1509
- orig_audio = audio_choice
1510
- orig_transcript = result
1511
- # move the audio and transcript to temp folder
1512
- os.makedirs(temp_folder, exist_ok=True)
1513
- os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
1514
- filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1515
- with open(f"{temp_folder}/{filename}.txt", "w") as f:
1516
- f.write(orig_transcript)
1517
- # run MFA to get the alignment
1518
- align_temp = f"{temp_folder}/mfa_alignments"
1519
- os.makedirs(align_temp, exist_ok=True)
1520
 
1521
- audio_fn = f"{temp_folder}/{filename}.wav"
1522
- transcript_fn = f"{temp_folder}/{filename}.txt"
1523
 
1524
- return result
1525
 
1526
 
1527
- def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1528
- temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
1529
- global voicecraft_model, voicecraft_config, phn2num
1530
 
1531
- print("Transcribing the input audio")
1532
- transcribed_text = transcribe_btn_click(input_audio_fn)
1533
- print("Transcription complete")
1534
 
1535
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1536
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1537
- os.environ["USER"] = "USER"
1538
- # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1539
- cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1540
- target_transcript = transcribed_text + target_transcript
1541
- print(target_transcript)
1542
- info = torchaudio.info(audio_fn)
1543
- audio_dur = info.num_frames / info.sample_rate
1544
- print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
1545
-
1546
- print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
1547
- assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1548
- prompt_end_frame = int(cut_off_sec * info.sample_rate)
1549
-
1550
- # # load model, tokenizer, and other necessary files
1551
- # # original file loaded it each time. here we load it only once
1552
- # global model_loaded
1553
- # f model_loaded==False:
1554
-
1555
- if voicecraft_model is None:
1556
- load_voicecraft()
1557
-
1558
- encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1559
- text_tokenizer = TextTokenizer(backend="espeak")
1560
- audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1561
-
1562
-
1563
- # # run the model to get the output
1564
- decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1565
- 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1566
- "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1567
- from lib.voicecraft.inference_tts_scale import inference_one_sample
1568
- concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1569
- audio_fn, target_transcript, config.device, decode_config,
1570
- prompt_end_frame)
1571
-
1572
- # save segments for comparison
1573
- concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1574
- # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1575
-
1576
- output_dir = "./demo/generated_tts"
1577
- os.makedirs(output_dir, exist_ok=True)
1578
- seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1579
- seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1580
-
1581
-
1582
- torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1583
- torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1584
-
1585
- return [seg_save_fn_concat, seg_save_fn_gen]
1586
-
1587
- def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1588
- temperature, kvcache, target_transcript, silence_tokens,
1589
- sid,
1590
- f0_up_key,
1591
- f0_file,
1592
- f0_method,
1593
- file_index,
1594
- #file_index2,
1595
- # file_big_npy,
1596
- index_rate,
1597
- filter_radius,
1598
- resample_sr,
1599
- rms_mix_rate,
1600
- protect,
1601
- crepe_hop_length):
1602
- global voicecraft_model, voicecraft_config, phn2num
1603
-
1604
- print("Transcribing the input audio")
1605
- transcribed_text = transcribe_btn_click(input_audio_fn)
1606
- print("Transcription complete", transcribed_text)
1607
 
1608
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1609
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1610
- os.environ["USER"] = "USER"
1611
- # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1612
- # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1613
-
1614
- target_transcript = transcribed_text + ' ' + target_transcript
1615
- print(target_transcript)
1616
- info = torchaudio.info(audio_fn)
1617
- audio_dur = info.num_frames / info.sample_rate
1618
- cut_off_sec = audio_dur - 0.1
1619
-
1620
- assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1621
- prompt_end_frame = int(cut_off_sec * info.sample_rate)
1622
-
1623
- if voicecraft_model is None:
1624
- load_voicecraft()
1625
-
1626
- encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1627
- text_tokenizer = TextTokenizer(backend="espeak")
1628
- audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1629
-
1630
-
1631
- # # run the model to get the output
1632
- decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1633
- 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1634
- "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1635
- from lib.voicecraft.inference_tts_scale import inference_one_sample
1636
- concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1637
- audio_fn, target_transcript, config.device, decode_config,
1638
- prompt_end_frame)
1639
- print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
1640
-
1641
- # save segments for comparison
1642
- concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1643
- # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1644
-
1645
- output_dir = "./demo/generated_tts"
1646
- os.makedirs(output_dir, exist_ok=True)
1647
- seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1648
- seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1649
-
1650
-
1651
- torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1652
- torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1653
-
1654
-
1655
- global tgt_sr, net_g, vc, hubert_model, version
1656
-
1657
- f0_up_key = int(f0_up_key)
1658
- try:
1659
- # audio = gen_audio.squeeze()
1660
- audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
1661
- audio_max = np.abs(audio).max() / 0.95
1662
- if audio_max > 1:
1663
- audio /= audio_max
1664
- times = [0, 0, 0]
1665
- if hubert_model == None:
1666
- load_hubert()
1667
- if_f0 = cpt.get("f0", 1)
1668
- file_index = (
1669
- (
1670
- file_index.strip(" ")
1671
- .strip('"')
1672
- .strip("\n")
1673
- .strip('"')
1674
- .strip(" ")
1675
- .replace("trained", "added")
1676
- )
1677
- ) # 防止小白写错,自动帮他替换掉
1678
- # file_big_npy = (
1679
- # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
1680
- # )
1681
- print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
1682
- audio_opt = vc.pipeline(
1683
- hubert_model,
1684
- net_g,
1685
- sid,
1686
- audio,
1687
- seg_save_fn_gen,
1688
- times,
1689
- f0_up_key,
1690
- f0_method,
1691
- file_index,
1692
- # file_big_npy,
1693
- index_rate,
1694
- if_f0,
1695
- filter_radius,
1696
- tgt_sr,
1697
- resample_sr,
1698
- rms_mix_rate,
1699
- version,
1700
- protect,
1701
- crepe_hop_length,
1702
- f0_file=f0_file,
1703
- )
1704
- if resample_sr >= 16000 and tgt_sr != resample_sr:
1705
- tgt_sr = resample_sr
1706
- index_info = (
1707
- "Using index:%s." % file_index
1708
- if os.path.exists(file_index)
1709
- else "Index not used."
1710
- )
1711
- return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
1712
- index_info,
1713
- times[0],
1714
- times[1],
1715
- times[2],
1716
- ), seg_save_fn_gen, (tgt_sr, audio_opt)
1717
- except:
1718
- info = traceback.format_exc()
1719
- print(info)
1720
- return info, (None, None)
1721
 
1722
 
1723
 
@@ -2136,433 +2136,433 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="pink", secondary_hue="rose")
2136
  [vc_output3],
2137
  )
2138
  but1.click(fn=lambda: easy_uploader.clear())
2139
- with gr.TabItem("TTS"):
2140
- app.load(update_message)
2141
-
2142
- # Other RVC stuff
2143
- with gr.Row():
2144
- sid0 = gr.Dropdown(label="1. Choose your model", choices=sorted(names), value=check_for_name())
2145
- refresh_button = gr.Button("Refresh", variant="primary")
2146
- if check_for_name() != '':
2147
- get_vc(sorted(names)[0])
2148
- vc_transform0 = gr.Number(label="Key Shift: 0 for no key shifted output; 12 f for output an octave higher and -12 for output an octave lower.", value=0)
2149
- #clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
2150
- spk_item = gr.Slider(
2151
- minimum=0,
2152
- maximum=2333,
2153
- step=1,
2154
- label="speaker id",
2155
- value=0,
2156
- visible=False,
2157
- interactive=True,
2158
- )
2159
- #clean_button.click(fn=clean, inputs=[], outputs=[sid0])
2160
- sid0.change(
2161
- fn=get_vc,
2162
- inputs=[sid0],
2163
- outputs=[spk_item],
2164
- )
2165
- but0 = gr.Button("Convert", variant="primary")
2166
- with gr.Row():
2167
- with gr.Column():
2168
- # with gr.Row():
2169
- # dropbox = gr.File(label="Drag your audio file and click refresh.")
2170
- with gr.Row():
2171
- record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
2172
- with gr.Row():
2173
- input_audio0 = gr.Dropdown(
2174
- label="2.Choose the audio file.",
2175
- value="./audios/calm.wav",
2176
- choices=audio_files
2177
- )
2178
- audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
2179
- # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
2180
- # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
2181
- refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
2182
- # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2183
- # value="The dogs sat at the door.",
2184
- # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2185
- record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
2186
- record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
2187
- # update audio_display
2188
- input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
2189
 
2190
- with gr.Row():
2191
- # with gr.Column():
2192
- # input_audio = gr.Audio(label="Input Audio", type="filepath")
2193
- # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
2194
- # # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
2195
- # # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
2196
- # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2197
- # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2198
- # transcribe_info_text = gr.TextArea(label="How to use",
2199
- # value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
2200
- # transcribe_btn = gr.Button(value="transcribe and create mfa")
2201
 
2202
 
2203
- with gr.Column():
2204
- target_transcript = gr.Textbox(label="target transcript")
2205
 
2206
- # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
2207
- # outputs=[transcribed_text])
2208
 
2209
 
2210
 
2211
- with gr.Column():
2212
 
2213
- output_audio_gen = gr.Audio(
2214
- label="Output Audio generated",
2215
- type='filepath',
2216
- interactive=False
2217
- )
2218
 
2219
 
2220
- vc_output2 = gr.Audio(
2221
- label="Voice converted! (Click on the three dots to download the audio)",
2222
- type='filepath',
2223
- interactive=False,
2224
- )
2225
 
2226
- #with gr.Column():
2227
- with gr.Accordion("Advanced TTS Settings", open=False):
2228
- seed = gr.Number(label='seed', interactive=True, value=1)
2229
- stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
2230
- info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
2231
- sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
2232
- info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
2233
- left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
2234
- info=" not used for TTS, only for speech editing")
2235
- right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
2236
- info=" not used for TTS, only for speech editing")
2237
- codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
2238
- codec_sr = gr.Number(label='codec', interactive=True, value=50)
2239
- top_k = gr.Number(label='top_k', interactive=True, value=0)
2240
- top_p = gr.Number(label='top_p', interactive=True, value=0.8)
2241
- temperature = gr.Number(label='temperature', interactive=True, value=1)
2242
- kvcache = gr.Number(label='kvcache', interactive=True, value=1,
2243
- info='set to 0 to use less VRAM, results may be worse and slower inference')
2244
- silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
2245
- with gr.Accordion("Index Settings", open=False):
2246
- #with gr.Row():
2247
 
2248
- file_index1 = gr.Dropdown(
2249
- label="3. Choose the index file (in case it wasn't automatically found.)",
2250
- choices=get_indexes(),
2251
- value=get_index(),
2252
- interactive=True,
2253
- )
2254
- sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
2255
- refresh_button.click(
2256
- fn=change_choices, inputs=[], outputs=[sid0, file_index1]
2257
- )
2258
- # file_big_npy1 = gr.Textbox(
2259
- # label=i18n("特征文件路径"),
2260
- # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2261
- # interactive=True,
2262
- # )
2263
- index_rate1 = gr.Slider(
2264
- minimum=0,
2265
- maximum=1,
2266
- label="index rate",
2267
- value=0,
2268
- interactive=True,
2269
- )
2270
-
2271
- # animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
2272
-
2273
- with gr.Accordion("Advanced Options", open=False):
2274
- f0method0 = gr.Radio(
2275
- label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
2276
- choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
2277
- value="rmvpe",
2278
- interactive=True,
2279
- )
2280
 
2281
- crepe_hop_length = gr.Slider(
2282
- minimum=1,
2283
- maximum=512,
2284
- step=1,
2285
- label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
2286
- value=120,
2287
- interactive=True,
2288
- visible=False,
2289
- )
2290
- f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
2291
- filter_radius0 = gr.Slider(
2292
- minimum=0,
2293
- maximum=7,
2294
- label="label",
2295
- value=3,
2296
- step=1,
2297
- interactive=True,
2298
- )
2299
- resample_sr0 = gr.Slider(
2300
- minimum=0,
2301
- maximum=48000,
2302
- label="label",
2303
- value=0,
2304
- step=1,
2305
- interactive=True,
2306
- visible=False
2307
- )
2308
- rms_mix_rate0 = gr.Slider(
2309
- minimum=0,
2310
- maximum=1,
2311
- label="label",
2312
- value=0.21,
2313
- interactive=True,
2314
- )
2315
- protect0 = gr.Slider(
2316
- minimum=0,
2317
- maximum=0.5,
2318
- label="label",
2319
- value=0,
2320
- step=0.01,
2321
- interactive=True,
2322
- )
2323
- formanting = gr.Checkbox(
2324
- value=bool(DoFormant),
2325
- label="[EXPERIMENTAL] Formant shift inference audio",
2326
- info="Used for male to female and vice-versa conversions",
2327
- interactive=True,
2328
- visible=True,
2329
- )
2330
 
2331
- formant_preset = gr.Dropdown(
2332
- value='',
2333
- choices=get_fshift_presets(),
2334
- label="browse presets for formanting",
2335
- visible=bool(DoFormant),
2336
- )
2337
- formant_refresh_button = gr.Button(
2338
- value='\U0001f504',
2339
- visible=bool(DoFormant),
2340
- variant='primary',
2341
- )
2342
- #formant_refresh_button = ToolButton( elem_id='1')
2343
- #create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
2344
 
2345
- qfrency = gr.Slider(
2346
- value=Quefrency,
2347
- info="Default value is 1.0",
2348
- label="Frequency for formant shifting",
2349
- minimum=0.0,
2350
- maximum=16.0,
2351
- step=0.1,
2352
- visible=bool(DoFormant),
2353
- interactive=True,
2354
- )
2355
- tmbre = gr.Slider(
2356
- value=Timbre,
2357
- info="Default value is 1.0",
2358
- label="Timbre for formant shifting",
2359
- minimum=0.0,
2360
- maximum=16.0,
2361
- step=0.1,
2362
- visible=bool(DoFormant),
2363
- interactive=True,
2364
- )
2365
 
2366
- formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
2367
- frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
2368
- formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
2369
- frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
2370
- formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
2371
 
2372
- with gr.Row():
2373
- vc_output1 = gr.Textbox("")
2374
- f0_file = gr.File(label="f0 file", visible=False)
2375
-
2376
- # run_btn.click(fn=run,
2377
- # inputs=[
2378
- # input_audio0,
2379
- # seed,
2380
- # stop_repitition,
2381
- # sample_batch_size,
2382
- # left_margin,
2383
- # right_margin,
2384
- # codecaudio_sr,
2385
- # codec_sr,
2386
- # top_k,
2387
- # top_p,
2388
- # temperature,
2389
- # kvcache,
2390
- # cutoff_value,
2391
- # target_transcript,
2392
- # silence_tokens,
2393
- # transcribed_text],
2394
- # outputs=[
2395
- # output_audio_con,
2396
- # output_audio_gen
2397
- # ])
2398
 
2399
- # but0.click(
2400
- # vc_single,
2401
- # [
2402
- # spk_item,
2403
- # input_audio0,
2404
- # vc_transform0,
2405
- # f0_file,
2406
- # f0method0,
2407
- # file_index1,
2408
- # # file_index2,
2409
- # # file_big_npy1,
2410
- # index_rate1,
2411
- # filter_radius0,
2412
- # resample_sr0,
2413
- # rms_mix_rate0,
2414
- # protect0,
2415
- # crepe_hop_length
2416
- # ],
2417
- # [vc_output1, vc_output2],
2418
- # )
2419
-
2420
- but0.click(
2421
- fn=run_joint,
2422
- inputs=[
2423
- input_audio0,
2424
- seed,
2425
- stop_repitition,
2426
- sample_batch_size,
2427
- left_margin,
2428
- right_margin,
2429
- codecaudio_sr,
2430
- codec_sr,
2431
- top_k,
2432
- top_p,
2433
- temperature,
2434
- kvcache,
2435
- target_transcript,
2436
- silence_tokens,
2437
- spk_item,
2438
- vc_transform0,
2439
- f0_file,
2440
- f0method0,
2441
- file_index1,
2442
- # file_index2,
2443
- # file_big_npy1,
2444
- index_rate1,
2445
- filter_radius0,
2446
- resample_sr0,
2447
- rms_mix_rate0,
2448
- protect0,
2449
- crepe_hop_length
2450
- ],
2451
- outputs=[vc_output1, output_audio_gen, vc_output2])
2452
 
2453
- with gr.Accordion("Batch Conversion",open=False, visible=False):
2454
- with gr.Row():
2455
- with gr.Column():
2456
- vc_transform1 = gr.Number(
2457
- label="speaker id", value=0
2458
- )
2459
- opt_input = gr.Textbox(label="opt", value="opt")
2460
- f0method1 = gr.Radio(
2461
- label="f0 method",
2462
- choices=["pm", "harvest", "crepe", "rmvpe"],
2463
- value="rmvpe",
2464
- interactive=True,
2465
- )
2466
- filter_radius1 = gr.Slider(
2467
- minimum=0,
2468
- maximum=7,
2469
- label="harvest",
2470
- value=3,
2471
- step=1,
2472
- interactive=True,
2473
- )
2474
- with gr.Column():
2475
- file_index3 = gr.Textbox(
2476
- label="file index",
2477
- value="",
2478
- interactive=True,
2479
- )
2480
- file_index4 = gr.Dropdown(
2481
- label="index path (dropdown)",
2482
- choices=sorted(index_paths),
2483
- interactive=True,
2484
- )
2485
- refresh_button.click(
2486
- fn=lambda username: change_choices(username)[1],
2487
- inputs=[gr.State('username')],
2488
- outputs=file_index4,
2489
- )
2490
- # file_big_npy2 = gr.Textbox(
2491
- # label=i18n("特征文件路径"),
2492
- # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2493
- # interactive=True,
2494
- # )
2495
- index_rate2 = gr.Slider(
2496
- minimum=0,
2497
- maximum=1,
2498
- label="index rate 2",
2499
- value=1,
2500
- interactive=True,
2501
- )
2502
- with gr.Column():
2503
- resample_sr1 = gr.Slider(
2504
- minimum=0,
2505
- maximum=48000,
2506
- label="resample rate",
2507
- value=0,
2508
- step=1,
2509
- interactive=True,
2510
- )
2511
- rms_mix_rate1 = gr.Slider(
2512
- minimum=0,
2513
- maximum=1,
2514
- label="rms mix rate",
2515
- value=1,
2516
- interactive=True,
2517
- )
2518
- protect1 = gr.Slider(
2519
- minimum=0,
2520
- maximum=0.5,
2521
- label="protection rate",
2522
- value=0.33,
2523
- step=0.01,
2524
- interactive=True,
2525
- )
2526
- with gr.Column():
2527
- dir_input = gr.Textbox(
2528
- label="directory input",
2529
- value="E:\codes\py39\\test-20230416b\\todo-songs",
2530
- )
2531
- inputs = gr.File(
2532
- file_count="multiple", label="input"
2533
- )
2534
- with gr.Row():
2535
- format1 = gr.Radio(
2536
- label="output format",
2537
- choices=["wav", "flac", "mp3", "m4a"],
2538
- value="flac",
2539
- interactive=True,
2540
- )
2541
- but1 = gr.Button("primary", variant="primary")
2542
- vc_output3 = gr.Textbox(label="label")
2543
- but1.click(
2544
- vc_multi,
2545
- [
2546
- spk_item,
2547
- dir_input,
2548
- opt_input,
2549
- inputs,
2550
- vc_transform1,
2551
- f0method1,
2552
- file_index3,
2553
- file_index4,
2554
- # file_big_npy2,
2555
- index_rate2,
2556
- filter_radius1,
2557
- resample_sr1,
2558
- rms_mix_rate1,
2559
- protect1,
2560
- format1,
2561
- crepe_hop_length,
2562
- ],
2563
- [vc_output3],
2564
- )
2565
- but1.click(fn=lambda: easy_uploader.clear())
2566
  with gr.TabItem("Download Voice Models"):
2567
  with gr.Row():
2568
  url=gr.Textbox(label="Huggingface Link:")
 
1
  import subprocess, torch, os, traceback, sys, warnings, shutil, numpy as np
2
 
3
  import pandas as pd
4
+ # import torchaudio
5
+ # from lib.voicecraft.data.tokenizer import (
6
+ # AudioTokenizer,
7
+ # TextTokenizer,
8
+ # )
9
+ # import whisperx
10
  import os
11
  import time
12
  import gc
 
1472
 
1473
 
1474
 
1475
+ # def transcribe_btn_click(audio_choice):
1476
+ # global transcript_fn
1477
+ # global audio_fn
1478
 
1479
+ # temp_folder = "./demo/temp"
1480
+ # orig_audio = audio_choice
1481
+ # filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1482
+ # audio_fn = f"{temp_folder}/{filename}.wav"
1483
+ # transcript_fn = f"{temp_folder}/{filename}.txt"
1484
+ # if os.path.exists(audio_fn) and os.path.exists(transcript_fn):
1485
+ # print("Audio and transcript already exist, skipping transcript")
1486
+ # transcript = open(transcript_fn, "r").read()
1487
+ # return transcript
1488
 
1489
+ # batch_size = 1 # Adjust based on your GPU memory availability
1490
+ # compute_type = "float16"
1491
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
1492
 
1493
+ # model = whisperx.load_model("large-v2", device, compute_type=compute_type)
1494
+ # pre_result = model.transcribe(audio_choice, batch_size=batch_size)
1495
 
1496
+ # # Correctly handle the transcription result based on its structure
1497
+ # if 'segments' in pre_result:
1498
+ # result = " ".join([segment['text'] for segment in pre_result['segments']])
1499
+ # else:
1500
+ # result = pre_result.get('text', '')
1501
 
1502
+ # print("Transcribe text: " + result) # Directly print the result as it is now a string
1503
 
1504
+ # # remove model to save VRAM
1505
+ # gc.collect(); torch.cuda.empty_cache(); del model
1506
 
1507
+ # # point to the original file or record the file
1508
+ # # write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
1509
+ # orig_audio = audio_choice
1510
+ # orig_transcript = result
1511
+ # # move the audio and transcript to temp folder
1512
+ # os.makedirs(temp_folder, exist_ok=True)
1513
+ # os.system(f"cp \"{orig_audio}\" \"{temp_folder}\"")
1514
+ # filename = os.path.splitext(orig_audio.split("/")[-1])[0]
1515
+ # with open(f"{temp_folder}/{filename}.txt", "w") as f:
1516
+ # f.write(orig_transcript)
1517
+ # # run MFA to get the alignment
1518
+ # align_temp = f"{temp_folder}/mfa_alignments"
1519
+ # os.makedirs(align_temp, exist_ok=True)
1520
 
1521
+ # audio_fn = f"{temp_folder}/{filename}.wav"
1522
+ # transcript_fn = f"{temp_folder}/{filename}.txt"
1523
 
1524
+ # return result
1525
 
1526
 
1527
+ # def run(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1528
+ # temperature, kvcache, cutoff_value, target_transcript, silence_tokens, transcribed_text):
1529
+ # global voicecraft_model, voicecraft_config, phn2num
1530
 
1531
+ # print("Transcribing the input audio")
1532
+ # transcribed_text = transcribe_btn_click(input_audio_fn)
1533
+ # print("Transcription complete")
1534
 
1535
+ # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1536
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1537
+ # os.environ["USER"] = "USER"
1538
+ # # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1539
+ # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1540
+ # target_transcript = transcribed_text + target_transcript
1541
+ # print(target_transcript)
1542
+ # info = torchaudio.info(audio_fn)
1543
+ # audio_dur = info.num_frames / info.sample_rate
1544
+ # print(f"Audio_fn num frames: {info.num_frames}, sample rate: {info.sample_rate}")
1545
+
1546
+ # print("audio dur s is", audio_dur, "cutoff_sec is", cut_off_sec)
1547
+ # assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1548
+ # prompt_end_frame = int(cut_off_sec * info.sample_rate)
1549
+
1550
+ # # # load model, tokenizer, and other necessary files
1551
+ # # # original file loaded it each time. here we load it only once
1552
+ # # global model_loaded
1553
+ # # f model_loaded==False:
1554
+
1555
+ # if voicecraft_model is None:
1556
+ # load_voicecraft()
1557
+
1558
+ # encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1559
+ # text_tokenizer = TextTokenizer(backend="espeak")
1560
+ # audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1561
+
1562
+
1563
+ # # # run the model to get the output
1564
+ # decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1565
+ # 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1566
+ # "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1567
+ # from lib.voicecraft.inference_tts_scale import inference_one_sample
1568
+ # concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1569
+ # audio_fn, target_transcript, config.device, decode_config,
1570
+ # prompt_end_frame)
1571
+
1572
+ # # save segments for comparison
1573
+ # concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1574
+ # # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1575
+
1576
+ # output_dir = "./demo/generated_tts"
1577
+ # os.makedirs(output_dir, exist_ok=True)
1578
+ # seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1579
+ # seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1580
+
1581
+
1582
+ # torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1583
+ # torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1584
+
1585
+ # return [seg_save_fn_concat, seg_save_fn_gen]
1586
+
1587
+ # def run_joint(input_audio_fn, seed, stop_repetition, sample_batch_size, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
1588
+ # temperature, kvcache, target_transcript, silence_tokens,
1589
+ # sid,
1590
+ # f0_up_key,
1591
+ # f0_file,
1592
+ # f0_method,
1593
+ # file_index,
1594
+ # #file_index2,
1595
+ # # file_big_npy,
1596
+ # index_rate,
1597
+ # filter_radius,
1598
+ # resample_sr,
1599
+ # rms_mix_rate,
1600
+ # protect,
1601
+ # crepe_hop_length):
1602
+ # global voicecraft_model, voicecraft_config, phn2num
1603
+
1604
+ # print("Transcribing the input audio")
1605
+ # transcribed_text = transcribe_btn_click(input_audio_fn)
1606
+ # print("Transcription complete", transcribed_text)
1607
 
1608
+ # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
1609
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
1610
+ # os.environ["USER"] = "USER"
1611
+ # # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt
1612
+ # # cut_off_sec = cutoff_value # NOTE: according to forced-alignment file, the word "common" stop as 3.01 sec, this should be different for different audio
1613
+
1614
+ # target_transcript = transcribed_text + ' ' + target_transcript
1615
+ # print(target_transcript)
1616
+ # info = torchaudio.info(audio_fn)
1617
+ # audio_dur = info.num_frames / info.sample_rate
1618
+ # cut_off_sec = audio_dur - 0.1
1619
+
1620
+ # assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"
1621
+ # prompt_end_frame = int(cut_off_sec * info.sample_rate)
1622
+
1623
+ # if voicecraft_model is None:
1624
+ # load_voicecraft()
1625
+
1626
+ # encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
1627
+ # text_tokenizer = TextTokenizer(backend="espeak")
1628
+ # audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu
1629
+
1630
+
1631
+ # # # run the model to get the output
1632
+ # decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
1633
+ # 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr,
1634
+ # "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size}
1635
+ # from lib.voicecraft.inference_tts_scale import inference_one_sample
1636
+ # concated_audio, gen_audio = inference_one_sample(voicecraft_model, voicecraft_config, phn2num, text_tokenizer, audio_tokenizer,
1637
+ # audio_fn, target_transcript, config.device, decode_config,
1638
+ # prompt_end_frame)
1639
+ # print("prompt_end_frame: ", prompt_end_frame, "voicecraft_config: ", voicecraft_config, "audio_fn: ", audio_fn, "target_transcript: ", target_transcript, "config.device: ", config.device, "decode_config: ", decode_config)
1640
+
1641
+ # # save segments for comparison
1642
+ # concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()
1643
+ # # logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
1644
+
1645
+ # output_dir = "./demo/generated_tts"
1646
+ # os.makedirs(output_dir, exist_ok=True)
1647
+ # seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
1648
+ # seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"
1649
+
1650
+
1651
+ # torchaudio.save(seg_save_fn_gen, gen_audio, int(codec_audio_sr))
1652
+ # torchaudio.save(seg_save_fn_concat, concated_audio, int(codec_audio_sr))
1653
+
1654
+
1655
+ # global tgt_sr, net_g, vc, hubert_model, version
1656
+
1657
+ # f0_up_key = int(f0_up_key)
1658
+ # try:
1659
+ # # audio = gen_audio.squeeze()
1660
+ # audio = load_audio(seg_save_fn_gen, 16000, DoFormant, Quefrency, Timbre).squeeze()
1661
+ # audio_max = np.abs(audio).max() / 0.95
1662
+ # if audio_max > 1:
1663
+ # audio /= audio_max
1664
+ # times = [0, 0, 0]
1665
+ # if hubert_model == None:
1666
+ # load_hubert()
1667
+ # if_f0 = cpt.get("f0", 1)
1668
+ # file_index = (
1669
+ # (
1670
+ # file_index.strip(" ")
1671
+ # .strip('"')
1672
+ # .strip("\n")
1673
+ # .strip('"')
1674
+ # .strip(" ")
1675
+ # .replace("trained", "added")
1676
+ # )
1677
+ # ) # 防止小白写错,自动帮他替换掉
1678
+ # # file_big_npy = (
1679
+ # # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
1680
+ # # )
1681
+ # print(f"Making VC Pipeline, device: {config.device}, audio shape: {audio.shape}")
1682
+ # audio_opt = vc.pipeline(
1683
+ # hubert_model,
1684
+ # net_g,
1685
+ # sid,
1686
+ # audio,
1687
+ # seg_save_fn_gen,
1688
+ # times,
1689
+ # f0_up_key,
1690
+ # f0_method,
1691
+ # file_index,
1692
+ # # file_big_npy,
1693
+ # index_rate,
1694
+ # if_f0,
1695
+ # filter_radius,
1696
+ # tgt_sr,
1697
+ # resample_sr,
1698
+ # rms_mix_rate,
1699
+ # version,
1700
+ # protect,
1701
+ # crepe_hop_length,
1702
+ # f0_file=f0_file,
1703
+ # )
1704
+ # if resample_sr >= 16000 and tgt_sr != resample_sr:
1705
+ # tgt_sr = resample_sr
1706
+ # index_info = (
1707
+ # "Using index:%s." % file_index
1708
+ # if os.path.exists(file_index)
1709
+ # else "Index not used."
1710
+ # )
1711
+ # return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
1712
+ # index_info,
1713
+ # times[0],
1714
+ # times[1],
1715
+ # times[2],
1716
+ # ), seg_save_fn_gen, (tgt_sr, audio_opt)
1717
+ # except:
1718
+ # info = traceback.format_exc()
1719
+ # print(info)
1720
+ # return info, (None, None)
1721
 
1722
 
1723
 
 
2136
  [vc_output3],
2137
  )
2138
  but1.click(fn=lambda: easy_uploader.clear())
2139
+ # with gr.TabItem("TTS"):
2140
+ # app.load(update_message)
2141
+
2142
+ # # Other RVC stuff
2143
+ # with gr.Row():
2144
+ # sid0 = gr.Dropdown(label="1. Choose your model", choices=sorted(names), value=check_for_name())
2145
+ # refresh_button = gr.Button("Refresh", variant="primary")
2146
+ # if check_for_name() != '':
2147
+ # get_vc(sorted(names)[0])
2148
+ # vc_transform0 = gr.Number(label="Key Shift: 0 for no key shifted output; 12 f for output an octave higher and -12 for output an octave lower.", value=0)
2149
+ # #clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
2150
+ # spk_item = gr.Slider(
2151
+ # minimum=0,
2152
+ # maximum=2333,
2153
+ # step=1,
2154
+ # label="speaker id",
2155
+ # value=0,
2156
+ # visible=False,
2157
+ # interactive=True,
2158
+ # )
2159
+ # #clean_button.click(fn=clean, inputs=[], outputs=[sid0])
2160
+ # sid0.change(
2161
+ # fn=get_vc,
2162
+ # inputs=[sid0],
2163
+ # outputs=[spk_item],
2164
+ # )
2165
+ # but0 = gr.Button("Convert", variant="primary")
2166
+ # with gr.Row():
2167
+ # with gr.Column():
2168
+ # # with gr.Row():
2169
+ # # dropbox = gr.File(label="Drag your audio file and click refresh.")
2170
+ # with gr.Row():
2171
+ # record_button=gr.Audio(source="microphone", label="Or you can use your microphone!", type="filepath")
2172
+ # with gr.Row():
2173
+ # input_audio0 = gr.Dropdown(
2174
+ # label="2.Choose the audio file.",
2175
+ # value="./audios/calm.wav",
2176
+ # choices=audio_files
2177
+ # )
2178
+ # audio_display = gr.Audio(value=input_audio0.value, label="Selected Audio File", type="filepath")
2179
+ # # dropbox.upload(fn=save_to_wav2, inputs=[dropbox], outputs=[input_audio0])
2180
+ # # dropbox.upload(fn=change_choices2, inputs=[], outputs=[input_audio0])
2181
+ # refresh_button2 = gr.Button("Refresh", variant="primary", size='sm')
2182
+ # # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2183
+ # # value="The dogs sat at the door.",
2184
+ # # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2185
+ # record_button.change(fn=save_to_wav, inputs=[record_button], outputs=[input_audio0])
2186
+ # record_button.change(fn=change_choices2, inputs=[], outputs=[input_audio0])
2187
+ # # update audio_display
2188
+ # input_audio0.change(fn=lambda x: x, inputs=[input_audio0], outputs=[audio_display])
2189
 
2190
+ # with gr.Row():
2191
+ # # with gr.Column():
2192
+ # # input_audio = gr.Audio(label="Input Audio", type="filepath")
2193
+ # # # transcribe_btn_model = gr.Radio(value="base.en", interactive=True, label="what whisper model to download",
2194
+ # # # choices=["tiny.en", "base.en", "small.en", "medium.en", "large"],
2195
+ # # # info="VRAM usage: tiny.en 1 GB, base.en 1GB, small.en 2GB, medium.en 5GB, large 10GB.")
2196
+ # # transcribed_text = gr.Textbox(label="transcibed text + mfa",
2197
+ # # info="write down the transcript for the file, or run whisper model to get the transcript. Takes time to download whisper models on first run")
2198
+ # # transcribe_info_text = gr.TextArea(label="How to use",
2199
+ # # value="running everything for the first time will download necessary models (4GB for main encoder + model) \n load a voice and choose your whisper model, base works most of the time. \n transcription and mfa takes ~50s on a 3090 for a 7s audio clip, rerun this when uploading a new audio clip only\nchoose the END value of the cut off word \n")
2200
+ # # transcribe_btn = gr.Button(value="transcribe and create mfa")
2201
 
2202
 
2203
+ # with gr.Column():
2204
+ # target_transcript = gr.Textbox(label="target transcript")
2205
 
2206
+ # # transcribe_btn.click(fn=transcribe_btn_click, inputs=[input_audio],
2207
+ # # outputs=[transcribed_text])
2208
 
2209
 
2210
 
2211
+ # with gr.Column():
2212
 
2213
+ # output_audio_gen = gr.Audio(
2214
+ # label="Output Audio generated",
2215
+ # type='filepath',
2216
+ # interactive=False
2217
+ # )
2218
 
2219
 
2220
+ # vc_output2 = gr.Audio(
2221
+ # label="Voice converted! (Click on the three dots to download the audio)",
2222
+ # type='filepath',
2223
+ # interactive=False,
2224
+ # )
2225
 
2226
+ # #with gr.Column():
2227
+ # with gr.Accordion("Advanced TTS Settings", open=False):
2228
+ # seed = gr.Number(label='seed', interactive=True, value=1)
2229
+ # stop_repitition = gr.Radio(label="stop_repitition", interactive=True, choices=[1, 2, 3], value=3,
2230
+ # info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1")
2231
+ # sample_batch_size = gr.Radio(label="sample_batch_size", interactive=True, choices=[4, 3, 2], value=4,
2232
+ # info="if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4")
2233
+ # left_margin = gr.Number(label='left_margin', interactive=True, value=0.08, step=0.01,
2234
+ # info=" not used for TTS, only for speech editing")
2235
+ # right_margin = gr.Number(label='right_margin', interactive=True, value=0.08, step=0.01,
2236
+ # info=" not used for TTS, only for speech editing")
2237
+ # codecaudio_sr = gr.Number(label='codec_audio_sr', interactive=True, value=16000)
2238
+ # codec_sr = gr.Number(label='codec', interactive=True, value=50)
2239
+ # top_k = gr.Number(label='top_k', interactive=True, value=0)
2240
+ # top_p = gr.Number(label='top_p', interactive=True, value=0.8)
2241
+ # temperature = gr.Number(label='temperature', interactive=True, value=1)
2242
+ # kvcache = gr.Number(label='kvcache', interactive=True, value=1,
2243
+ # info='set to 0 to use less VRAM, results may be worse and slower inference')
2244
+ # silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")
2245
+ # with gr.Accordion("Index Settings", open=False):
2246
+ # #with gr.Row():
2247
 
2248
+ # file_index1 = gr.Dropdown(
2249
+ # label="3. Choose the index file (in case it wasn't automatically found.)",
2250
+ # choices=get_indexes(),
2251
+ # value=get_index(),
2252
+ # interactive=True,
2253
+ # )
2254
+ # sid0.change(fn=match_index, inputs=[sid0],outputs=[file_index1])
2255
+ # refresh_button.click(
2256
+ # fn=change_choices, inputs=[], outputs=[sid0, file_index1]
2257
+ # )
2258
+ # # file_big_npy1 = gr.Textbox(
2259
+ # # label=i18n("特征文件路径"),
2260
+ # # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2261
+ # # interactive=True,
2262
+ # # )
2263
+ # index_rate1 = gr.Slider(
2264
+ # minimum=0,
2265
+ # maximum=1,
2266
+ # label="index rate",
2267
+ # value=0,
2268
+ # interactive=True,
2269
+ # )
2270
+
2271
+ # # animate_button.click(fn=mouth, inputs=[size, face, vc_output2, faces], outputs=[animation, preview])
2272
+
2273
+ # with gr.Accordion("Advanced Options", open=False):
2274
+ # f0method0 = gr.Radio(
2275
+ # label="Optional: Change the Pitch Extraction Algorithm. Extraction methods are sorted from 'worst quality' to 'best quality'. If you don't know what you're doing, leave rmvpe.",
2276
+ # choices=["pm", "dio", "crepe-tiny", "mangio-crepe-tiny", "crepe", "harvest", "mangio-crepe", "rmvpe"], # Fork Feature. Add Crepe-Tiny
2277
+ # value="rmvpe",
2278
+ # interactive=True,
2279
+ # )
2280
 
2281
+ # crepe_hop_length = gr.Slider(
2282
+ # minimum=1,
2283
+ # maximum=512,
2284
+ # step=1,
2285
+ # label="Mangio-Crepe Hop Length. Higher numbers will reduce the chance of extreme pitch changes but lower numbers will increase accuracy. 64-192 is a good range to experiment with.",
2286
+ # value=120,
2287
+ # interactive=True,
2288
+ # visible=False,
2289
+ # )
2290
+ # f0method0.change(fn=whethercrepeornah, inputs=[f0method0], outputs=[crepe_hop_length])
2291
+ # filter_radius0 = gr.Slider(
2292
+ # minimum=0,
2293
+ # maximum=7,
2294
+ # label="label",
2295
+ # value=3,
2296
+ # step=1,
2297
+ # interactive=True,
2298
+ # )
2299
+ # resample_sr0 = gr.Slider(
2300
+ # minimum=0,
2301
+ # maximum=48000,
2302
+ # label="label",
2303
+ # value=0,
2304
+ # step=1,
2305
+ # interactive=True,
2306
+ # visible=False
2307
+ # )
2308
+ # rms_mix_rate0 = gr.Slider(
2309
+ # minimum=0,
2310
+ # maximum=1,
2311
+ # label="label",
2312
+ # value=0.21,
2313
+ # interactive=True,
2314
+ # )
2315
+ # protect0 = gr.Slider(
2316
+ # minimum=0,
2317
+ # maximum=0.5,
2318
+ # label="label",
2319
+ # value=0,
2320
+ # step=0.01,
2321
+ # interactive=True,
2322
+ # )
2323
+ # formanting = gr.Checkbox(
2324
+ # value=bool(DoFormant),
2325
+ # label="[EXPERIMENTAL] Formant shift inference audio",
2326
+ # info="Used for male to female and vice-versa conversions",
2327
+ # interactive=True,
2328
+ # visible=True,
2329
+ # )
2330
 
2331
+ # formant_preset = gr.Dropdown(
2332
+ # value='',
2333
+ # choices=get_fshift_presets(),
2334
+ # label="browse presets for formanting",
2335
+ # visible=bool(DoFormant),
2336
+ # )
2337
+ # formant_refresh_button = gr.Button(
2338
+ # value='\U0001f504',
2339
+ # visible=bool(DoFormant),
2340
+ # variant='primary',
2341
+ # )
2342
+ # #formant_refresh_button = ToolButton( elem_id='1')
2343
+ # #create_refresh_button(formant_preset, lambda: {"choices": formant_preset}, "refresh_list_shiftpresets")
2344
 
2345
+ # qfrency = gr.Slider(
2346
+ # value=Quefrency,
2347
+ # info="Default value is 1.0",
2348
+ # label="Frequency for formant shifting",
2349
+ # minimum=0.0,
2350
+ # maximum=16.0,
2351
+ # step=0.1,
2352
+ # visible=bool(DoFormant),
2353
+ # interactive=True,
2354
+ # )
2355
+ # tmbre = gr.Slider(
2356
+ # value=Timbre,
2357
+ # info="Default value is 1.0",
2358
+ # label="Timbre for formant shifting",
2359
+ # minimum=0.0,
2360
+ # maximum=16.0,
2361
+ # step=0.1,
2362
+ # visible=bool(DoFormant),
2363
+ # interactive=True,
2364
+ # )
2365
 
2366
+ # formant_preset.change(fn=preset_apply, inputs=[formant_preset, qfrency, tmbre], outputs=[qfrency, tmbre])
2367
+ # frmntbut = gr.Button("Apply", variant="primary", visible=bool(DoFormant))
2368
+ # formanting.change(fn=formant_enabled,inputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button],outputs=[formanting,qfrency,tmbre,frmntbut,formant_preset,formant_refresh_button])
2369
+ # frmntbut.click(fn=formant_apply,inputs=[qfrency, tmbre], outputs=[qfrency, tmbre])
2370
+ # formant_refresh_button.click(fn=update_fshift_presets,inputs=[formant_preset, qfrency, tmbre],outputs=[formant_preset, qfrency, tmbre])
2371
 
2372
+ # with gr.Row():
2373
+ # vc_output1 = gr.Textbox("")
2374
+ # f0_file = gr.File(label="f0 file", visible=False)
2375
+
2376
+ # # run_btn.click(fn=run,
2377
+ # # inputs=[
2378
+ # # input_audio0,
2379
+ # # seed,
2380
+ # # stop_repitition,
2381
+ # # sample_batch_size,
2382
+ # # left_margin,
2383
+ # # right_margin,
2384
+ # # codecaudio_sr,
2385
+ # # codec_sr,
2386
+ # # top_k,
2387
+ # # top_p,
2388
+ # # temperature,
2389
+ # # kvcache,
2390
+ # # cutoff_value,
2391
+ # # target_transcript,
2392
+ # # silence_tokens,
2393
+ # # transcribed_text],
2394
+ # # outputs=[
2395
+ # # output_audio_con,
2396
+ # # output_audio_gen
2397
+ # # ])
2398
 
2399
+ # # but0.click(
2400
+ # # vc_single,
2401
+ # # [
2402
+ # # spk_item,
2403
+ # # input_audio0,
2404
+ # # vc_transform0,
2405
+ # # f0_file,
2406
+ # # f0method0,
2407
+ # # file_index1,
2408
+ # # # file_index2,
2409
+ # # # file_big_npy1,
2410
+ # # index_rate1,
2411
+ # # filter_radius0,
2412
+ # # resample_sr0,
2413
+ # # rms_mix_rate0,
2414
+ # # protect0,
2415
+ # # crepe_hop_length
2416
+ # # ],
2417
+ # # [vc_output1, vc_output2],
2418
+ # # )
2419
+
2420
+ # but0.click(
2421
+ # fn=run_joint,
2422
+ # inputs=[
2423
+ # input_audio0,
2424
+ # seed,
2425
+ # stop_repitition,
2426
+ # sample_batch_size,
2427
+ # left_margin,
2428
+ # right_margin,
2429
+ # codecaudio_sr,
2430
+ # codec_sr,
2431
+ # top_k,
2432
+ # top_p,
2433
+ # temperature,
2434
+ # kvcache,
2435
+ # target_transcript,
2436
+ # silence_tokens,
2437
+ # spk_item,
2438
+ # vc_transform0,
2439
+ # f0_file,
2440
+ # f0method0,
2441
+ # file_index1,
2442
+ # # file_index2,
2443
+ # # file_big_npy1,
2444
+ # index_rate1,
2445
+ # filter_radius0,
2446
+ # resample_sr0,
2447
+ # rms_mix_rate0,
2448
+ # protect0,
2449
+ # crepe_hop_length
2450
+ # ],
2451
+ # outputs=[vc_output1, output_audio_gen, vc_output2])
2452
 
2453
+ # with gr.Accordion("Batch Conversion",open=False, visible=False):
2454
+ # with gr.Row():
2455
+ # with gr.Column():
2456
+ # vc_transform1 = gr.Number(
2457
+ # label="speaker id", value=0
2458
+ # )
2459
+ # opt_input = gr.Textbox(label="opt", value="opt")
2460
+ # f0method1 = gr.Radio(
2461
+ # label="f0 method",
2462
+ # choices=["pm", "harvest", "crepe", "rmvpe"],
2463
+ # value="rmvpe",
2464
+ # interactive=True,
2465
+ # )
2466
+ # filter_radius1 = gr.Slider(
2467
+ # minimum=0,
2468
+ # maximum=7,
2469
+ # label="harvest",
2470
+ # value=3,
2471
+ # step=1,
2472
+ # interactive=True,
2473
+ # )
2474
+ # with gr.Column():
2475
+ # file_index3 = gr.Textbox(
2476
+ # label="file index",
2477
+ # value="",
2478
+ # interactive=True,
2479
+ # )
2480
+ # file_index4 = gr.Dropdown(
2481
+ # label="index path (dropdown)",
2482
+ # choices=sorted(index_paths),
2483
+ # interactive=True,
2484
+ # )
2485
+ # refresh_button.click(
2486
+ # fn=lambda username: change_choices(username)[1],
2487
+ # inputs=[gr.State('username')],
2488
+ # outputs=file_index4,
2489
+ # )
2490
+ # # file_big_npy2 = gr.Textbox(
2491
+ # # label=i18n("特征文件路径"),
2492
+ # # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
2493
+ # # interactive=True,
2494
+ # # )
2495
+ # index_rate2 = gr.Slider(
2496
+ # minimum=0,
2497
+ # maximum=1,
2498
+ # label="index rate 2",
2499
+ # value=1,
2500
+ # interactive=True,
2501
+ # )
2502
+ # with gr.Column():
2503
+ # resample_sr1 = gr.Slider(
2504
+ # minimum=0,
2505
+ # maximum=48000,
2506
+ # label="resample rate",
2507
+ # value=0,
2508
+ # step=1,
2509
+ # interactive=True,
2510
+ # )
2511
+ # rms_mix_rate1 = gr.Slider(
2512
+ # minimum=0,
2513
+ # maximum=1,
2514
+ # label="rms mix rate",
2515
+ # value=1,
2516
+ # interactive=True,
2517
+ # )
2518
+ # protect1 = gr.Slider(
2519
+ # minimum=0,
2520
+ # maximum=0.5,
2521
+ # label="protection rate",
2522
+ # value=0.33,
2523
+ # step=0.01,
2524
+ # interactive=True,
2525
+ # )
2526
+ # with gr.Column():
2527
+ # dir_input = gr.Textbox(
2528
+ # label="directory input",
2529
+ # value="E:\codes\py39\\test-20230416b\\todo-songs",
2530
+ # )
2531
+ # inputs = gr.File(
2532
+ # file_count="multiple", label="input"
2533
+ # )
2534
+ # with gr.Row():
2535
+ # format1 = gr.Radio(
2536
+ # label="output format",
2537
+ # choices=["wav", "flac", "mp3", "m4a"],
2538
+ # value="flac",
2539
+ # interactive=True,
2540
+ # )
2541
+ # but1 = gr.Button("primary", variant="primary")
2542
+ # vc_output3 = gr.Textbox(label="label")
2543
+ # but1.click(
2544
+ # vc_multi,
2545
+ # [
2546
+ # spk_item,
2547
+ # dir_input,
2548
+ # opt_input,
2549
+ # inputs,
2550
+ # vc_transform1,
2551
+ # f0method1,
2552
+ # file_index3,
2553
+ # file_index4,
2554
+ # # file_big_npy2,
2555
+ # index_rate2,
2556
+ # filter_radius1,
2557
+ # resample_sr1,
2558
+ # rms_mix_rate1,
2559
+ # protect1,
2560
+ # format1,
2561
+ # crepe_hop_length,
2562
+ # ],
2563
+ # [vc_output3],
2564
+ # )
2565
+ # but1.click(fn=lambda: easy_uploader.clear())
2566
  with gr.TabItem("Download Voice Models"):
2567
  with gr.Row():
2568
  url=gr.Textbox(label="Huggingface Link:")
requirements.txt CHANGED
@@ -17,12 +17,4 @@ mega.py
  gdown==5.1.0
  onnxruntime
  pyngrok==4.1.12
- xformers==0.0.22
- torchaudio==2.0.2
- torch==2.0.1 # this assumes your system is compatible with CUDA 11.7, otherwise checkout https://pytorch.org/get-started/previous-versions/#v201
- tensorboard==2.16.2
- phonemizer==3.2.1
- datasets==2.16.0
- torchmetrics==0.11.1
- whisperx @ git+https://github.com/m-bain/whisperx.git
  # install MFA for getting forced-alignment, this could take a few minutes