Build error
remove app.ver1.py app.whisper.fine_tuned.py
- app.ver1.py +0 -72
- app.whisper.fine_tuned.py +0 -272
app.ver1.py
DELETED
@@ -1,72 +0,0 @@
-#TODO:
-# + [x] Load Configuration
-# + [ ] Checking
-# + [ ] Better saving directory
-
-from pathlib import Path
-from transformers import pipeline
-import torch.nn as nn
-import torch
-import torchaudio
-import gradio as gr
-import sys
-
-# Local imports
-sys.path.append("src")
-from espnet2.bin.tts_inference import Text2Speech
-from espnet2.utils.types import str_or_none
-
-# Check if GPU is available
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-# ASR part
-
-data_path = "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-audio_files = sorted(list(Path(data_path).glob("**/*wav")))
-# audio_files = sorted(list(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav")))
-
-transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_train_dev_test_seed_1")
-
-# TTS part
-def load_model(lang, tag, vocoder_tag):
-    if lang == "Japanese":
-        if tag == "kan-bayashi/ljspeech_parallel_wavegan":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_parallel_wavegan")
-        elif tag == "kan-bayashi/ljspeech_merlin_multi_band_melgan":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/ljspeech_merlin_multi_band_melgan")
-        else:
-            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
-        vocoder = None if vocoder_tag == "none" else vocoder_tag
-    elif lang == "English":
-        # VITS needs no vocoder; others do
-        if tag == "kan-bayashi/libritts_xvector_vits":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/libritts_xvector_vits")
-            vocoder = None
-        elif tag == "kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3":
-            tts_model = Text2Speech.from_pretrained("kan-bayashi/fastspeech2_en_libritts_guessspeaker_melgan.v3")
-            vocoder = "melgan"
-        else:
-            raise ValueError(f"Not supported: lang={lang}, tag={tag}")
-    else:
-        raise ValueError(f"Not supported: lang={lang}")
-    return tts_model, vocoder
-
-tts_model, vocoder_tag = load_model(lang="English", tag="kan-bayashi/libritts_xvector_vits", vocoder_tag="parallel_wavegan/vctk_parallel_wavegan.v1.long")
-tts_model = tts_model.to(device)
-
-vocoder = None if vocoder_tag == "none" else torchaudio.models.vocoder.from_pretrained(vocoder_tag).to(device)
-
-# Gradio part
-def synthesize(text):
-    with torch.no_grad():
-        # Text-to-speech
-        wav = tts_model(text)[0]
-        if vocoder is not None:
-            # Apply vocoder
-            wav = vocoder.inference(wav)
-        # Convert to numpy array
-        wav = wav.squeeze().cpu().numpy()
-    return wav
-
-interface = gr.Interface(synthesize, inputs="text", outputs="audio")
-interface.launch()
app.whisper.fine_tuned.py
DELETED
@@ -1,272 +0,0 @@
-"""
-TODO:
-+ [x] Load Configuration
-+ [ ] Checking
-+ [ ] Better saving directory
-"""
-import numpy as np
-from pathlib import Path
-import torch.nn as nn
-import torch
-import torchaudio
-from transformers import pipeline
-from pathlib import Path
-
-# local import
-import sys
-from espnet2.bin.tts_inference import Text2Speech
-from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC# pdb.set_trace()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-sys.path.append("src")
-
-import gradio as gr
-
-# ASR part
-
-audio_files = [
-    str(x)
-    for x in sorted(
-        Path(
-            "/home/kevingeng/Disk2/laronix/laronix_automos/data/20230103_video"
-        ).glob("**/*wav")
-    )
-]
-# audio_files = [str(x) for x in sorted(Path("./data/Patient_sil_trim_16k_normed_5_snr_40/Rainbow").glob("**/*wav"))]
-# transcriber = pipeline(
-#     "automatic-speech-recognition",
-#     model="KevinGeng/PAL_John_128_train_dev_test_seed_1",
-# )
-
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-
-processor = AutoProcessor.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained("KevinGeng/whipser_medium_en_PAL300_step25")
-
-# feature_extractor = AutoFeatureExtractor.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-# representation_model = AutoModelForCTC.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-# tokenizer = AutoTokenizer.from_pretrained("KevinGeng/PAL_John_128_train_dev_test_seed_1")
-
-transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/whipser_medium_en_PAL300_step25")
-# transcriber = pipeline("automatic-speech-recognition", model="KevinGeng/PAL_John_128_p326_300_train_dev_test_seed_1")
-# 【Female】kan-bayashi ljspeech parallel wavegan
-# tts_model = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits")
-# 【Male】fastspeech2-en-200_speaker-cv4, hifigan vocoder
-# pdb.set_trace()
-
-# @title English multi-speaker pretrained model { run: "auto" }
-lang = "English"
-tag = "kan-bayashi/libritts_xvector_vits"
-# vits needs no
-vocoder_tag = "parallel_wavegan/vctk_parallel_wavegan.v1.long" # @param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}
-from espnet2.bin.tts_inference import Text2Speech
-from espnet2.utils.types import str_or_none
-
-text2speech = Text2Speech.from_pretrained(
-    model_tag=str_or_none(tag),
-    vocoder_tag=str_or_none(vocoder_tag),
-    device="cuda",
-    use_att_constraint=False,
-    backward_window=1,
-    forward_window=3,
-    speed_control_alpha=1.0,
-)
-
-import glob
-import os
-import numpy as np
-import kaldiio
-
-# Get model directory path
-from espnet_model_zoo.downloader import ModelDownloader
-
-d = ModelDownloader()
-model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])
-
-# Speaker x-vector selection
-
-xvector_ark = [
-    p
-    for p in glob.glob(
-        f"xvector/test-clean/spk_xvector.ark", recursive=True
-    )
-    if "test" in p
-][0]
-xvectors = {k: v for k, v in kaldiio.load_ark(xvector_ark)}
-spks = list(xvectors.keys())
-
-male_spks = {
-    "Male1": "2300_131720",
-    "Male2": "1320_122612",
-}
-# "M3": "1188_133604",
-# "M4": "61_70970",
-female_spks = {"Female1": "2961_961", "Female2": "8463_287645", }
-# "F3": "121_121726"
-spks = dict(male_spks, **female_spks)
-spk_names = sorted(spks.keys())
-
-
-## 20230224 Mousa: No reference,
-def ASRTTS(audio_file, spk_name, ref_text=""):
-    spk = spks[spk_name]
-    spembs = xvectors[spk]
-    if ref_text == "":
-        reg_text = transcriber(audio_file)["text"]
-    else:
-        reg_text = ref_text
-
-    speech, sr = torchaudio.load(
-        audio_file, channels_first=True
-    ) # Mono channel
-    wav_tensor_spembs = text2speech(
-        text=reg_text, speech=speech, spembs=spembs
-    )["wav"]
-    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-    sample_rate = 22050
-    save_id = (
-        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-    )
-    torchaudio.save(
-        save_id,
-        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-        sample_rate=22050,
-    )
-
-    return save_id, reg_text
-
-
-def ASRTTS_clean(audio_file, spk_name):
-    spk = spks[spk_name]
-    spembs = xvectors[spk]
-
-    reg_text = transcriber(audio_file)["text"]
-
-    speech, sr = torchaudio.load(
-        audio_file, channels_first=True
-    ) # Mono channel
-    wav_tensor_spembs = text2speech(
-        text=reg_text, speech=speech, spembs=spembs
-    )["wav"]
-    wav_numpy = wav_tensor_spembs.unsqueeze(1).to("cpu")
-    sample_rate = 22050
-    save_id = (
-        "./wav/" + Path(audio_file).stem + "_" + spk_name + "_spkembs.wav"
-    )
-    torchaudio.save(
-        save_id,
-        src=wav_tensor_spembs.unsqueeze(0).to("cpu"),
-        sample_rate=22050,
-    )
-    return save_id
-
-
-reference_textbox = gr.Textbox(
-    value="",
-    placeholder="Input reference here",
-    label="Reference",
-)
-
-recognization_textbox = gr.Textbox(
-    value="",
-    placeholder="Output recognization here",
-    label="recognization_textbox",
-)
-
-speaker_option = gr.Radio(choices=spk_names, label="Speaker")
-
-input_audio = gr.Audio(
-    source="upload", type="filepath", label="Audio_to_Evaluate"
-)
-output_audio = gr.Audio(
-    source="upload", file="filepath", label="Synthesized Audio"
-)
-examples = [
-    ["./samples/001.wav", "M1", ""],
-    ["./samples/002.wav", "M2", ""],
-    ["./samples/003.wav", "F1", ""],
-    ["./samples/004.wav", "F2", ""],
-]
-
-
-def change_audiobox(choice):
-    if choice == "upload":
-        input_audio = gr.Audio.update(source="upload", visible=True)
-    elif choice == "microphone":
-        input_audio = gr.Audio.update(source="microphone", visible=True)
-    else:
-        input_audio = gr.Audio.update(visible=False)
-    return input_audio
-
-
-def show_icon(choice):
-    if choice == "Male1":
-        spk_icon = gr.Image.update(value="speaker_icons/male1.png", visible=True)
-    elif choice == "Male2":
-        spk_icon = gr.Image.update(value="speaker_icons/male2.png", visible=True)
-    elif choice == "Female1":
-        spk_icon = gr.Image.update(value="speaker_icons/female1.png", visible=True)
-    elif choice == "Female2":
-        spk_icon = gr.Image.update(value="speaker_icons/female2.png", visible=True)
-    return spk_icon
-
-def get_download_file(audio_file=None):
-    if audio_file == None:
-        output_audio_file = gr.File.update(visible=False)
-    else:
-        output_audio_file = gr.File.update(visible=True)
-    return output_audio_file
-
-def download_file(audio_file):
-    return gr.File(value=audio_file)
-# pdb.set_trace()
-
-with gr.Blocks(
-    analytics_enabled=False,
-    css=".gradio-container {background-color: #78BD91}",
-) as demo:
-    with gr.Column(elem_id="Column"):
-        input_format = gr.Radio(
-            choices=["microphone", "upload"], label="Choose your input format", elem_id="input_format"
-        )
-        input_audio = gr.Audio(
-            source="microphone",
-            type="filepath",
-            label="Input Audio",
-            interactive=True,
-            visible=False,
-            elem_id="input_audio"
-        )
-        input_format.change(
-            fn=change_audiobox, inputs=input_format, outputs=input_audio
-        )
-
-        speaker_option = gr.Radio(choices=spk_names, value="Male1", label="Choose your voice profile")
-        spk_icon = gr.Image(value="speaker_icons/male1.png",
-                            type="filepath",
-                            image_mode="RGB",
-                            source="upload",
-                            shape=[50, 50],
-                            interactive=True,
-                            visible=True)
-        speaker_option.change(
-            fn=show_icon, inputs=speaker_option, outputs=spk_icon
-        )
-
-        b2 = gr.Button("Convert")
-
-        output_audio = gr.Audio(
-            source="upload", file="filepath", label="Converted Audio", interactive=False
-        )
-
-        b2.click(
-            ASRTTS_clean,
-            inputs=[input_audio, speaker_option],
-            outputs=output_audio,
-            api_name="convert"
-        )
-
-# download_file("wav/001_F1_spkembs.wav")
-
-demo.launch(share=False)
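Note: gr.Audio takes no file= keyword (path mode is selected with type="filepath"), so both output_audio constructions in this file would raise a TypeError when the Space starts, which fits the "Build error" this commit reverts. A minimal corrected sketch of that component, assuming the Gradio 3.x API used elsewhere in the file:

import gradio as gr

# type="filepath" (not file="filepath") makes the component pass file paths,
# matching what ASRTTS_clean returns (the path of the saved wav).
output_audio = gr.Audio(
    type="filepath",
    label="Converted Audio",
    interactive=False,
)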