# TurkmenTTSweSTT / app.py
# Commit 757a712 (Akmyradov): "Duplicate from facebook/MMS"
# File size: 2.81 kB
import gradio as gr
import librosa
from asr import transcribe
from tts import synthesize, TTS_EXAMPLES
# Per-task language tables: ALL_LANGUAGES[task][iso_code] = language name.
# Each table is read from data/<task>/all_langs.tsv, one "<iso> <name>" entry
# per line, split on the first space.
ALL_LANGUAGES = {}

for task in ["asr", "tts", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    # Explicit encoding so names decode identically regardless of locale.
    # NOTE(review): assumes the language lists are UTF-8 encoded - confirm.
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            # Drop the line terminator; otherwise every name carries a
            # trailing "\n" into the dropdown entries and result labels.
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate blank lines (e.g. a trailing one at end of file).
                continue
            iso, name = line.split(" ", 1)
            ALL_LANGUAGES[task][iso] = name
def identify(microphone, file_upload):
    """Return language-identification scores for a recorded or uploaded clip.

    Args:
        microphone: File path of the microphone recording, or None.
        file_upload: File path of the uploaded audio, or None.

    Returns:
        A dict mapping "<iso>: <language name>" labels to scores, or an
        error string when neither input was provided.

    NOTE(review): the scores are a hard-coded placeholder. The audio is
    decoded but its samples are not consumed by anything in this function.
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # When both inputs are supplied the microphone recording wins and the
    # uploaded file is ignored.
    audio_fp = microphone if microphone is not None else file_upload

    # Decode to mono at the LID sampling rate. The samples are unused for
    # now, but the call is kept so unreadable audio still raises here.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    # Placeholder scores keyed by ISO code.
    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}
    lid_names = ALL_LANGUAGES["lid"]
    return {iso + ": " + lid_names[iso]: score for iso, score in raw_output.items()}
# Top-level Blocks container; the tabbed interface is rendered inside it and
# it is the object that gets launched at the end of the file.
demo = gr.Blocks()
# Speech-to-text tab: a microphone recording or an uploaded file, plus a
# language choice, are passed to `transcribe`; the result is shown as text.
_asr_language_choices = [
    f"{iso}: {name}" for iso, name in ALL_LANGUAGES["asr"].items()
]
mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            _asr_language_choices,
            label="Language",
            value="eng: English",
        ),
    ],
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)
# Text-to-speech tab: input text, a language choice and a speed factor are
# passed to `synthesize`; the outputs are the generated audio and the text
# that remained after out-of-vocabulary characters were removed.
_tts_language_choices = [
    f"{iso}: {name}" for iso, name in ALL_LANGUAGES["tts"].items()
]
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            _tts_language_choices,
            label="Language",
            value="eng: English",
        ),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)
# Language-identification tab: a microphone recording or an uploaded file is
# passed to `identify`; the scores are shown as a label widget limited to the
# ten highest-scoring classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    # Fixed user-facing typo: "Identity" -> "Identify".
    description="Identify the language of audio!",
    allow_flagging="never",
)
# Tab title paired with the interface shown under it, in display order.
_TABS = (
    ("Speech-to-text", mms_transcribe),
    ("Text-to-speech", mms_synthesize),
    ("Language Identification", mms_identify),
)

# Render the three interfaces as tabs inside the Blocks container, then
# start the app.
with demo:
    gr.TabbedInterface(
        [interface for _, interface in _TABS],
        [title for title, _ in _TABS],
    )

demo.launch()