tonyliu404's picture
Update app.py
2489c1a verified
from transformers import pipeline
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
from IPython.display import Audio as IPythonAudio
import torch
import tempfile
# Speech-to-text model (Distil-Whisper small, English checkpoint).
asr = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)

# Text translation model; source and target languages are passed per call.
tr = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    torch_dtype=torch.bfloat16,
)

# Text-to-speech model for Spanish.
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-spa",
)

# Top-level Gradio container that the tabbed interfaces are mounted into.
demo = gr.Blocks()
def transcribe_long_form(filepath):
    """Turn spoken English in an audio file into spoken Spanish.

    The recording is down-mixed to mono, resampled to 16 kHz, transcribed
    with ``asr``, translated from English to Spanish with ``tr`` and then
    synthesised with ``narrator``.

    Args:
        filepath: Path of the recorded or uploaded audio file, or ``None``
            when no audio was supplied.

    Returns:
        Path of a temporary ``.wav`` file containing the Spanish narration,
        or ``None`` (after showing a UI warning) when there is nothing to
        translate. The temporary file is created with ``delete=False`` and
        is not removed by this function.
    """
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        # None leaves the output Audio component empty; an empty string
        # would be interpreted as a file path.
        return None

    audio, sampling_rate = sf.read(filepath)

    # Transpose so the channel axis comes first, then collapse stereo
    # (left/right channels) into a single channel.
    audio_mono = librosa.to_mono(np.transpose(audio))

    # Resample to the 16 kHz rate the speech-recognition model is fed at.
    audio_16khz = librosa.resample(
        audio_mono,
        orig_sr=sampling_rate,
        target_sr=16000,
    )

    output = asr(
        audio_16khz,
        max_new_tokens=256,
        chunk_length_s=30,
        batch_size=12,
    )

    transcript = output["text"]
    if not transcript.strip():
        # Nothing was recognised; skip translation and synthesis instead of
        # feeding them an empty string.
        gr.Warning("No speech detected, please retry.")
        return None

    text_translated = tr(
        transcript,
        src_lang="eng_Latn",
        tgt_lang="spa_Latn",
    )
    completed_translation = text_translated[0]["translation_text"]

    narrated = narrator(completed_translation)
    # Drop length-1 axes: identical to the previous ``[0]`` for a (1, n)
    # array, and also safe if the pipeline already returns a 1-D waveform
    # (NOTE(review): confirm output shape for the installed transformers).
    waveform = np.squeeze(narrated["audio"])

    # Reserve a path and close the handle before writing; reopening a
    # still-open NamedTemporaryFile by name is not portable (fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        output_path = tmpfile.name
    sf.write(output_path, waveform, narrated["sampling_rate"])
    return output_path
# Microphone tab: record audio, run the pipeline, play back the result.
mic_input = gr.Audio(sources="microphone", type="filepath")
mic_output = gr.Audio(label="Translated Audio")
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=mic_input,
    outputs=mic_output,
    flagging_mode="auto",
)
# Upload tab: same pipeline, but the audio comes from an uploaded file.
upload_input = gr.Audio(sources="upload", type="filepath")
upload_output = gr.Audio(label="Translated Audio")
file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=upload_input,
    outputs=upload_output,
    flagging_mode="auto",
)
# Mount both interfaces as tabs inside the shared Blocks container.
tab_interfaces = [mic_transcribe, file_transcribe]
tab_titles = ["Transcribe Microphone", "Transcribe Audio File"]
with demo:
    gr.TabbedInterface(tab_interfaces, tab_titles)

# Start the Gradio server.
demo.launch()