from transformers import pipeline
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
import torch
import tempfile
asr = pipeline("automatic-speech-recognition", model="distil-whisper/distil-small.en")  # English speech-to-text
tr = pipeline("translation", model="facebook/nllb-200-distilled-600M", torch_dtype=torch.bfloat16)  # text translation model
narrator = pipeline("text-to-speech", model="facebook/mms-tts-spa")  # Spanish text-to-speech
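# Pipeline chain: English speech -> English text (distil-whisper) -> Spanish text (NLLB-200) -> Spanish speech (MMS-TTS).
# torch_dtype=torch.bfloat16 loads the 600M-parameter translator in half precision to cut its memory footprint.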
demo = gr.Blocks()
def transcribe_long_form(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return None
    audio, sampling_rate = sf.read(filepath)  # read the recorded/uploaded audio file
    # Collapse stereo to mono: soundfile returns (samples, channels), while librosa.to_mono
    # expects (channels, samples), so transpose before mixing the channels down.
    audio_transposed = np.transpose(audio)
    audio_mono = librosa.to_mono(audio_transposed)
    # Resample to 16 kHz, the sampling rate the ASR model expects
    audio_16KHz = librosa.resample(audio_mono,
                                   orig_sr=sampling_rate,
                                   target_sr=16000)
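    # The ASR pipeline below handles long-form audio by splitting it into 30-second windows
    # (chunk_length_s=30) and transcribing up to 12 windows per forward pass (batch_size=12).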
    output = asr(
        audio_16KHz,
        max_new_tokens=256,
        chunk_length_s=30,
        batch_size=12,
    )
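    # NLLB-200 expects FLORES-200 language codes, e.g. "eng_Latn" (English) and "spa_Latn" (Spanish)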
    text_translated = tr(output["text"],
                         src_lang="eng_Latn",
                         tgt_lang="spa_Latn")
    completed_translation = text_translated[0]['translation_text']
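    # The TTS pipeline returns {'audio': waveform, 'sampling_rate': rate}; the waveform carries a
    # leading batch dimension, hence the [0] index when writing it to disk below.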
    narrated_text = narrator(completed_translation)
    # Save the narrated audio to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        sf.write(tmpfile.name, narrated_text['audio'][0], narrated_text['sampling_rate'])
    return tmpfile.name
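# Both interfaces below use gr.Audio as the output component; it accepts the filepath string
# returned by transcribe_long_form and plays the generated Spanish audio in the browser.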
mic_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources=["microphone"],
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto")
file_transcribe = gr.Interface(
    fn=transcribe_long_form,
    inputs=gr.Audio(sources=["upload"],
                    type="filepath"),
    outputs=gr.Audio(label="Translated Audio"),
    flagging_mode="auto",
)
with demo:
    gr.TabbedInterface(
        [mic_transcribe,
         file_transcribe],
        ["Transcribe Microphone",
         "Transcribe Audio File"],
    )

demo.launch()