Spaces:

SpeechTek
/

FAMA-demo

Running

FAMA-demo / app.py

Lorenzoncina

ST feedbacks implemented

927d6f8 3 days ago

4.89 kB

	"""
	Description:
	This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK.

	Dependencies:
	all the necessary dependencies are listed in requirements.txt

	Usage:
	The demo can be run locally by installing all the necessary dependencies in a Python virtual env, or it can be run in a Hugging Face Space

	Author: Lorenzo Concina
	Date: 4/6/2025
	"""
	import os
	import torch
	import librosa as lb
	import gradio as gr
	from transformers import AutoProcessor, pipeline
	from datasets import load_dataset

	def load_fama(model_id, input_lang, task_type):
	    """Build the speech pipeline for one FAMA model, task and language.

	    Args:
	        model_id: Hugging Face model identifier, e.g. "FBK-MT/fama-small".
	        input_lang: Language code of the spoken utterance ("en" or "it").
	        task_type: "ASR" to transcribe in the utterance language, or "ST"
	            to translate into the other supported language.

	    Returns:
	        A transformers "automatic-speech-recognition" pipeline whose
	        generation is forced to start with the output-language tag.

	    Raises:
	        ValueError: If no output language can be derived from the given
	            task_type / input_lang combination.
	    """
	    # Select the output language depending on the utterance language and the
	    # task type. This is done before anything is loaded so that an
	    # unsupported request fails fast instead of silently forcing a
	    # meaningless "<lang:>" tag.
	    if task_type == "ASR":
	        output_lang = input_lang
	    elif task_type == "ST" and input_lang == "it":
	        output_lang = "en"
	    elif task_type == "ST" and input_lang == "en":
	        output_lang = "it"
	    else:
	        raise ValueError(
	            "Unsupported task/language combination: task_type={!r}, input_lang={!r}".format(
	                task_type, input_lang
	            )
	        )

	    processor = AutoProcessor.from_pretrained(model_id)

	    device = "cuda:0" if torch.cuda.is_available() else "cpu"

	    # Force the model to start with the language tag
	    lang_tag = "<lang:{}>".format(output_lang)
	    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

	    generate_kwargs = {
	        "num_beams": 5,
	        "no_repeat_ngram_size": 5,
	        "forced_bos_token_id": lang_tag_id,
	    }

	    pipe = pipeline(
	        "automatic-speech-recognition",
	        model=model_id,
	        trust_remote_code=True,
	        torch_dtype=torch.float32,
	        device=device,
	        return_timestamps=False,
	        generate_kwargs=generate_kwargs,
	        chunk_length_s=60,
	        stride_length_s=1,
	    )
	    return pipe

	def load_audio_file(audio_path):
	    """Load the audio file at ``audio_path`` with librosa and return its samples.

	    librosa is asked for a mono signal at a 16000 Hz sampling rate; the
	    sampling rate it reports back is discarded.
	    """
	    waveform, _ = lb.load(audio_path, mono=True, sr=16000)
	    return waveform

	def transcribe(audio, task_type, model_id, input_lang):
	    """Run model inference on one audio sample (Gradio click callback).

	    Args:
	        audio: Value delivered by the audio component; normally the path of
	            the uploaded or recorded file, or None when nothing was provided.
	        task_type: "ASR" or "ST".
	        model_id: Hugging Face identifier of the FAMA model to use.
	        input_lang: Language code of the utterance.

	    Returns:
	        The text produced by the model.

	    Raises:
	        gr.Error: If no audio was provided, so the UI shows a readable
	            message instead of a generic failure.
	    """
	    # Check the input before the (expensive) model load.
	    if audio is None:
	        raise gr.Error("Please upload or record an audio sample first.")

	    # Deliberately not called `pipeline`: that name is the factory function
	    # imported from transformers at module level.
	    asr_pipe = load_fama(model_id, input_lang, task_type)

	    if isinstance(audio, str) and os.path.isfile(audio):
	        # Load the audio with librosa
	        utterance = load_audio_file(audio)
	        result = asr_pipe(utterance)
	    else:
	        # Not a path to an existing file: hand the value to the pipeline as is
	        result = asr_pipe(audio)
	    return result["text"]


	def update_model_options(task_type):
	    """Return the component updates matching the selected task.

	    For "ST" only the two general models are offered and the labels read
	    "Translate" / "Translation"; for any other value the two ASR-only
	    models are offered as well and the labels read "Transcribe" /
	    "Transcription".

	    Returns:
	        A 3-tuple of ``gr.update`` objects: model choices (with the default
	        selection), button text, and output textbox label.
	    """
	    translating = task_type == "ST"

	    # The general models are always available; the "-asr" variants are
	    # only offered outside of translation.
	    models = ["FBK-MT/fama-small", "FBK-MT/fama-medium"]
	    if not translating:
	        models += ["FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr"]

	    action = "Translate" if translating else "Transcribe"
	    result_label = "Translation" if translating else "Transcription"

	    model_update = gr.update(choices=models, value="FBK-MT/fama-small")
	    button_update = gr.update(value=action)
	    textbox_update = gr.update(label=result_label)
	    return model_update, button_update, textbox_update


	# Language options (languages supported by FAMA models), shown in the
	# "Utterance Language" dropdown.
	language_choices = ["en", "it"]


	if __name__ == "__main__":

	    with gr.Blocks() as iface:
	        # Adjacent string literals are used instead of a triple-quoted block
	        # so that source indentation never ends up inside the Markdown text.
	        gr.Markdown(
	            "## FAMA ASR and ST\n"
	            "Simple Automatic Speech Recognition and Speech Translation demo for English and Italian powered by FAMA models, developed at FBK. "
	            "More information about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e"
	        )

	        # Inputs: the audio sample, its language, the task and the model.
	        audio_input = gr.Audio(type="filepath", label="Upload or record audio")

	        lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language")
	        task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")

	        model_input = gr.Radio(
	            choices=[
	                "FBK-MT/fama-small",
	                "FBK-MT/fama-medium",
	                "FBK-MT/fama-small-asr",
	                "FBK-MT/fama-medium-asr",
	            ],
	            value="FBK-MT/fama-small",
	            label="Select a FAMA model",
	        )

	        output = gr.Textbox(label="Transcription")

	        transcribe_btn = gr.Button("Transcribe")

	        # Dynamically update the model choices, the button text and the
	        # output label when the task changes.
	        task_type_input.change(
	            fn=update_model_options,
	            inputs=task_type_input,
	            outputs=[model_input, transcribe_btn, output],
	        )

	        transcribe_btn.click(
	            fn=transcribe,
	            inputs=[audio_input, task_type_input, model_input, lang_input],
	            outputs=output,
	        )

	        # Each step is separated by a blank line so that Markdown renders it
	        # as its own paragraph.
	        gr.Markdown(
	            " ### Instructions: \n\n"
	            "1 - Load an audio file or record yourself talking with a microphone \n\n"
	            "2 - Specify the language of the utterance (FAMA supports English and Italian)\n\n"
	            "3 - Select the task to run: Speech recognition or Speech Translation. \n\n"
	            "4 - Select a FAMA model among the available ones \n\n"
	            "5 - Click on Transcribe/Translate\n"
	        )

	    iface.launch()