""" Description: This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK. Dependencies: all the necessary dependencies are listed in requirements.txt Usage: The demo can be runned locally by installing all necessary dependencies in a python virtual env or it can be run in an HuggingFace Space Author: Lorenzo Concina Date: 4/6/2025 """ import os import torch import librosa as lb import gradio as gr from transformers import AutoProcessor, pipeline from datasets import load_dataset def load_fama(model_id, input_lang, task_type): processor = AutoProcessor.from_pretrained(model_id) device = "cuda:0" if torch.cuda.is_available() else "cpu" tgt_lang = "it" #select the right lang depending by Utterance lang and Task type output_lang = "" if task_type == "ASR": output_lang = input_lang elif task_type == "ST" and input_lang == "it": output_lang = "en" elif task_type == "ST" and input_lang == "en": output_lang = "it" # Force the model to start with the language tag lang_tag = "".format(output_lang) lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag) generate_kwargs = {"num_beams": 5, "no_repeat_ngram_size": 5, "forced_bos_token_id": lang_tag_id} pipe = pipeline( "automatic-speech-recognition", model=model_id, trust_remote_code=True, torch_dtype=torch.float32, device=device, return_timestamps=False, generate_kwargs=generate_kwargs, chunk_length_s=60, stride_length_s=1 ) return pipe def load_audio_file(audio_path): y, sr = lb.load(audio_path, sr=16000, mono=True) return y def transcribe(audio, task_type, model_id, input_lang): """ Function called by gradio interface. It runs model inference on an audio sample """ pipeline = load_fama(model_id, input_lang, task_type) if isinstance(audio, str) and os.path.isfile(audio): #load the audio with Librosa utterance = load_audio_file(audio) result = pipeline(utterance) else: #user used the mic result = pipeline(audio) return result["text"] def update_model_options(task_type): if task_type == "ST": model_choices = ["FBK-MT/fama-small", "FBK-MT/fama-medium"] default_model = "FBK-MT/fama-small" button_label = "Translate" textbox_label = "Translation" else: model_choices = [ "FBK-MT/fama-small", "FBK-MT/fama-medium", "FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr" ] default_model = "FBK-MT/fama-small" button_label = "Transcribe" textbox_label = "Transcription" return ( gr.update(choices=model_choices, value=default_model), gr.update(value=button_label), gr.update(label=textbox_label) ) # Language options (languages supported by FAMA models) language_choices = ["en", "it"] if __name__ == "__main__": with gr.Blocks() as iface: gr.Markdown("""## FAMA ASR and ST\nSimple Automatic Speech Recognition and Speech Translation demo for English and Italian powered by FAMA models, developed at FBK. \ More informations about FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e""") #with gr.Row(): audio_input = gr.Audio(type="filepath", label="Upload or record audio") #task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type") lang_input = gr.Dropdown(choices=language_choices, value="it", label="Utterance Language") task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type") model_input = gr.Radio(choices=[ "FBK-MT/fama-small", "FBK-MT/fama-medium", "FBK-MT/fama-small-asr", "FBK-MT/fama-medium-asr" ], value="FBK-MT/fama-small", label="Select a FAMA model") output = gr.Textbox(label="Transcription") transcribe_btn = gr.Button("Transcribe") #Dinamically change object when task changes task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=[model_input, transcribe_btn, output]) transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output) gr.Markdown(""" ### Instructions: \n 1 - Load an audio file or record yourself talking with a microphone \n 2 - Specify the language of the utterance (FAMA supports English and Italian)\n 3 - Select the task to run: Speech recognition or Speech Translation. \n 4 - Select a FAMA model among the available ones \n 4 - Click on Transcribe/Translate """) iface.launch()