from transformers import pipeline
import gradio as gr
import string

# ASR pipeline with word-level timestamps, so every occurrence of a word can
# be located in the audio.
pipe = pipeline(model="matteocirca/whisper-small-it-2", return_timestamps="word")

# Cache the last transcription so searching for another word in the same
# audio does not re-run the model.
current_audio = None
segments = {}


def audio2segments(audio, word):
    global segments, current_audio

    # Transcribe only when the audio input has changed.
    if audio != current_audio or current_audio is None:
        segments = pipe(audio)
        current_audio = audio

    if not word:
        text = segments["text"] if current_audio is not None else ""
        return text, "<p>No Word inserted!</p>"

    # Collect the (start, end) timestamps of every chunk whose text, stripped
    # of punctuation and spaces and lowercased, matches the searched word.
    target = word.translate(str.maketrans("", "", string.punctuation)).replace(" ", "").lower()
    ranges_list = []
    for w in segments["chunks"]:
        chunk_word = w["text"].translate(str.maketrans("", "", string.punctuation)).replace(" ", "").lower()
        if target == chunk_word:
            ranges_list.append(w["timestamp"])

    # Render the matches as an HTML table: occurrence number, start and end time.
    res = "<table><tr><th>Occurrence n°</th><th>Start</th><th>End</th></tr>"
    for i, r in enumerate(ranges_list):
        res += f"<tr><td>{i}</td><td>{r[0]}</td><td>{r[1]}</td></tr>"
    res += "</table>"

    return segments["text"], res


def clear():
    # Reset the cached transcription.
    global segments, current_audio
    segments = {}
    current_audio = None


iface = gr.Interface(
    fn=audio2segments,
    inputs=[gr.Audio(sources=["upload", "microphone"], type="filepath"), "text"],
    outputs=["text", "html"],
    title="Whisper Small Italian",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()