import gradio as gr
from transformers import pipeline
import numpy as np

model_id = "NbAiLab/nb-whisper-small-beta"
transcriber = pipeline("automatic-speech-recognition", model=model_id)

# Running offset (seconds) and SRT entry counter, carried across recordings.
total_time = 0.0
counter = 1  # SRT entries are numbered from 1


def make_timestamp(ref):
    """Convert a chunk offset in seconds into an SRT timestamp (HH:MM:SS,mmm)."""
    t = total_time + ref
    hh = int(t / 3600)
    mm = int(t / 60) % 60
    ss = int(t % 60)
    mmm = int(t * 1000) % 1000  # milliseconds; `t % 1000` would give seconds mod 1000
    return f"{hh:02d}:{mm:02d}:{ss:02d},{mmm:03d}"


def transcribe(audio):
    global counter, total_time
    sr, y = audio
    # Gradio delivers int16 PCM; normalise to float32 in [-1, 1].
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak

    res = transcriber(
        {"sampling_rate": sr, "raw": y},
        generate_kwargs={"task": "transcribe", "language": "no"},
        return_timestamps=True,
    )

    entries = []
    end = 0.0
    for chunk in res["chunks"]:
        start, end = chunk["timestamp"]
        txt = chunk["text"].strip()
        entries.append(f"{counter}\n{make_timestamp(start)} --> {make_timestamp(end)}\n{txt}\n")
        counter += 1
    # Advance the offset once per call, by the end of the last chunk, so that
    # make_timestamp stays consistent across all chunks of a single recording.
    total_time += end
    return "\n".join(entries)


demo = gr.Interface(
    transcribe,
    gr.Audio(source="microphone"),  # Gradio 4.x renamed this to sources=["microphone"]
    "text",
)

demo.launch()
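# Illustrative output shape from one transcribe() call (a sketch only: the
# Norwegian text and timings below are invented, not actual model output):
#
#   1
#   00:00:00,000 --> 00:00:02,340
#   Hei og velkommen.
#
#   2
#   00:00:02,340 --> 00:00:04,810
#   Dette er en liten test.
#
# A second recording continues from counter 3 with timestamps offset by
# total_time, so concatenated outputs form one continuous SRT track.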