import gradio as gr
from transformers import pipeline
import numpy as np

model_id = "NbAiLab/nb-whisper-small-beta"
transcriber = pipeline("automatic-speech-recognition", model=model_id)

# Running offset (seconds) and SRT entry counter, carried across recordings.
total_time = 0.0
counter = 1  # SRT entries are numbered from 1


def make_timestamp(ref):
    """Convert a chunk offset in seconds into an SRT timestamp (HH:MM:SS,mmm)."""
    t = total_time + ref
    hh = int(t / 3600)
    mm = int(t / 60) % 60
    ss = int(t % 60)
    mmm = int(t * 1000) % 1000  # milliseconds; `t % 1000` would give seconds mod 1000
    return f"{hh:02d}:{mm:02d}:{ss:02d},{mmm:03d}"


def transcribe(audio):
    global counter, total_time
    sr, y = audio
    # Gradio delivers int16 PCM; normalise to float32 in [-1, 1].
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak

    res = transcriber(
        {"sampling_rate": sr, "raw": y},
        generate_kwargs={"task": "transcribe", "language": "no"},
        return_timestamps=True,
    )

    entries = []
    end = 0.0
    for chunk in res["chunks"]:
        start, end = chunk["timestamp"]
        txt = chunk["text"].strip()
        entries.append(f"{counter}\n{make_timestamp(start)} --> {make_timestamp(end)}\n{txt}\n")
        counter += 1
    # Advance the offset once per call, by the end of the last chunk, so that
    # make_timestamp stays consistent across all chunks of a single recording.
    total_time += end
    return "\n".join(entries)


demo = gr.Interface(
    transcribe,
    gr.Audio(source="microphone"),  # Gradio 4.x renamed this to sources=["microphone"]
    "text",
)

demo.launch()
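# Illustrative output shape from one transcribe() call (a sketch only: the
# Norwegian text and timings below are invented, not actual model output):
#
#   1
#   00:00:00,000 --> 00:00:02,340
#   Hei og velkommen.
#
#   2
#   00:00:02,340 --> 00:00:04,810
#   Dette er en liten test.
#
# A second recording continues from counter 3 with timestamps offset by
# total_time, so concatenated outputs form one continuous SRT track.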