from transformers import pipeline
import gradio as gr
import requests
from moviepy.editor import *
import os

# Speech-recognition pipeline shared by every request; it is built once, at
# import time, from the checkpoint named below.
_MODEL_ID = "esnagy/whisper-small-hu"
pipe = pipeline(model=_MODEL_ID)


def transcribe_audio(audio_file):
    """Run the module-level ``pipe`` on *audio_file* and return its text.

    Args:
        audio_file: Value handed straight to ``pipe``; the callers in this
            file pass a path to an audio file.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    print("Transcribing audio: ", audio_file)
    result = pipe(audio_file)
    return result["text"]


def transcribe_video(video_url):
    """Download a video from *video_url* and transcribe its audio track.

    The video is streamed into a per-call temporary directory, its audio
    track is exported as 16-bit PCM WAV, and that file is handed to
    ``transcribe_audio``. The temporary directory (and both files in it) is
    removed when the function exits, including when a step fails.

    Args:
        video_url: URL of the video file to fetch.

    Returns:
        The transcription returned by ``transcribe_audio``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the downloaded video has no audio track.
    """
    import tempfile

    # A private directory per call keeps concurrent requests from overwriting
    # each other's files and guarantees cleanup on every exit path.
    with tempfile.TemporaryDirectory() as workdir:
        video_filename = os.path.join(workdir, "temp_video.mp4")
        audio_file = os.path.join(workdir, "temp_audio.wav")

        # NOTE(review): video_url comes from the UI, so the server will fetch
        # whatever address the user types; restrict hosts if that matters.
        with requests.get(video_url, stream=True, timeout=30) as response:
            # Fail here rather than saving an HTML error page as "video".
            response.raise_for_status()
            with open(video_filename, "wb") as f:
                # Stream in chunks so large videos are not held in memory.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Load the video using moviepy; close it explicitly so its reader
        # resources are released before the directory is deleted.
        video = VideoFileClip(video_filename)
        try:
            audio = video.audio
            if audio is None:
                raise ValueError("The video has no audio track to transcribe.")
            audio.write_audiofile(audio_file, codec="pcm_s16le")
        finally:
            video.close()

        return transcribe_audio(audio_file)


def transcribe(video_url="", audio=None):
    """Transcribe either a video URL or a recorded audio file.

    A non-blank *video_url* takes precedence over *audio*.

    Args:
        video_url: URL of a video to download and transcribe. ``None``,
            empty, or whitespace-only means "no URL given".
        audio: Audio input passed to ``transcribe_audio`` when no URL is
            given.

    Returns:
        The transcribed text.

    Raises:
        ValueError: If neither a video URL nor an audio input is provided.
    """
    print("[transcribe] Transcribing...")
    print("[transcribe] video_url: ", video_url)
    print("[transcribe] audio: ", audio)

    # Treat None like an empty string so a missing URL cannot crash .strip().
    url = (video_url or "").strip()
    if url:
        return transcribe_video(video_url)

    # Without this guard None would be handed to the model pipeline and fail
    # with an error that says nothing about the missing input.
    if audio is None:
        raise ValueError("Provide a video URL or record audio to transcribe.")
    return transcribe_audio(audio)


# UI wiring: a URL textbox and a microphone recorder (delivered as a file
# path) feed ``transcribe``; the result is shown as plain text.
_url_input = gr.Textbox(
    label="Enter video URL",
    placeholder="Or leave empty to use microphone",
)
_mic_input = gr.Audio(sources=["microphone"], type="filepath")
_description = (
    "Realtime demo for Hungarian speech recognition using a fine-tuned Whisper small model. "
    "Enter a video URL or record your voice to transcribe.\n"
    "Example video URL: "
    "https://github.com/pwang697/Scalable-Machine-Learning-Lab_2/raw/test/vasar-hu.mp4"
)

iface = gr.Interface(
    fn=lambda video_url, audio: transcribe(video_url, audio),
    inputs=[_url_input, _mic_input],
    outputs="text",
    title="Whisper Small Hungarian",
    description=_description,
)

# Start the web server as soon as the module is executed.
iface.launch()