File size: 1,753 Bytes
7094a1f
e98332d
623ac6b
 
fe93032
e98332d
7ffdd10
e98332d
8c68f65
32ebaf4
0fef6ce
7ffdd10
32ebaf4
623ac6b
8c68f65
c286cd9
 
 
 
 
 
623ac6b
c286cd9
 
 
623ac6b
c286cd9
 
623ac6b
c286cd9
623ac6b
c286cd9
 
 
32ebaf4
c286cd9
 
 
 
c67a8f0
 
 
aa65169
c286cd9
aa65169
 
32ebaf4
691f320
f9bd2ea
fed7807
8c68f65
aa65169
8c68f65
 
4e62bcc
f9bd2ea
47a9fe0
7094a1f
 
f9bd2ea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import tempfile

import gradio as gr
import requests
from moviepy.editor import *
from transformers import pipeline

# Build the Hugging Face pipeline once at import time so every transcription
# request reuses the same loaded model instead of reloading it per call.
pipe = pipeline(model="esnagy/whisper-small-hu")


def transcribe_audio(audio_file):
    """Run the module-level ``pipe`` on *audio_file* and return its text.

    Args:
        audio_file: Audio input handed unchanged to ``pipe``.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    print("Transcribing audio: ", audio_file)
    result = pipe(audio_file)
    return result["text"]


def transcribe_video(video_url):
    """Download a video, extract its audio track, and transcribe it.

    The video is streamed into a private temporary directory, its audio is
    written out as 16-bit PCM WAV, and that file is passed to
    ``transcribe_audio``. All temporary files are removed when the function
    exits, whether it succeeds or raises.

    Args:
        video_url: URL of the video file to download.

    Returns:
        The transcribed text returned by ``transcribe_audio``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the downloaded video contains no audio track.
    """
    # NOTE(review): video_url appears to come straight from user input; if this
    # runs on a network with internal services, consider restricting hosts.

    # A fresh directory per call keeps concurrent requests from overwriting
    # each other's files and guarantees cleanup even when a step fails.
    with tempfile.TemporaryDirectory() as workdir:
        video_filename = os.path.join(workdir, "temp_video.mp4")
        audio_file = os.path.join(workdir, "temp_audio.wav")

        # Stream to disk instead of buffering the whole video in memory.
        # timeout=(connect, read): the read timeout limits the wait between
        # chunks, not the total download time.
        with requests.get(video_url, stream=True, timeout=(10, 60)) as response:
            # Fail early rather than saving an HTTP error page as "video".
            response.raise_for_status()
            with open(video_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        video = VideoFileClip(video_filename)
        try:
            audio = video.audio
            if audio is None:
                raise ValueError("The downloaded video has no audio track.")
            audio.write_audiofile(audio_file, codec="pcm_s16le")
        finally:
            # Release the reader resources held by the clip so the temporary
            # directory can be deleted.
            video.close()

        return transcribe_audio(audio_file)


def transcribe(video_url="", audio=None):
    """Transcribe a video URL if one is given, otherwise the audio input.

    A non-blank ``video_url`` takes precedence over ``audio``.

    Args:
        video_url: URL of a video to download and transcribe. ``None`` or a
            blank/whitespace-only string means "not provided".
        audio: Audio input to transcribe when no URL is given, or ``None``.

    Returns:
        The transcribed text.

    Raises:
        ValueError: If neither a video URL nor an audio input was provided.
    """
    print("[transcribe] Transcribing...")
    print("[transcribe] video_url: ", video_url)
    print("[transcribe] audio: ", audio)

    # Tolerate None as well as blank strings, and pass the cleaned URL on.
    url = (video_url or "").strip()
    if url:
        return transcribe_video(url)

    # Fail with a clear message instead of passing None to the model pipeline.
    if audio is None:
        raise ValueError("Provide a video URL or record audio to transcribe.")
    return transcribe_audio(audio)


# Gradio UI: a URL textbox plus a microphone recorder, both passed to
# transcribe(); the result is shown as plain text.
iface = gr.Interface(
    fn=lambda video_url, audio: transcribe(video_url, audio),
    inputs=[
        gr.Textbox(placeholder="Or leave empty to use microphone", label="Enter video URL"),
        gr.Audio(type="filepath", sources=["microphone"]),
    ],
    outputs="text",
    title="Whisper Small Hungarian",
    # Adjacent literals are concatenated into one description string.
    description=(
        "Realtime demo for Hungarian speech recognition using a fine-tuned Whisper small model. "
        "Enter a video URL or record your voice to transcribe.\n"
        "Example video URL: https://github.com/pwang697/Scalable-Machine-Learning-Lab_2/raw/test/vasar-hu.mp4"
    ),
)

# Start the web server as soon as the module is executed.
iface.launch()