File size: 3,304 Bytes
e767c1f
 
574cd0e
 
 
 
 
e767c1f
574cd0e
 
 
 
 
 
e767c1f
 
 
 
 
 
574cd0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from transformers import pipeline
import gradio as gr
import os
import subprocess
from pytube import YouTube
 
pipe = pipeline(model="tilos/whisper-small-zh-HK")  # change to "your-username/the-name-you-picked"

def video2mp3(video_file, output_ext="mp3"):
    filename, ext = os.path.splitext(video_file)
    subprocess.call(["ffmpeg", "-y", "-i", video_file, f"{filename}.{output_ext}"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)
    return f"{filename}.{output_ext}"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text


def get_text(url):
    result = pipe(get_audio(url))
    return result['text'].strip()

def get_audio(url):
    website = YouTube(url)
    video = website.streams.filter(only_audio=True).first()
    out_file = video.download(output_path=".")
    base, ext = os.path.splitext(out_file)
    new_file = base + '.mp3'
    os.rename(out_file, new_file)
    audio = new_file
    return audio

def offline_video(video):
    audio_file = video2mp3(video)
    text = transcribe(audio_file)
    return text


with gr.Blocks() as demo:

    # video file input
    gr.Interface(
            title="Whisper: Real Time Cantonese Recognition",
            description="Realtime demo for Cantonese speech recognition using a fine-tuned Whisper small model. "
                        "Generate zh-HK subtitle from video file, audio file, your microphone, and Youtube URL",
            fn=offline_video,
            inputs="video",
            outputs="text",
            allow_flagging="never",
        )

    # audio file input
    with gr.Row():
        with gr.Column():
            input_audio = gr.Audio(source="upload", type="filepath")
            micro_btn = gr.Button('Generate Voice Subtitles')
        with gr.Column():
            output_audio = gr.Textbox(placeholder='Transcript from audio', label='Subtitles')
            micro_btn.click(transcribe, inputs=input_audio, outputs=output_audio)
    """
    gr.Interface(
            fn=transcribe,
            title="Whisper: zh-HK Subtitle Generator",
            description="Generate zh-HK subtitle from audio file, your microphone and Youtube",
            inputs = gr.Audio(source="upload", type="filepath", optional=True),
            outputs = "text",
            allow_flagging= "never",
    )
    """

    # microphone input
    with gr.Row():
        with gr.Column():
            input_mircro = gr.Audio(source="microphone", type="filepath")
            micro_btn = gr.Button('Generate Voice Subtitles')
        with gr.Column():
            output_micro = gr.Textbox(placeholder='Transcript from mic', label='Subtitles')
            micro_btn.click(transcribe, inputs=input_mircro, outputs=output_micro)

    # Youtube url input
    with gr.Row():
        with gr.Column():
            inputs_url = gr.Textbox(placeholder='Youtube URL', label='URL')
            url_btn = gr.Button('Generate Youtube Video Subtitles')
            examples = gr.Examples(examples=["https://www.youtube.com/watch?v=Yw4EoGWe0vw"],inputs=[inputs_url])
        with gr.Column():
            output_url = gr.Textbox(placeholder='Transcript from video.', label='Transcript')
            url_btn.click(get_text, inputs=inputs_url, outputs=output_url )



demo.launch(debug=True)