File size: 3,663 Bytes
f62c279
051ee03
 
f62c279
f80a7eb
f4549cb
 
1b65381
afbb88c
f4549cb
051ee03
f62c279
 
 
035bc5f
f62c279
051ee03
b444720
051ee03
 
f62c279
59fe5e1
f62c279
051ee03
 
f62c279
 
 
 
59fe5e1
 
 
f62c279
59fe5e1
 
 
afbb88c
 
 
 
 
 
 
59fe5e1
f62c279
59fe5e1
f62c279
051ee03
 
f62c279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59fe5e1
 
 
f62c279
59fe5e1
 
 
 
 
 
 
 
 
 
 
f62c279
 
59fe5e1
051ee03
 
 
f62c279
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import whisper
import gradio as gr
from download_video import download_mp3_yt_dlp 

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="torch")

# Function to download the audio, title, and thumbnail from YouTube
def download_video_info(url):
    """Download a video's audio track and report where it is, plus metadata.

    Args:
        url: Video URL, passed unchanged to ``download_mp3_yt_dlp``.

    Returns:
        A 3-tuple ``(audio_file, title, thumbnail_url)``. If anything goes
        wrong the tuple is ``(None, None, None)``, so callers can always
        unpack exactly three values. The underlying error is logged instead
        of being appended to the tuple.
    """
    import logging  # local import so the block does not depend on file-level edits

    try:
        # The helper performs the download and hands back the metadata.
        title, thumbnail_url = download_mp3_yt_dlp(url)
    except Exception:
        # UI boundary: signal failure through the return value, but keep the
        # traceback in the logs so the cause is not lost.
        logging.getLogger(__name__).exception(
            "Failed to download audio for %r", url
        )
        return None, None, None

    # NOTE(review): presumably download_mp3_yt_dlp always writes to this fixed
    # path -- confirm against download_video.py.
    audio_file = "downloaded_video.mp3"
    return audio_file, title, thumbnail_url

# Function to transcribe the downloaded audio using Whisper
def transcribe_audio(audio_path, model_size="base", language="en"):
    """Run Whisper over an audio file and return the transcribed text.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_size: Model name given to ``whisper.load_model``.
        language: Language code forwarded to ``transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    # The model is loaded on every call, exactly as requested by model_size.
    whisper_model = whisper.load_model(model_size)
    transcription = whisper_model.transcribe(audio_path, language=language)
    return transcription["text"]

# Split logic: First fetch title and thumbnail, then transcribe
def get_video_info_and_transcribe(youtube_url, model_size="base", language="en"):
    """Download a video's audio, transcribe it, and build the UI updates.

    All three outputs are returned together at the end, so the title and
    thumbnail reach the UI only once the transcription has finished.

    Args:
        youtube_url: URL forwarded to ``download_video_info``.
        model_size: Whisper model name forwarded to ``transcribe_audio``.
        language: Language code forwarded to ``transcribe_audio``.

    Returns:
        A 3-tuple ``(title_update, thumbnail_update, transcription_update)``
        in the order of the click handler's ``outputs``. When the audio could
        not be fetched, the title slot carries an error message and the other
        two slots are ``None``.
    """
    # Only the first three fields are used. Slicing keeps the unpack safe even
    # if the helper hands back a longer tuple (e.g. one carrying a trailing
    # error message), which would otherwise raise ValueError here.
    audio_path, title, thumbnail_url = download_video_info(youtube_url)[:3]

    # Treat a missing path or a missing file on disk as a failed download.
    if not audio_path or not os.path.exists(audio_path):
        return gr.update(value="Error fetching video."), None, None

    if thumbnail_url:
        # visible=True undoes the hide applied by an earlier request that had
        # no thumbnail; without it the image would stay hidden from then on.
        thumbnail_output = gr.update(value=thumbnail_url, visible=True)
    else:
        thumbnail_output = gr.update(visible=False)  # Hide if no thumbnail

    transcription = transcribe_audio(audio_path, model_size, language)

    return gr.update(value=title), thumbnail_output, gr.update(value=transcription)

# Gradio interface setup using gradio.components
with gr.Blocks() as demo:

    # Page header.
    title = "<center><h1>YouTube Whisper ⚡️ </h1></center>"
    gr.HTML(value=title)

    # Introductory text describing the tool and its workflow.
    gr.Markdown(
        value="""
    This tool lets you transcribe YouTube videos in multiple languages using **[Whisper](https://openai.com/research/whisper)**, an open-source speech recognition (ASR) model developed by OpenAI.


    ### Key Features:
    - **Fast transcription**: Using the **base** model, transcribing a **3 minute** video takes approximately **30 seconds**.
    - **Multiple language support**: Choose from **English**, **Spanish**, **French**, and more!
    - **Simple workflow**: 
        1. Paste a YouTube link.
        2. Select the model size and language.
        3. Click "Transcribe" to get the text from the video.

    _Transcription times may vary based on model size and video length._
    """
    )

    # Input row: the link takes most of the width, the two selectors share the rest.
    with gr.Row():
        youtube_url = gr.Textbox(
            label="YouTube Link",
            elem_id="yt_link",
            scale=5,
        )
        model_size = gr.Dropdown(
            choices=["tiny", "base", "small", "medium", "large"],
            label="Model Size",
            value="base",
            scale=1,
        )
        language = gr.Dropdown(
            choices=["en", "es", "fr", "de", "it", "ja"],
            label="Language",
            value="en",
            scale=1,
        )

    # Read-only outputs filled in by the click handler.
    title_output = gr.Textbox(label="Video Title", interactive=False)

    with gr.Row():
        thumbnail_output = gr.Image(
            label="Thumbnail",
            interactive=False,
            scale=1,
        )
        transcription_output = gr.Textbox(
            label="Transcription",
            interactive=False,
            scale=1,
        )

    transcribe_button = gr.Button(value="Transcribe")

    # The handler's three return values map onto `outputs` in this order.
    transcribe_button.click(
        fn=get_video_info_and_transcribe,
        inputs=[youtube_url, model_size, language],
        outputs=[title_output, thumbnail_output, transcription_output],
    )

# Launch the app only when this file is executed as a script (not on import).
# server_name="0.0.0.0" asks Gradio to listen on all network interfaces, and
# server_port=7860 pins the port the app is served on.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)