File size: 5,550 Bytes
d903faf
 
 
d255f96
d903faf
4de615e
8377508
d903faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4de615e
 
 
 
 
 
 
d903faf
 
d255f96
d903faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1a0091
 
 
d903faf
d1a0091
 
 
 
 
 
 
 
d1f9d6f
d903faf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2079594
 
 
 
 
 
 
 
 
 
d903faf
 
 
 
4865ee0
d903faf
 
4de615e
d903faf
4de615e
d903faf
 
 
 
 
 
 
 
 
 
 
 
 
 
8377508
 
 
4de615e
d903faf
01d4c13
 
 
b273369
01d4c13
fe8b1e3
 
 
b273369
01d4c13
b273369
 
ebe7599
f5d0648
01d4c13
d903faf
 
 
 
 
4865ee0
d903faf
 
4865ee0
 
d903faf
4865ee0
 
d903faf
3d8816a
d903faf
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import whisper
import gradio as gr
import ffmpeg
from yt_dlp import YoutubeDL
import os
import sys
from subprocess import PIPE, run

# Numeric format ids this app treats as livestream renditions
# (presumably YouTube itags -- confirm against yt-dlp's format table).
youtube_livestream_codes = [91, 92, 93, 94, 95, 96, 300, 301]

# Numeric format ids this app accepts as regular (non-live) mp4 downloads
# (presumably YouTube itags -- confirm against yt-dlp's format table).
youtube_mp4_codes = [298, 18, 22, 140, 133, 134]

def second_to_timecode(x: float) -> str:
    """Format a time offset in seconds as an SRT timecode ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Splitting the float first and truncating the remainder loses a
    millisecond to floating-point error (e.g. ``1.001`` would be rendered as
    ``00:00:01,000``).

    Args:
        x: Offset in seconds. Negative values are clamped to zero because a
            timecode cannot be negative.

    Returns:
        The timecode string, e.g. ``'01:01:01,500'`` for ``3661.5``.
    """
    # Do all the arithmetic on integer milliseconds so every field is exact.
    total_ms = max(0, int(round(x * 1000)))
    hour, remainder = divmod(total_ms, 3_600_000)
    minute, remainder = divmod(remainder, 60_000)
    second, millisecond = divmod(remainder, 1000)

    return '%.2d:%.2d:%.2d,%.3d' % (hour, minute, second, millisecond)

def get_video_metadata(video_url: str = "https://www.youtube.com/watch?v=21X5lGlDOfg&ab_channel=NASA")-> dict:
    """
    Look up a video's metadata with yt-dlp without downloading it.

    The title and uploader id found in the metadata are printed as a
    progress message, and the full info dictionary returned by
    ``extract_info`` is handed back to the caller.
    """
    ydl_options = {'outtmpl': '%(id)s.%(ext)s'}
    with YoutubeDL(ydl_options) as downloader:
        # download=False: only the metadata is fetched, no media is saved
        info_dict = downloader.extract_info(video_url, download=False)
        video_title = info_dict.get('title')
        uploader_id = info_dict.get('uploader_id')
        print(f"[youtube] {video_title}: {uploader_id}")
        return info_dict


def parse_metadata(metadata) -> dict:
    """
    Pick which mp4 format of a video should be downloaded.

    After a video is done recording, it will have both the livestream
    format and the mp4 format; a recorded mp4 format is preferred when
    one is available.

    Selection order (only formats whose ``ext`` is ``"mp4"`` are considered):
      1. the smallest numeric ``format_id`` listed in ``youtube_mp4_codes``;
      2. otherwise the smallest one listed in ``youtube_livestream_codes``
         (same "smallest id" rule as step 1), flagged as a livestream;
      3. otherwise the raw ``format_id`` of the first mp4 format;
      4. ``None`` when the metadata has no mp4 format at all.

    Args:
        metadata: Info dictionary with a ``"formats"`` list of dicts
            (``None`` or a missing list is treated as "no formats").

    Returns:
        ``{"selected_id": <int | str | None>, "is_livestream": <bool>}``.
    """
    formats = (metadata or {}).get("formats") or []
    # filter for ext = mp4
    mp4_formats = [f for f in formats if f.get("ext", "") == "mp4"]
    if not mp4_formats:
        # Nothing usable: report that explicitly instead of failing later
        # on an unassigned variable or an empty-list index.
        return {"selected_id": None, "is_livestream": False}

    # Format ids are not always numeric; skip those individually so a
    # single odd id cannot abort the whole selection.
    numeric_ids = set()
    for fmt in mp4_formats:
        try:
            numeric_ids.add(int(fmt.get("format_id", 0)))
        except (TypeError, ValueError):
            continue

    # use video format id over livestream id if available
    video_entries = sorted(numeric_ids.intersection(youtube_mp4_codes))
    if video_entries:
        return {"selected_id": video_entries[0], "is_livestream": False}

    livestream_entries = sorted(
        numeric_ids.intersection(youtube_livestream_codes)
    )
    if livestream_entries:
        return {"selected_id": livestream_entries[0], "is_livestream": True}

    # No known code matched: fall back to the first mp4 format as-is.
    return {
        "selected_id": mp4_formats[0].get("format_id"),
        "is_livestream": False,
    }

def get_video(url: str, config: dict):
    """
    Save a stream to a local file with ffmpeg.

    Args:
        url: Media URL handed to ffmpeg as its input.
        config: Options dictionary. Recognised keys:
            ``"filename"`` -- output path (default ``"livestream01.mp4"``);
            ``"end"`` -- value passed as ffmpeg's ``t`` option on the input
            (default ``"00:15:00"``), presumably capping how much of the
            stream is read -- confirm against the ffmpeg ``-t`` docs.

    Returns:
        None. The result is the file written at ``filename``.
    """
    # NOTE: could delay start time by a few seconds to just sync up and
    # capture the full video length, but would need to time how long it takes
    # to fetch the video using youtube-dl and other adjustments and start a
    # bit before.
    filename = config.get("filename", "livestream01.mp4")
    end = config.get("end", "00:15:00")
    (
        ffmpeg
        .input(url, t=end)
        .output(filename)
        # Overwrite an existing output instead of letting ffmpeg stop at its
        # interactive "overwrite?" prompt, which nobody can answer here.
        .run(overwrite_output=True)
    )

def get_all_files(url: str, end: str = "00:15:00"):
    """
    Download the selected format of a video to ``temp.mp4``.

    Looks up the video's metadata, asks ``parse_metadata`` which format id
    to use, finds the first format entry with that id and passes its URL to
    ``get_video`` together with the ``end`` limit.

    Returns the name of the file that was written (always ``"temp.mp4"``).
    Raises ``IndexError`` when no format entry carries the selected id.
    """
    metadata = get_video_metadata(url)
    # format ids in the metadata are compared as strings
    wanted_id = str(parse_metadata(metadata).get("selected_id", 0))

    candidates = []
    for fmt in metadata.get("formats", []):
        if fmt.get("format_id", "") == wanted_id:
            candidates.append(fmt)
    stream_url = candidates[0].get("url", "")

    output_name = "temp.mp4"
    get_video(stream_url, {"filename": output_name, "end": end})
    return output_name

def get_text_from_mp3_whisper(inputType:str, mp3_file: str, url_path: str, taskName: str, srcLanguage: str)->tuple:
    """
    Transcribe or translate a media file with Whisper and burn in subtitles.

    Steps: remove outputs left by a previous run, obtain the media (download
    it when ``inputType`` is ``"url"``, otherwise use the given file), run
    the Whisper "medium" model, write the segments to ``transcript.srt`` and
    try to render ``subtitled.mp4`` with ffmpeg's ``subtitles`` filter.

    Args:
        inputType: ``"url"`` to download ``url_path``; any other value uses
            ``mp3_file``.
        mp3_file: Path of a local media file (used when not in url mode).
        url_path: Video URL (used in url mode).
        taskName: Whisper ``task`` option (the UI offers "translate" and
            "transcribe").
        srcLanguage: Whisper ``language`` option.

    Returns:
        ``(segments, srt_text, output_path)``. ``output_path`` is
        ``"subtitled.mp4"`` on success, or the un-subtitled input media when
        the ffmpeg step fails. When the selected input is empty the function
        returns ``([], "", None)`` without doing any work.
    """
    # return if the media source for the chosen input type is not set
    media_source = url_path if inputType == "url" else mp3_file
    if not media_source:
        return [], "", None

    # remove the files left over from a previous run
    for stale_file in ("transcript.srt", "temp.mp4", "subtitled.mp4"):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    model = whisper.load_model("medium")
    transcribe_options = dict(task=taskName, language=srcLanguage)
    if inputType == "url":
        input_file = get_all_files(url_path)
        print("Retrieved the file")
        result = model.transcribe(input_file, **transcribe_options)
        print("transcribing the file")
    else:
        # Keep the uploaded file as the media to subtitle; temp.mp4 only
        # exists in url mode.
        input_file = mp3_file
        result = model.transcribe(input_file, **transcribe_options)

    segments = result.get("segments") or []
    lines = []
    # SRT cue numbers conventionally start at 1
    for count, segment in enumerate(segments, start=1):
        start = segment.get("start")
        end = segment.get("end")
        lines.append(f"{count}")
        lines.append(f"{second_to_timecode(start)} --> {second_to_timecode(end)}")
        lines.append(segment.get("text", "").strip())
        lines.append('')
    words = '\n'.join(lines)
    # save to transcript.srt; explicit UTF-8 so non-ASCII text (e.g.
    # Japanese) is written the same way on every platform
    with open("transcript.srt", "w", encoding="utf-8") as f:
        f.write(words)
    print("done transcribing")

    subtitles_file = 'transcript.srt'
    output_file = 'subtitled.mp4'
    try:
        print("attempt to output file")
        video = ffmpeg.input(input_file)
        audio = video.audio
        ffmpeg.concat(video.filter("subtitles", subtitles_file), audio, v=1, a=1).output(output_file).run()
    except Exception as e:
        # Best effort: rendering can fail (e.g. audio-only input), in which
        # case the original media is returned without subtitles.
        print("failed to output file")
        print(e)
        output_file = input_file

    return segments, words, output_file
 
# Build the Gradio UI and start serving it as soon as this module runs.
# The input components are listed in the same order as the parameters of
# get_text_from_mp3_whisper: input type, audio file, URL, task, language.
input_components = [
    gr.Dropdown(["url", "file"], value="url"),
    gr.inputs.Audio(type="filepath"),
    gr.inputs.Textbox(),
    gr.Dropdown(["translate", "transcribe"], value="translate"),
    gr.Dropdown(["Japanese", "English"], value="Japanese"),
]
# One output per element of the tuple returned by the handler.
output_components = ["json", "text", "file"]

demo = gr.Interface(
    fn=get_text_from_mp3_whisper,
    inputs=input_components,
    outputs=output_components,
    title='Download Video From url and extract text from audio',
    button_text="Go!",
    button_color="#333333",
    live=True,
)
demo.launch()