# Source: Hugging Face Space application file (revision d903faf, 4,661 bytes).
# The Space was reporting "Runtime error" when this snapshot was captured.
import whisper
import gradio as gr
import ffmpeg
import youtube_dl
import os
# Numeric youtube-dl ``format_id`` values treated as livestream formats.
# NOTE(review): not referenced by any function visible in this file — confirm
# whether it is still needed.
youtube_livestream_codes = [91, 92, 93, 94, 95, 96, 300, 301]

# Numeric youtube-dl ``format_id`` values treated as regular MP4 formats;
# parse_metadata() intersects the ids found in the metadata with this list.
youtube_mp4_codes = [298, 18, 22, 140, 133, 134]
import sys
def get_video_metadata(video_url: str = "https://www.youtube.com/watch?v=21X5lGlDOfg&ab_channel=NASA")-> dict:
    """
    Look up youtube-dl metadata for a video without downloading it.

    Prints the video title and uploader id, then returns the info dict
    produced by ``YoutubeDL.extract_info(video_url, download=False)``.
    """
    ydl_options = {'outtmpl': '%(id)s.%(ext)s'}
    with youtube_dl.YoutubeDL(ydl_options) as downloader:
        metadata = downloader.extract_info(video_url, download=False)
        title = metadata.get('title')
        uploader = metadata.get('uploader_id')
        print(f"[youtube] {title}: {uploader}")
    return metadata
def parse_metadata(metadata) -> dict:
    """
    Choose which format id to download from youtube-dl metadata.

    Only formats whose ``ext`` is ``"mp4"`` and whose ``format_id`` is numeric
    are considered. After a video is done recording it has both the
    livestream format and the mp4 format, so an id from ``youtube_mp4_codes``
    is preferred; ids from ``youtube_livestream_codes`` are the fallback.

    Args:
        metadata: info dict (e.g. from ``get_video_metadata``); only its
            ``"formats"`` list is read. ``None`` or a missing list is treated
            as "no formats".

    Returns:
        dict with
        ``"selected_id"``: the smallest matching format id as ``int``, or
        ``None`` when nothing matched;
        ``"is_livestream"``: ``False`` only when a regular mp4 id was chosen.
    """
    formats = (metadata or {}).get("formats") or []

    # Collect the numeric ids of every mp4 format.
    available_ids = set()
    for fmt in formats:
        if fmt.get("ext", "") != "mp4":
            continue
        try:
            available_ids.add(int(fmt.get("format_id", 0)))
        except (TypeError, ValueError):
            # Non-numeric ids can never match the integer code tables.
            continue

    selected_id = None
    is_livestream = True
    if available_ids:
        video_entries = sorted(available_ids.intersection(youtube_mp4_codes))
        if video_entries:
            # use video format id over livestream id if available
            selected_id = video_entries[0]
            is_livestream = False
        else:
            livestream_entries = sorted(
                available_ids.intersection(youtube_livestream_codes)
            )
            if livestream_entries:
                selected_id = livestream_entries[0]

    return {
        "selected_id": selected_id,
        "is_livestream": is_livestream,
    }
def get_video(url: str, config: dict):
    """
    Copy the media at ``url`` into a local file using ffmpeg.

    Args:
        url: media URL (or path) handed to ffmpeg as its input.
        config: settings dict. ``"filename"`` is the output path
            (default ``"livestream01.mp4"``); ``"end"`` is passed to ffmpeg
            as the ``t`` input option (default ``"00:15:00"``).
    """
    # could delay start time by a few seconds to just sync up and capture the full video length
    # but would need to time how long it takes to fetch the video using youtube-dl and other adjustments and start a bit before
    filename = config.get("filename", "livestream01.mp4")
    end = config.get("end", "00:15:00")
    (
        ffmpeg
        .input(url, t=end)
        .output(filename)
        # Callers reuse a fixed output name; without overwrite ffmpeg stops
        # to ask for confirmation when the file already exists.
        .run(overwrite_output=True)
    )
def get_all_files(url: str, end: str = "00:15:00"):
    """
    Download the selected format of a video to ``temp.mp4``.

    Fetches the metadata for ``url``, lets ``parse_metadata`` choose a format
    id, finds that format's direct URL and records it with ``get_video``.

    Args:
        url: page URL of the video.
        end: forwarded to ``get_video`` as the ``"end"`` setting.

    Returns:
        The name of the written file (``"temp.mp4"``).

    Raises:
        ValueError: if the selected format id is not present in the metadata
            or the matching format carries no URL.
    """
    metadata = get_video_metadata(url)
    selected_id = parse_metadata(metadata).get("selected_id", 0)
    formats = metadata.get("formats", [])

    # youtube-dl reports format ids as strings, so compare in string form.
    wanted_id = str(selected_id)
    selected_format = next(
        (f for f in formats if f.get("format_id", "") == wanted_id),
        None,
    )
    if selected_format is None:
        raise ValueError(f"No downloadable format {wanted_id!r} found for {url!r}")

    format_url = selected_format.get("url", "")
    if not format_url:
        raise ValueError(f"Format {wanted_id!r} of {url!r} has no download URL")

    filename = "temp.mp4"
    get_video(format_url, {"filename": filename, "end": end})
    return filename
def _seconds_to_srt_timecode(seconds) -> str:
    """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    total_ms = max(0, int(round(float(seconds or 0) * 1000)))
    hours, total_ms = divmod(total_ms, 3600000)
    minutes, total_ms = divmod(total_ms, 60000)
    secs, millis = divmod(total_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def get_text_from_mp3_whisper(inputType:str, mp3_file: str, url_path: str, taskName: str, srcLanguage: str) -> tuple:
    """
    Transcribe (or translate) media with Whisper and burn the text in as subtitles.

    Args:
        inputType: ``"url"`` to download the media from ``url_path`` first;
            any other value uses ``mp3_file`` directly.
        mp3_file: path of a local media file (used when ``inputType`` is not
            ``"url"``).
        url_path: video page URL (used when ``inputType`` is ``"url"``).
        taskName: passed to ``model.transcribe`` as ``task``.
        srcLanguage: passed to ``model.transcribe`` as ``language``.

    Returns:
        A tuple ``(segments, srt_text, output_path)``: the segment list from
        the Whisper result, the subtitles as SRT text, and the path of the
        subtitled video (``"subtitled.mp4"``).

    Side effects:
        Writes ``transcript.srt`` and ``subtitled.mp4`` in the working
        directory, overwriting earlier copies.
    """
    model = whisper.load_model("medium")
    transcribe_options = dict(task=taskName, language=srcLanguage)

    # Resolve the media file once so transcription and subtitling agree on it.
    if inputType == "url":
        input_file = get_all_files(url_path)
    else:
        input_file = mp3_file
    result = model.transcribe(input_file, **transcribe_options)

    # Build the SRT document: counter, time range, text, blank separator.
    segments = result.get("segments") or []
    lines = []
    for count, segment in enumerate(segments, start=1):
        start = segment.get("start")
        end = segment.get("end")
        lines.append(f"{count}")
        lines.append(
            f"{_seconds_to_srt_timecode(start)} --> {_seconds_to_srt_timecode(end)}"
        )
        lines.append(segment.get("text", "").strip())
        lines.append('')
    words = '\n'.join(lines)

    # The ffmpeg subtitles filter reads the cues from a file on disk.
    srt_path = "transcript.srt"
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(words)

    # Equivalent to: ffmpeg -y -i <input> -vf subtitles=transcript.srt -vcodec libx264 subtitled.mp4
    # NOTE(review): an audio-only upload has no video stream to draw on —
    # confirm how ffmpeg behaves for that input.
    output_path = "subtitled.mp4"
    (
        ffmpeg
        .input(input_file)
        .output(output_path, vcodec='libx264', vf=f"subtitles={srt_path}")
        .run(overwrite_output=True)
    )
    return segments, words, output_path
# Build and start the Gradio UI. The inputs map positionally onto the
# parameters of get_text_from_mp3_whisper: input type, media file, URL, task
# and source language. The outputs map onto its returned tuple.
# NOTE(review): live=True is kept from the original; confirm that re-running
# the transcription on every input change is intended.
gr.Interface(
    title='Download Video From url and extract text from audio',
    fn=get_text_from_mp3_whisper,
    inputs=[
        gr.Dropdown(["url", "file"]),
        gr.Audio(type="filepath"),
        gr.Textbox(),
        gr.Dropdown(["translate", "transcribe"]),
        gr.Dropdown(["Japanese", "English"]),
    ],
    # "video" is the component shortcut for a video output; "mp4" is not one.
    outputs=["json", "text", "video"],
    live=True,
).launch()