Spaces:
Sleeping
Sleeping
Display detailed reasons for video download failure, improve subtitle line break mechanism, and fix the issue of whisper filter log display.
Browse files
1. When yt-dlp fails to download a video, the reason for the failure is now displayed on the front end.
2. Improved line-break handling for the maximum subtitle line length: lines are now wrapped automatically based on character display width (East Asian wide and fullwidth characters count as two columns).
3. Fixed the issue where the whisper segments filter log was not displaying completely when processing multiple video files at once.
- app.py +14 -17
- src/download.py +33 -15
- src/utils.py +54 -33
app.py
CHANGED
@@ -369,7 +369,7 @@ class WhisperTranscriber:
|
|
369 |
selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
|
370 |
translationLang = get_lang_from_m2m100_name(madlad400LangName)
|
371 |
elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
|
372 |
-
selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "
|
373 |
selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
|
374 |
translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
|
375 |
|
@@ -383,6 +383,7 @@ class WhisperTranscriber:
|
|
383 |
zip_file_lookup = {}
|
384 |
text = ""
|
385 |
vtt = ""
|
|
|
386 |
|
387 |
# Write result
|
388 |
downloadDirectory = tempfile.mkdtemp()
|
@@ -418,9 +419,9 @@ class WhisperTranscriber:
|
|
418 |
# Transcribe
|
419 |
result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
|
420 |
filterLog = result.get("filterLog", None)
|
421 |
-
filterLogText = [gr.Text.update(visible=False)]
|
422 |
if filterLog:
|
423 |
-
|
|
|
424 |
if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
|
425 |
whisperLang = get_lang_from_whisper_code(result["language"])
|
426 |
translationModel.whisperLang = whisperLang
|
@@ -499,6 +500,10 @@ class WhisperTranscriber:
|
|
499 |
zip.write(download_file, arcname=zip_file_name)
|
500 |
|
501 |
download.insert(0, downloadAllPath)
|
|
|
|
|
|
|
|
|
502 |
|
503 |
return [download, text, vtt] + filterLogText
|
504 |
|
@@ -656,10 +661,11 @@ class WhisperTranscriber:
|
|
656 |
if isFilter: break
|
657 |
if isFilter: break
|
658 |
if isFilter:
|
659 |
-
filterIdx += 1
|
660 |
-
filterLog.append(f"filter{filterIdx:03d} [{filterCondition}]:")
|
661 |
filterLog.append(f"\t{querySegment}\n")
|
662 |
del querySegmentsResult[currentID]
|
|
|
|
|
|
|
663 |
|
664 |
return querySegmentsResult, "\n".join(filterLog)
|
665 |
except Exception as e:
|
@@ -740,7 +746,7 @@ class WhisperTranscriber:
|
|
740 |
text = result["text"]
|
741 |
segments = result["segments"]
|
742 |
language = result["language"]
|
743 |
-
languageMaxLineWidth =
|
744 |
|
745 |
if translationModel is not None and translationModel.translationLang is not None:
|
746 |
try:
|
@@ -769,7 +775,7 @@ class WhisperTranscriber:
|
|
769 |
print(traceback.format_exc())
|
770 |
print("Error process segments: " + str(e))
|
771 |
|
772 |
-
print("Max line
|
773 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
774 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
775 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
@@ -827,15 +833,6 @@ class WhisperTranscriber:
|
|
827 |
def __get_source(self, urlData, multipleFiles, microphoneData):
|
828 |
return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
|
829 |
|
830 |
-
def __get_max_line_width(self, language: str) -> int:
|
831 |
-
if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
|
832 |
-
# Chinese characters and kana are wider, so limit line length to 40 characters
|
833 |
-
return 40
|
834 |
-
else:
|
835 |
-
# TODO: Add more languages
|
836 |
-
# 80 latin characters should fit on a 1080p/720p screen
|
837 |
-
return 80
|
838 |
-
|
839 |
def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
|
840 |
segmentStream = StringIO()
|
841 |
|
@@ -1010,7 +1007,7 @@ def create_ui(app_config: ApplicationConfig):
|
|
1010 |
}
|
1011 |
|
1012 |
common_output = lambda : [
|
1013 |
-
gr.File(label="Download", elem_id="outputDownload"),
|
1014 |
gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
|
1015 |
gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
|
1016 |
gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),
|
|
|
369 |
selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
|
370 |
translationLang = get_lang_from_m2m100_name(madlad400LangName)
|
371 |
elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
|
372 |
+
selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "seamless-m4t-v2-large/facebook"
|
373 |
selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
|
374 |
translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
|
375 |
|
|
|
383 |
zip_file_lookup = {}
|
384 |
text = ""
|
385 |
vtt = ""
|
386 |
+
filterLogs = ""
|
387 |
|
388 |
# Write result
|
389 |
downloadDirectory = tempfile.mkdtemp()
|
|
|
419 |
# Transcribe
|
420 |
result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
|
421 |
filterLog = result.get("filterLog", None)
|
|
|
422 |
if filterLog:
|
423 |
+
filterLogs += source.get_full_name() + ":\n" + filterLog + "\n\n"
|
424 |
+
|
425 |
if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
|
426 |
whisperLang = get_lang_from_whisper_code(result["language"])
|
427 |
translationModel.whisperLang = whisperLang
|
|
|
500 |
zip.write(download_file, arcname=zip_file_name)
|
501 |
|
502 |
download.insert(0, downloadAllPath)
|
503 |
+
|
504 |
+
filterLogText = [gr.Text.update(visible=False)]
|
505 |
+
if filterLogs:
|
506 |
+
filterLogText = [gr.Text.update(visible=True, value=filterLogs)]
|
507 |
|
508 |
return [download, text, vtt] + filterLogText
|
509 |
|
|
|
661 |
if isFilter: break
|
662 |
if isFilter: break
|
663 |
if isFilter:
|
|
|
|
|
664 |
filterLog.append(f"\t{querySegment}\n")
|
665 |
del querySegmentsResult[currentID]
|
666 |
+
|
667 |
+
if filterLog:
|
668 |
+
filterLog = [f"filter{idx:03d} [{filterCondition}]:\n{log}" for idx, log in enumerate(reversed(filterLog))]
|
669 |
|
670 |
return querySegmentsResult, "\n".join(filterLog)
|
671 |
except Exception as e:
|
|
|
746 |
text = result["text"]
|
747 |
segments = result["segments"]
|
748 |
language = result["language"]
|
749 |
+
languageMaxLineWidth = 80 #Use east_asian_width to automatically determine the Character Width of the string, replacing the __get_max_line_width function. 80 latin characters should fit on a 1080p/720p screen
|
750 |
|
751 |
if translationModel is not None and translationModel.translationLang is not None:
|
752 |
try:
|
|
|
775 |
print(traceback.format_exc())
|
776 |
print("Error process segments: " + str(e))
|
777 |
|
778 |
+
print("Max line Character Width " + str(languageMaxLineWidth) + " for language:" + language)
|
779 |
vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
|
780 |
srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
|
781 |
json_result = json.dumps(result, indent=4, ensure_ascii=False)
|
|
|
833 |
def __get_source(self, urlData, multipleFiles, microphoneData):
|
834 |
return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
|
835 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
836 |
def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
|
837 |
segmentStream = StringIO()
|
838 |
|
|
|
1007 |
}
|
1008 |
|
1009 |
common_output = lambda : [
|
1010 |
+
gr.File(label="Download", height=200, elem_id="outputDownload"),
|
1011 |
gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
|
1012 |
gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
|
1013 |
gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),
|
src/download.py
CHANGED
@@ -5,6 +5,9 @@ from yt_dlp import YoutubeDL
|
|
5 |
import yt_dlp
|
6 |
from yt_dlp.postprocessor import PostProcessor
|
7 |
|
|
|
|
|
|
|
8 |
class FilenameCollectorPP(PostProcessor):
|
9 |
def __init__(self):
|
10 |
super(FilenameCollectorPP, self).__init__(None)
|
@@ -42,27 +45,32 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
|
|
42 |
if outputTemplate:
|
43 |
ydl_opts['outtmpl'] = outputTemplate
|
44 |
|
|
|
|
|
45 |
filename_collector = FilenameCollectorPP()
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
|
48 |
-
if maxDuration and maxDuration > 0:
|
49 |
-
info = ydl.extract_info(url, download=False)
|
50 |
-
entries = "entries" in info and info["entries"] or [info]
|
51 |
|
52 |
-
|
|
|
|
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
total_duration += float(entry["duration"])
|
57 |
|
58 |
-
|
59 |
-
|
60 |
|
61 |
-
|
62 |
-
|
63 |
|
64 |
if len(filename_collector.filenames) <= 0:
|
65 |
-
raise Exception("Cannot download " +
|
66 |
|
67 |
result = []
|
68 |
|
@@ -70,10 +78,20 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
|
|
70 |
result.append(filename)
|
71 |
print("Downloaded " + filename)
|
72 |
|
73 |
-
return result
|
74 |
|
75 |
class ExceededMaximumDuration(Exception):
|
76 |
def __init__(self, videoDuration, maxDuration, message):
|
77 |
self.videoDuration = videoDuration
|
78 |
self.maxDuration = maxDuration
|
79 |
-
super().__init__(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import yt_dlp
|
6 |
from yt_dlp.postprocessor import PostProcessor
|
7 |
|
8 |
+
import io
|
9 |
+
from contextlib import redirect_stderr
|
10 |
+
|
11 |
class FilenameCollectorPP(PostProcessor):
|
12 |
def __init__(self):
|
13 |
super(FilenameCollectorPP, self).__init__(None)
|
|
|
45 |
if outputTemplate:
|
46 |
ydl_opts['outtmpl'] = outputTemplate
|
47 |
|
48 |
+
errStrIO = EventStringIO(on_write=lambda text: print(f"\033[91m{text}\033[0m"))
|
49 |
+
|
50 |
filename_collector = FilenameCollectorPP()
|
51 |
+
with redirect_stderr(errStrIO):
|
52 |
+
with YoutubeDL(ydl_opts) as ydl:
|
53 |
+
if maxDuration and maxDuration > 0:
|
54 |
+
info = ydl.extract_info(url, download=False)
|
55 |
+
entries = "entries" in info and info["entries"] or [info]
|
56 |
|
57 |
+
total_duration = 0
|
|
|
|
|
|
|
58 |
|
59 |
+
# Compute total duration
|
60 |
+
for entry in entries:
|
61 |
+
total_duration += float(entry["duration"])
|
62 |
|
63 |
+
if total_duration >= maxDuration:
|
64 |
+
raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
|
|
|
65 |
|
66 |
+
ydl.add_post_processor(filename_collector)
|
67 |
+
ydl.download([url])
|
68 |
|
69 |
+
errMsg = errStrIO.getvalue()
|
70 |
+
errMsg = [text for text in errMsg.split("\n") if text.startswith("ERROR")] if errMsg else ""
|
71 |
|
72 |
if len(filename_collector.filenames) <= 0:
|
73 |
+
raise Exception(f"Cannot download {url}, " + "\n".join(errMsg) if errMsg else "")
|
74 |
|
75 |
result = []
|
76 |
|
|
|
78 |
result.append(filename)
|
79 |
print("Downloaded " + filename)
|
80 |
|
81 |
+
return result
|
82 |
|
83 |
class ExceededMaximumDuration(Exception):
    """Error carrying the measured video duration and the configured limit.

    Attributes:
        videoDuration: Duration value reported for the video(s).
        maxDuration: The limit the duration was compared against.
    """

    def __init__(self, videoDuration, maxDuration, message):
        # Initialise the base exception first so ``args``/``str()`` hold the message.
        Exception.__init__(self, message)
        self.maxDuration = maxDuration
        self.videoDuration = videoDuration
|
88 |
+
|
89 |
+
class EventStringIO(io.StringIO):
    """In-memory text buffer that also notifies a callback on every write.

    Args:
        on_write: Optional callable invoked with the text of each ``write``
            call, after the text has been stored in the buffer.
        *args, **kwargs: Forwarded unchanged to ``io.StringIO``.
    """

    def __init__(self, on_write=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.on_write = on_write

    def write(self, text):
        """Store ``text`` in the buffer, notify ``on_write``, and return the
        number of characters written (the ``io.StringIO.write`` contract)."""
        written = super().write(text)
        if self.on_write:
            self.on_write(text)
        # Propagate the count so callers that check the return value of
        # ``write`` keep working when this object replaces a regular stream.
        return written
|
src/utils.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1 |
-
import
|
2 |
-
import unicodedata
|
3 |
-
import re
|
4 |
|
5 |
import zlib
|
6 |
from typing import Iterator, TextIO, Union
|
7 |
import tqdm
|
8 |
|
9 |
import urllib3
|
|
|
10 |
|
11 |
|
12 |
def exact_div(x, y):
|
@@ -139,7 +138,7 @@ def write_srt_original(transcript: Iterator[dict], file: TextIO,
|
|
139 |
file=file,
|
140 |
flush=True)
|
141 |
|
142 |
-
def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
|
143 |
for segment in transcript:
|
144 |
words: list = segment.get('words', [])
|
145 |
|
@@ -236,45 +235,67 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
|
|
236 |
'text' : subtitle_text
|
237 |
}
|
238 |
if original_text is not None and len(original_text) > 0:
|
239 |
-
result.update({'original': original_text})
|
240 |
yield result
|
241 |
|
242 |
def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
|
|
|
|
|
243 |
if maxLineWidth is None or maxLineWidth < 0:
|
244 |
-
return
|
245 |
|
246 |
-
|
247 |
-
current_line = ""
|
248 |
-
current_length = 0
|
249 |
-
|
250 |
-
for entry in words:
|
251 |
-
# Either accept a string or a dict with a 'word' and 'length' field
|
252 |
-
if isinstance(entry, dict):
|
253 |
-
word = entry['word']
|
254 |
-
word_length = entry['length']
|
255 |
-
else:
|
256 |
-
word = entry
|
257 |
-
word_length = len(word)
|
258 |
-
|
259 |
-
if current_length > 0 and current_length + word_length > maxLineWidth:
|
260 |
-
lines.append(current_line)
|
261 |
-
current_line = ""
|
262 |
-
current_length = 0
|
263 |
-
|
264 |
-
current_length += word_length
|
265 |
-
# The word will be prefixed with a space by Whisper, so we don't need to add one here
|
266 |
-
current_line += word
|
267 |
-
|
268 |
-
if len(current_line) > 0:
|
269 |
-
lines.append(current_line)
|
270 |
-
|
271 |
-
return "\n".join(lines)
|
272 |
|
273 |
def process_text(text: str, maxLineWidth=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
if (maxLineWidth is None or maxLineWidth < 0):
|
275 |
return text
|
276 |
|
277 |
-
lines =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
return '\n'.join(lines)
|
279 |
|
280 |
def slugify(value, allow_unicode=False, is_lower=False):
|
|
|
1 |
+
import re
|
|
|
|
|
2 |
|
3 |
import zlib
|
4 |
from typing import Iterator, TextIO, Union
|
5 |
import tqdm
|
6 |
|
7 |
import urllib3
|
8 |
+
import unicodedata
|
9 |
|
10 |
|
11 |
def exact_div(x, y):
|
|
|
138 |
file=file,
|
139 |
flush=True)
|
140 |
|
141 |
+
def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
|
142 |
for segment in transcript:
|
143 |
words: list = segment.get('words', [])
|
144 |
|
|
|
235 |
'text' : subtitle_text
|
236 |
}
|
237 |
if original_text is not None and len(original_text) > 0:
|
238 |
+
result.update({'original': process_text(original_text, maxLineWidth)})
|
239 |
yield result
|
240 |
|
241 |
def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
    """Concatenate word entries into one string and optionally wrap it.

    Args:
        words: Entries that are either plain strings or dicts holding the
            text under the ``'word'`` key.
        maxLineWidth: Maximum line width passed to ``process_text``; when
            ``None`` or negative the joined text is returned unwrapped.

    Returns:
        The joined text, wrapped by ``process_text`` when a width is given.
    """
    # "".join() only accepts strings, so pull the text out of dict entries
    # first; otherwise a dict entry raises TypeError. No separator is added:
    # the entries are concatenated exactly as given.
    result = "".join(
        entry['word'] if isinstance(entry, dict) else entry
        for entry in words
    )

    if maxLineWidth is None or maxLineWidth < 0:
        return result

    return process_text(result, maxLineWidth)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
|
249 |
def process_text(text: str, maxLineWidth=None):
    """Wrap ``text`` so that no line exceeds ``maxLineWidth`` display columns.

    Character width is determined with ``unicodedata.east_asian_width``
    instead of ``textwrap.wrap``: characters classified as Wide (``W``) or
    Fullwidth (``F``) count as two columns, everything else as one.

    # East_Asian_Width (ea)

    ea ; A         ; Ambiguous
    ea ; F         ; Fullwidth
    ea ; H         ; Halfwidth
    ea ; N         ; Neutral
    ea ; Na        ; Narrow
    ea ; W         ; Wide
    https://stackoverflow.com/a/31666966

    Line breaks are placed at whitespace, or immediately before a wide /
    fullwidth character inside a word. Runs of whitespace (including existing
    newlines) are collapsed to single spaces. A unit that is wider than the
    limit on its own is emitted on a line of its own rather than split.

    Args:
        text: The text to wrap.
        maxLineWidth: Maximum width of a line in display columns. ``None`` or
            a negative value disables wrapping.

    Returns:
        ``text`` unchanged when wrapping is disabled, otherwise the wrapped
        lines joined with ``"\\n"``. No line is empty and no line carries
        trailing whitespace.
    """
    if maxLineWidth is None or maxLineWidth < 0:
        return text

    wideCategories = ('W', 'F')
    lines = []
    currentLine = ""
    currentWidth = 0

    for word in text.split():
        # Cut the word into units that may be moved to a new line
        # independently: a break is allowed before every wide character.
        units = []
        unitStart = 0
        for charIdx, char in enumerate(word):
            if charIdx > unitStart and unicodedata.east_asian_width(char) in wideCategories:
                units.append(word[unitStart:charIdx])
                unitStart = charIdx
        units.append(word[unitStart:])

        for unitIdx, unit in enumerate(units):
            # Only the first unit of a word is separated from the previous
            # content by a space; later units continue the same word.
            separator = " " if unitIdx == 0 and currentLine else ""
            unitWidth = sum(
                2 if unicodedata.east_asian_width(char) in wideCategories else 1
                for char in unit
            )

            if currentLine and currentWidth + len(separator) + unitWidth > maxLineWidth:
                # Flush the line without the pending separator, so it has no
                # trailing space and the new line's width excludes it.
                lines.append(currentLine)
                currentLine = unit
                currentWidth = unitWidth
            else:
                # An empty current line always takes the unit, so an over-long
                # unit never produces an empty line in front of it.
                currentLine += separator + unit
                currentWidth += len(separator) + unitWidth

    if currentLine:
        lines.append(currentLine)

    return '\n'.join(lines)
|
300 |
|
301 |
def slugify(value, allow_unicode=False, is_lower=False):
|