avans06 committed on
Commit
922fe2a
1 Parent(s): 67af9e2

Display detailed reasons for video download failure, improve subtitle line break mechanism, and fix the issue of whisper filter log display.

Browse files

1. Display the reasons for video download failure on the front end when yt-dlp fails to download a video.

2. Improved line-break handling for the maximum subtitle line length: lines are now wrapped automatically based on character display width (East Asian wide and fullwidth characters count as two columns).

3. Fixed the issue where the whisper segments filter log was not displaying completely when processing multiple video files at once.

Files changed (3) hide show
  1. app.py +14 -17
  2. src/download.py +33 -15
  3. src/utils.py +54 -33
app.py CHANGED
@@ -369,7 +369,7 @@ class WhisperTranscriber:
369
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
370
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
371
  elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
372
- selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "facebook/seamless-m4t-v2-large"
373
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
374
  translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
375
 
@@ -383,6 +383,7 @@ class WhisperTranscriber:
383
  zip_file_lookup = {}
384
  text = ""
385
  vtt = ""
 
386
 
387
  # Write result
388
  downloadDirectory = tempfile.mkdtemp()
@@ -418,9 +419,9 @@ class WhisperTranscriber:
418
  # Transcribe
419
  result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
420
  filterLog = result.get("filterLog", None)
421
- filterLogText = [gr.Text.update(visible=False)]
422
  if filterLog:
423
- filterLogText = [gr.Text.update(visible=True, value=filterLog)]
 
424
  if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
425
  whisperLang = get_lang_from_whisper_code(result["language"])
426
  translationModel.whisperLang = whisperLang
@@ -499,6 +500,10 @@ class WhisperTranscriber:
499
  zip.write(download_file, arcname=zip_file_name)
500
 
501
  download.insert(0, downloadAllPath)
 
 
 
 
502
 
503
  return [download, text, vtt] + filterLogText
504
 
@@ -656,10 +661,11 @@ class WhisperTranscriber:
656
  if isFilter: break
657
  if isFilter: break
658
  if isFilter:
659
- filterIdx += 1
660
- filterLog.append(f"filter{filterIdx:03d} [{filterCondition}]:")
661
  filterLog.append(f"\t{querySegment}\n")
662
  del querySegmentsResult[currentID]
 
 
 
663
 
664
  return querySegmentsResult, "\n".join(filterLog)
665
  except Exception as e:
@@ -740,7 +746,7 @@ class WhisperTranscriber:
740
  text = result["text"]
741
  segments = result["segments"]
742
  language = result["language"]
743
- languageMaxLineWidth = self.__get_max_line_width(language)
744
 
745
  if translationModel is not None and translationModel.translationLang is not None:
746
  try:
@@ -769,7 +775,7 @@ class WhisperTranscriber:
769
  print(traceback.format_exc())
770
  print("Error process segments: " + str(e))
771
 
772
- print("Max line width " + str(languageMaxLineWidth) + " for language:" + language)
773
  vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
774
  srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
775
  json_result = json.dumps(result, indent=4, ensure_ascii=False)
@@ -827,15 +833,6 @@ class WhisperTranscriber:
827
  def __get_source(self, urlData, multipleFiles, microphoneData):
828
  return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
829
 
830
- def __get_max_line_width(self, language: str) -> int:
831
- if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
832
- # Chinese characters and kana are wider, so limit line length to 40 characters
833
- return 40
834
- else:
835
- # TODO: Add more languages
836
- # 80 latin characters should fit on a 1080p/720p screen
837
- return 80
838
-
839
  def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
840
  segmentStream = StringIO()
841
 
@@ -1010,7 +1007,7 @@ def create_ui(app_config: ApplicationConfig):
1010
  }
1011
 
1012
  common_output = lambda : [
1013
- gr.File(label="Download", elem_id="outputDownload"),
1014
  gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
1015
  gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
1016
  gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),
 
369
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
370
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
371
  elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
372
+ selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "seamless-m4t-v2-large/facebook"
373
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
374
  translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
375
 
 
383
  zip_file_lookup = {}
384
  text = ""
385
  vtt = ""
386
+ filterLogs = ""
387
 
388
  # Write result
389
  downloadDirectory = tempfile.mkdtemp()
 
419
  # Transcribe
420
  result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
421
  filterLog = result.get("filterLog", None)
 
422
  if filterLog:
423
+ filterLogs += source.get_full_name() + ":\n" + filterLog + "\n\n"
424
+
425
  if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
426
  whisperLang = get_lang_from_whisper_code(result["language"])
427
  translationModel.whisperLang = whisperLang
 
500
  zip.write(download_file, arcname=zip_file_name)
501
 
502
  download.insert(0, downloadAllPath)
503
+
504
+ filterLogText = [gr.Text.update(visible=False)]
505
+ if filterLogs:
506
+ filterLogText = [gr.Text.update(visible=True, value=filterLogs)]
507
 
508
  return [download, text, vtt] + filterLogText
509
 
 
661
  if isFilter: break
662
  if isFilter: break
663
  if isFilter:
 
 
664
  filterLog.append(f"\t{querySegment}\n")
665
  del querySegmentsResult[currentID]
666
+
667
+ if filterLog:
668
+ filterLog = [f"filter{idx:03d} [{filterCondition}]:\n{log}" for idx, log in enumerate(reversed(filterLog))]
669
 
670
  return querySegmentsResult, "\n".join(filterLog)
671
  except Exception as e:
 
746
  text = result["text"]
747
  segments = result["segments"]
748
  language = result["language"]
749
+ languageMaxLineWidth = 80 #Use east_asian_width to automatically determine the Character Width of the string, replacing the __get_max_line_width function. 80 latin characters should fit on a 1080p/720p screen
750
 
751
  if translationModel is not None and translationModel.translationLang is not None:
752
  try:
 
775
  print(traceback.format_exc())
776
  print("Error process segments: " + str(e))
777
 
778
+ print("Max line Character Width " + str(languageMaxLineWidth) + " for language:" + language)
779
  vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
780
  srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
781
  json_result = json.dumps(result, indent=4, ensure_ascii=False)
 
833
  def __get_source(self, urlData, multipleFiles, microphoneData):
834
  return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
835
 
 
 
 
 
 
 
 
 
 
836
  def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
837
  segmentStream = StringIO()
838
 
 
1007
  }
1008
 
1009
  common_output = lambda : [
1010
+ gr.File(label="Download", height=200, elem_id="outputDownload"),
1011
  gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
1012
  gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
1013
  gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),
src/download.py CHANGED
@@ -5,6 +5,9 @@ from yt_dlp import YoutubeDL
5
  import yt_dlp
6
  from yt_dlp.postprocessor import PostProcessor
7
 
 
 
 
8
  class FilenameCollectorPP(PostProcessor):
9
  def __init__(self):
10
  super(FilenameCollectorPP, self).__init__(None)
@@ -42,27 +45,32 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
42
  if outputTemplate:
43
  ydl_opts['outtmpl'] = outputTemplate
44
 
 
 
45
  filename_collector = FilenameCollectorPP()
 
 
 
 
 
46
 
47
- with YoutubeDL(ydl_opts) as ydl:
48
- if maxDuration and maxDuration > 0:
49
- info = ydl.extract_info(url, download=False)
50
- entries = "entries" in info and info["entries"] or [info]
51
 
52
- total_duration = 0
 
 
53
 
54
- # Compute total duration
55
- for entry in entries:
56
- total_duration += float(entry["duration"])
57
 
58
- if total_duration >= maxDuration:
59
- raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
60
 
61
- ydl.add_post_processor(filename_collector)
62
- ydl.download([url])
63
 
64
  if len(filename_collector.filenames) <= 0:
65
- raise Exception("Cannot download " + url)
66
 
67
  result = []
68
 
@@ -70,10 +78,20 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
70
  result.append(filename)
71
  print("Downloaded " + filename)
72
 
73
- return result
74
 
75
  class ExceededMaximumDuration(Exception):
76
  def __init__(self, videoDuration, maxDuration, message):
77
  self.videoDuration = videoDuration
78
  self.maxDuration = maxDuration
79
- super().__init__(message)
 
 
 
 
 
 
 
 
 
 
 
5
  import yt_dlp
6
  from yt_dlp.postprocessor import PostProcessor
7
 
8
+ import io
9
+ from contextlib import redirect_stderr
10
+
11
  class FilenameCollectorPP(PostProcessor):
12
  def __init__(self):
13
  super(FilenameCollectorPP, self).__init__(None)
 
45
  if outputTemplate:
46
  ydl_opts['outtmpl'] = outputTemplate
47
 
48
+ errStrIO = EventStringIO(on_write=lambda text: print(f"\033[91m{text}\033[0m"))
49
+
50
  filename_collector = FilenameCollectorPP()
51
+ with redirect_stderr(errStrIO):
52
+ with YoutubeDL(ydl_opts) as ydl:
53
+ if maxDuration and maxDuration > 0:
54
+ info = ydl.extract_info(url, download=False)
55
+ entries = "entries" in info and info["entries"] or [info]
56
 
57
+ total_duration = 0
 
 
 
58
 
59
+ # Compute total duration
60
+ for entry in entries:
61
+ total_duration += float(entry["duration"])
62
 
63
+ if total_duration >= maxDuration:
64
+ raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
 
65
 
66
+ ydl.add_post_processor(filename_collector)
67
+ ydl.download([url])
68
 
69
+ errMsg = errStrIO.getvalue()
70
+ errMsg = [text for text in errMsg.split("\n") if text.startswith("ERROR")] if errMsg else ""
71
 
72
  if len(filename_collector.filenames) <= 0:
73
+ raise Exception(f"Cannot download {url}, " + "\n".join(errMsg) if errMsg else "")
74
 
75
  result = []
76
 
 
78
  result.append(filename)
79
  print("Downloaded " + filename)
80
 
81
+ return result
82
 
83
  class ExceededMaximumDuration(Exception):
84
  def __init__(self, videoDuration, maxDuration, message):
85
  self.videoDuration = videoDuration
86
  self.maxDuration = maxDuration
87
+ super().__init__(message)
88
+
89
class EventStringIO(io.StringIO):
    """In-memory text buffer that also reports every write to a callback.

    The buffer behaves like a regular ``io.StringIO`` (so ``getvalue()``
    still returns everything written), and additionally forwards each written
    chunk to ``on_write``. In this module it is used as the target of
    ``contextlib.redirect_stderr`` so that error output can be echoed
    immediately while also being kept for later inspection.

    Args:
        on_write: Optional callable invoked as ``on_write(text)`` after each
            successful write. When falsy, no callback is made.
        *args, **kwargs: Forwarded unchanged to ``io.StringIO``.
    """

    def __init__(self, on_write=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.on_write = on_write

    def write(self, text):
        """Write ``text`` to the buffer, notify the callback, return the count.

        Returns:
            The number of characters written, as required by the
            ``io.TextIOBase.write`` contract (callers may rely on it).
        """
        written = super().write(text)
        if self.on_write:
            self.on_write(text)
        return written
src/utils.py CHANGED
@@ -1,12 +1,11 @@
1
- import textwrap
2
- import unicodedata
3
- import re
4
 
5
  import zlib
6
  from typing import Iterator, TextIO, Union
7
  import tqdm
8
 
9
  import urllib3
 
10
 
11
 
12
  def exact_div(x, y):
@@ -139,7 +138,7 @@ def write_srt_original(transcript: Iterator[dict], file: TextIO,
139
  file=file,
140
  flush=True)
141
 
142
- def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
143
  for segment in transcript:
144
  words: list = segment.get('words', [])
145
 
@@ -236,45 +235,67 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
236
  'text' : subtitle_text
237
  }
238
  if original_text is not None and len(original_text) > 0:
239
- result.update({'original': original_text})
240
  yield result
241
 
242
  def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
 
 
243
  if maxLineWidth is None or maxLineWidth < 0:
244
- return " ".join(words)
245
 
246
- lines = []
247
- current_line = ""
248
- current_length = 0
249
-
250
- for entry in words:
251
- # Either accept a string or a dict with a 'word' and 'length' field
252
- if isinstance(entry, dict):
253
- word = entry['word']
254
- word_length = entry['length']
255
- else:
256
- word = entry
257
- word_length = len(word)
258
-
259
- if current_length > 0 and current_length + word_length > maxLineWidth:
260
- lines.append(current_line)
261
- current_line = ""
262
- current_length = 0
263
-
264
- current_length += word_length
265
- # The word will be prefixed with a space by Whisper, so we don't need to add one here
266
- current_line += word
267
-
268
- if len(current_line) > 0:
269
- lines.append(current_line)
270
-
271
- return "\n".join(lines)
272
 
273
  def process_text(text: str, maxLineWidth=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  if (maxLineWidth is None or maxLineWidth < 0):
275
  return text
276
 
277
- lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  return '\n'.join(lines)
279
 
280
  def slugify(value, allow_unicode=False, is_lower=False):
 
1
+ import re
 
 
2
 
3
  import zlib
4
  from typing import Iterator, TextIO, Union
5
  import tqdm
6
 
7
  import urllib3
8
+ import unicodedata
9
 
10
 
11
  def exact_div(x, y):
 
138
  file=file,
139
  flush=True)
140
 
141
+ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
142
  for segment in transcript:
143
  words: list = segment.get('words', [])
144
 
 
235
  'text' : subtitle_text
236
  }
237
  if original_text is not None and len(original_text) > 0:
238
+ result.update({'original': process_text(original_text, maxLineWidth)})
239
  yield result
240
 
241
def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
    """Concatenate word entries into one subtitle text and wrap it if requested.

    Args:
        words: The words to join. Each entry is either a plain string or a
            dict carrying the text under the ``'word'`` key (the dict form
            may also carry a ``'length'`` field, which is not needed here
            because wrapping is based on the display width of the text).
        maxLineWidth: Maximum display width of a line. ``None`` or a negative
            value disables wrapping.

    Returns:
        The joined text, wrapped by ``process_text`` when ``maxLineWidth`` is
        a non-negative number.
    """
    # Words are concatenated without a separator: presumably each word
    # already carries its own leading space (as Whisper emits them) - verify
    # against the caller.
    # Dict entries are unwrapped explicitly; "".join() on its own would raise
    # TypeError for the dict form that the signature allows.
    result = "".join(
        entry['word'] if isinstance(entry, dict) else entry
        for entry in words
    )

    if maxLineWidth is None or maxLineWidth < 0:
        return result

    return process_text(result, maxLineWidth)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
def _display_width(text: str) -> int:
    """Return the column width of ``text``: 2 per Wide/Fullwidth char, else 1."""
    return sum(
        2 if unicodedata.east_asian_width(char) in ('W', 'F') else 1
        for char in text
    )


def process_text(text: str, maxLineWidth=None):
    """
    Wrap ``text`` so that no line is wider than ``maxLineWidth`` columns.

    Use east_asian_width to automatically determine the Character Width of
    the string, replacing the textwrap.wrap function. Wide (W) and Fullwidth
    (F) characters count as two columns, every other character as one.

    # East_Asian_Width (ea)

    ea ; A         ; Ambiguous
    ea ; F         ; Fullwidth
    ea ; H         ; Halfwidth
    ea ; N         ; Neutral
    ea ; Na        ; Narrow
    ea ; W         ; Wide
    https://stackoverflow.com/a/31666966

    Line breaks are only inserted between whitespace-separated words or
    directly in front of a Wide/Fullwidth character. A single unbreakable run
    that is wider than ``maxLineWidth`` is kept intact on a line of its own.
    Runs of whitespace (including existing newlines) collapse to one space.

    Args:
        text: The text to wrap.
        maxLineWidth: Maximum line width in columns. ``None`` or a negative
            value disables wrapping and returns ``text`` unchanged.

    Returns:
        The wrapped text with lines joined by ``"\\n"``. The result never
        contains empty lines or trailing spaces, both of which would corrupt
        subtitle cues.
    """
    if (maxLineWidth is None or maxLineWidth < 0):
        return text

    lines = []
    currentLine = ""
    currentWidth = 0

    for word in text.split():
        # Split the word into unbreakable units: a new unit starts at the
        # beginning of the word and in front of every wide character.
        units = []
        for char in word:
            if units and _display_width(char) == 1:
                units[-1] += char
            else:
                units.append(char)

        for unitIdx, unit in enumerate(units):
            # Only the first unit of a word is separated from the previous
            # word by a space, and only when the line already has content.
            separator = " " if unitIdx == 0 and currentLine else ""
            unitWidth = _display_width(unit)

            if currentLine and currentWidth + len(separator) + unitWidth > maxLineWidth:
                # Flush without the separator so no line ends in a space, and
                # start the new line's width from the unit alone.
                lines.append(currentLine)
                currentLine = unit
                currentWidth = unitWidth
            else:
                currentLine += separator + unit
                currentWidth += len(separator) + unitWidth

    if currentLine:
        lines.append(currentLine)

    return '\n'.join(lines)
300
 
301
  def slugify(value, allow_unicode=False, is_lower=False):