whisper-webui-translate

Sleeping

avans06 committed on Dec 24, 2023

Commit

922fe2a

1 Parent(s): 67af9e2

Display detailed reasons for video download failure, improve subtitle line break mechanism, and fix the issue of whisper filter log display.

1. Display the reasons for video download failure on the front end when yt-dlp fails to download a video.

2. Improved line-break handling for the maximum subtitle line length: lines are now wrapped automatically based on character display width (East Asian wide and fullwidth characters count as two columns, all other characters as one).

3. Fixed the issue where the whisper segments filter log was not displayed completely when processing multiple video files at once.

Files changed (3) hide show

app.py +14 -17
src/download.py +33 -15
src/utils.py +54 -33

app.py CHANGED Viewed

@@ -369,7 +369,7 @@ class WhisperTranscriber:
                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_m2m100_name(madlad400LangName)
                 elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
-                    selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "facebook/seamless-m4t-v2-large"
                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
@@ -383,6 +383,7 @@ class WhisperTranscriber:
                 zip_file_lookup = {}
                 text = ""
                 vtt = ""
                 # Write result
                 downloadDirectory = tempfile.mkdtemp()
@@ -418,9 +419,9 @@ class WhisperTranscriber:
                     # Transcribe
                     result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
                     filterLog = result.get("filterLog", None)
-                    filterLogText = [gr.Text.update(visible=False)]
                     if filterLog:
-                        filterLogText = [gr.Text.update(visible=True, value=filterLog)]
                     if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
                         whisperLang = get_lang_from_whisper_code(result["language"])
                         translationModel.whisperLang = whisperLang
@@ -499,6 +500,10 @@ class WhisperTranscriber:
                             zip.write(download_file, arcname=zip_file_name)
                     download.insert(0, downloadAllPath)
                 return [download, text, vtt] + filterLogText
@@ -656,10 +661,11 @@ class WhisperTranscriber:
                             if isFilter: break
                     if isFilter: break
                 if isFilter:
-                    filterIdx += 1
-                    filterLog.append(f"filter{filterIdx:03d} [{filterCondition}]:")
                     filterLog.append(f"\t{querySegment}\n")
                     del querySegmentsResult[currentID]
             return querySegmentsResult, "\n".join(filterLog)
         except Exception as e:
@@ -740,7 +746,7 @@ class WhisperTranscriber:
         text = result["text"]
         segments = result["segments"]
         language = result["language"]
-        languageMaxLineWidth = self.__get_max_line_width(language)
         if translationModel is not None and translationModel.translationLang is not None:
             try:
@@ -769,7 +775,7 @@ class WhisperTranscriber:
                 print(traceback.format_exc())
                 print("Error process segments: " + str(e))
-        print("Max line width " + str(languageMaxLineWidth) + " for language:" + language)
         vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         json_result = json.dumps(result, indent=4, ensure_ascii=False)
@@ -827,15 +833,6 @@ class WhisperTranscriber:
     def __get_source(self, urlData, multipleFiles, microphoneData):
         return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
-    def __get_max_line_width(self, language: str) -> int:
-        if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
-            # Chinese characters and kana are wider, so limit line length to 40 characters
-            return 40
-        else:
-            # TODO: Add more languages
-            # 80 latin characters should fit on a 1080p/720p screen
-            return 80
     def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
         segmentStream = StringIO()
@@ -1010,7 +1007,7 @@ def create_ui(app_config: ApplicationConfig):
     }
     common_output = lambda : [
-        gr.File(label="Download", elem_id="outputDownload"),
         gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
         gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
         gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),

                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_m2m100_name(madlad400LangName)
                 elif translateInput == "seamless" and seamlessLangName is not None and len(seamlessLangName) > 0:
+                    selectedModelName = seamlessModelName if seamlessModelName is not None and len(seamlessModelName) > 0 else "seamless-m4t-v2-large/facebook"
                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["seamless"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_seamlessTx_name(seamlessLangName)
                 zip_file_lookup = {}
                 text = ""
                 vtt = ""
+                filterLogs = ""
                 # Write result
                 downloadDirectory = tempfile.mkdtemp()
                     # Transcribe
                     result = self.transcribe_file(model, source.source_path, whisperLangCode, task, vadOptions, scaled_progress_listener, **decodeOptions)
                     filterLog = result.get("filterLog", None)
                     if filterLog:
+                        filterLogs += source.get_full_name() + ":\n" + filterLog + "\n\n"
                     if translationModel is not None and whisperLang is None and result["language"] is not None and len(result["language"]) > 0:
                         whisperLang = get_lang_from_whisper_code(result["language"])
                         translationModel.whisperLang = whisperLang
                             zip.write(download_file, arcname=zip_file_name)
                     download.insert(0, downloadAllPath)
+                filterLogText = [gr.Text.update(visible=False)]
+                if filterLogs:
+                    filterLogText = [gr.Text.update(visible=True, value=filterLogs)]
                 return [download, text, vtt] + filterLogText
                             if isFilter: break
                     if isFilter: break
                 if isFilter:
                     filterLog.append(f"\t{querySegment}\n")
                     del querySegmentsResult[currentID]
+            if filterLog:
+                filterLog = [f"filter{idx:03d} [{filterCondition}]:\n{log}" for idx, log in enumerate(reversed(filterLog))]
             return querySegmentsResult, "\n".join(filterLog)
         except Exception as e:
         text = result["text"]
         segments = result["segments"]
         language = result["language"]
+        languageMaxLineWidth = 80 #Use east_asian_width to automatically determine the Character Width of the string, replacing the __get_max_line_width function. 80 latin characters should fit on a 1080p/720p screen
         if translationModel is not None and translationModel.translationLang is not None:
             try:
                 print(traceback.format_exc())
                 print("Error process segments: " + str(e))
+        print("Max line Character Width " + str(languageMaxLineWidth) + " for language:" + language)
         vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         json_result = json.dumps(result, indent=4, ensure_ascii=False)
     def __get_source(self, urlData, multipleFiles, microphoneData):
         return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
     def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
         segmentStream = StringIO()
     }
     common_output = lambda : [
+        gr.File(label="Download", height=200, elem_id="outputDownload"),
         gr.Text(label="Transcription", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputTranscription", elem_classes="scroll-show"),
         gr.Text(label="Segments", autoscroll=False, show_copy_button=True, interactive=True, elem_id="outputSegments", elem_classes="scroll-show"),
         gr.Text(label="Filtered segment items", autoscroll=False, visible=False, show_copy_button=True, interactive=True, elem_id="outputFiltered", elem_classes="scroll-show"),

src/download.py CHANGED Viewed

@@ -5,6 +5,9 @@ from yt_dlp import YoutubeDL
 import yt_dlp
 from yt_dlp.postprocessor import PostProcessor
 class FilenameCollectorPP(PostProcessor):
     def __init__(self):
         super(FilenameCollectorPP, self).__init__(None)
@@ -42,27 +45,32 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
     if outputTemplate:
         ydl_opts['outtmpl'] = outputTemplate
     filename_collector = FilenameCollectorPP()
-    with YoutubeDL(ydl_opts) as ydl:
-        if maxDuration and maxDuration > 0:
-            info = ydl.extract_info(url, download=False)
-            entries = "entries" in info and info["entries"] or [info]
-            total_duration = 0
-            # Compute total duration
-            for entry in entries:
-                total_duration += float(entry["duration"])
-            if total_duration >= maxDuration:
-                raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
-        ydl.add_post_processor(filename_collector)
-        ydl.download([url])
     if len(filename_collector.filenames) <= 0:
-        raise Exception("Cannot download " + url)
     result = []
@@ -70,10 +78,20 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
         result.append(filename)
         print("Downloaded " + filename)
-    return result
 class ExceededMaximumDuration(Exception):
     def __init__(self, videoDuration, maxDuration, message):
         self.videoDuration = videoDuration
         self.maxDuration = maxDuration
-        super().__init__(message)

 import yt_dlp
 from yt_dlp.postprocessor import PostProcessor
+import io
+from contextlib import redirect_stderr
 class FilenameCollectorPP(PostProcessor):
     def __init__(self):
         super(FilenameCollectorPP, self).__init__(None)
     if outputTemplate:
         ydl_opts['outtmpl'] = outputTemplate
+    errStrIO = EventStringIO(on_write=lambda text: print(f"\033[91m{text}\033[0m"))
     filename_collector = FilenameCollectorPP()
+    with redirect_stderr(errStrIO):
+        with YoutubeDL(ydl_opts) as ydl:
+            if maxDuration and maxDuration > 0:
+                info = ydl.extract_info(url, download=False)
+                entries = "entries" in info and info["entries"] or [info]
+                total_duration = 0
+                # Compute total duration
+                for entry in entries:
+                    total_duration += float(entry["duration"])
+                if total_duration >= maxDuration:
+                    raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
+            ydl.add_post_processor(filename_collector)
+            ydl.download([url])
+    errMsg = errStrIO.getvalue()
+    errMsg = [text for text in errMsg.split("\n") if text.startswith("ERROR")] if errMsg else ""
     if len(filename_collector.filenames) <= 0:
+        raise Exception(f"Cannot download {url}, " + "\n".join(errMsg) if errMsg else "")
     result = []
         result.append(filename)
         print("Downloaded " + filename)
+    return result
 # Raised by the download code in this file when the summed duration of the
 # requested video(s) is greater than or equal to the allowed maximum.
 class ExceededMaximumDuration(Exception):
     # videoDuration: total duration that was computed for the requested video(s).
     # maxDuration:   the limit that the total was compared against.
     # message:       human-readable text forwarded to the base Exception.
     def __init__(self, videoDuration, maxDuration, message):
         # Both values are kept as attributes so they remain available on the
         # raised exception object.
         self.videoDuration = videoDuration
         self.maxDuration = maxDuration
+        super().__init__(message)
class EventStringIO(io.StringIO):
    """In-memory text buffer that also reports every write to a callback.

    Args:
        on_write: Optional callable invoked as ``on_write(text)`` after each
            successful ``write``. When falsy, the buffer behaves like a plain
            ``io.StringIO``.
        *args, **kwargs: Forwarded unchanged to ``io.StringIO``.
    """

    def __init__(self, on_write=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.on_write = on_write

    def write(self, text):
        """Append ``text`` to the buffer, notify the callback, and return the
        number of characters written.

        Returning the count keeps the text-stream ``write`` contract intact
        for callers that rely on it.
        """
        written = super().write(text)
        if self.on_write:
            self.on_write(text)
        return written

src/utils.py CHANGED Viewed

@@ -1,12 +1,11 @@
-import textwrap
-import unicodedata
-import re
 import zlib
 from typing import Iterator, TextIO, Union
 import tqdm
 import urllib3
 def exact_div(x, y):
@@ -139,7 +138,7 @@ def write_srt_original(transcript: Iterator[dict], file: TextIO,
         file=file,
         flush=True)
-def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
     for segment in transcript:
         words: list = segment.get('words', [])
@@ -236,45 +235,67 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i
                 'text' : subtitle_text
             }
             if original_text is not None and len(original_text) > 0:
-                result.update({'original': original_text})
             yield result
 def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
     if maxLineWidth is None or maxLineWidth < 0:
-        return " ".join(words)
-    lines = []
-    current_line = ""
-    current_length = 0
-    for entry in words:
-        # Either accept a string or a dict with a 'word' and 'length' field
-        if isinstance(entry, dict):
-            word = entry['word']
-            word_length = entry['length']
-        else:
-            word = entry
-            word_length = len(word)
-        if current_length > 0 and current_length + word_length > maxLineWidth:
-            lines.append(current_line)
-            current_line = ""
-            current_length = 0
-        current_length += word_length
-        # The word will be prefixed with a space by Whisper, so we don't need to add one here
-        current_line += word
-    if len(current_line) > 0:
-        lines.append(current_line)
-    return "\n".join(lines)
 def process_text(text: str, maxLineWidth=None):
     if (maxLineWidth is None or maxLineWidth < 0):
         return text
-    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
     return '\n'.join(lines)
 def slugify(value, allow_unicode=False, is_lower=False):

+import re
 import zlib
 from typing import Iterator, TextIO, Union
 import tqdm
 import urllib3
+import unicodedata
 def exact_div(x, y):
         file=file,
         flush=True)
+def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
     for segment in transcript:
         words: list = segment.get('words', [])
                 'text' : subtitle_text
             }
             if original_text is not None and len(original_text) > 0:
+                result.update({'original': process_text(original_text, maxLineWidth)})
             yield result
 def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
+    result = "".join(words)
     if maxLineWidth is None or maxLineWidth < 0:
+        return result
+    return process_text(result, maxLineWidth)
 def process_text(text: str, maxLineWidth=None):
+    """
+    Use east_asian_width to automatically determine the Character Width of the string, replacing the textwrap.wrap function.
+    # East_Asian_Width (ea)
+    ea ; A         ; Ambiguous
+    ea ; F         ; Fullwidth
+    ea ; H         ; Halfwidth
+    ea ; N         ; Neutral
+    ea ; Na        ; Narrow
+    ea ; W         ; Wide
+    https://stackoverflow.com/a/31666966
+    """
     if (maxLineWidth is None or maxLineWidth < 0):
         return text
+    lines = []
+    currentLine = ""
+    currentWidth = 0
+    for word in text.split():
+        wordWidth = 0
+        wordStart = 0
+        if currentLine:
+            currentLine += " "
+            wordWidth += 1
+        for wordIdx, char in enumerate(word):
+            if unicodedata.east_asian_width(char) not in {'W', 'F'}:
+                wordWidth += 1
+            else:
+                if currentWidth + wordWidth + 2 > maxLineWidth:
+                    lines.append(currentLine + word[wordStart:wordIdx])
+                    currentLine = ""
+                    currentWidth = 0
+                    wordStart = wordIdx
+                    wordWidth = 0
+                wordWidth += 2
+        if currentWidth + wordWidth > maxLineWidth:
+            lines.append(currentLine)
+            currentLine = word[wordStart:]
+            currentWidth = wordWidth
+        else:
+            currentLine += word[wordStart:]
+            currentWidth += wordWidth
+    if currentLine:
+        lines.append(currentLine)
     return '\n'.join(lines)
 def slugify(value, allow_unicode=False, is_lower=False):