Spaces:

aadnk
/

whisper-webui

Runtime error

App Files Files Community

aadnk committed on Dec 6, 2022

Commit

cb9ee50

1 Parent(s): 479b187

Add support for multiple input files and output files

Browse files

Files changed (4) hide show

app.py +77 -29
cli.py +0 -3
src/download.py +9 -3
src/source.py +70 -0

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import math
 from typing import Iterator
 import argparse
@@ -6,9 +7,11 @@ from io import StringIO
 import os
 import pathlib
 import tempfile
 import torch
 from src.modelCache import ModelCache
 from src.vadParallel import ParallelContext, ParallelTranscription
 # External programs
@@ -78,9 +81,9 @@ class WhisperTranscriber:
             self.vad_cpu_cores = min(os.cpu_count(), MAX_AUTO_CPU_CORES)
             print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
-    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
-            source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
@@ -88,22 +91,84 @@ class WhisperTranscriber:
                 model = WhisperContainer(model_name=selectedModel, cache=self.model_cache)
-                # Execute whisper
-                result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
                 # Write result
                 downloadDirectory = tempfile.mkdtemp()
-                filePrefix = slugify(sourceName, allow_unicode=True)
-                download, text, vtt = self.write_result(result, filePrefix, downloadDirectory)
                 return download, text, vtt
             finally:
                 # Cleanup source
                 if self.deleteUploadedFiles:
-                    print("Deleting source file " + source)
-                    os.remove(source)
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
@@ -222,25 +287,8 @@ class WhisperTranscriber:
         self.model_cache.clear()
         self.vad_model = None
-    def __get_source(self, urlData, uploadFile, microphoneData):
-        if urlData:
-            # Download from YouTube
-            source = download_url(urlData, self.inputAudioMaxDuration)[0]
-        else:
-            # File input
-            source = uploadFile if uploadFile is not None else microphoneData
-            if self.inputAudioMaxDuration > 0:
-                # Calculate audio length
-                audioDuration = ffmpeg.probe(source)["format"]["duration"]
-                if float(audioDuration) > self.inputAudioMaxDuration:
-                    raise ExceededMaximumDuration(videoDuration=audioDuration, maxDuration=self.inputAudioMaxDuration, message="Video is too long")
-        file_path = pathlib.Path(source)
-        sourceName = file_path.stem[:MAX_FILE_PREFIX_LENGTH] + file_path.suffix
-        return source, sourceName
     def __get_max_line_width(self, language: str) -> int:
         if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
@@ -304,7 +352,7 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value=default_model_name, label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
-        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),

+from datetime import datetime
 import math
 from typing import Iterator
 import argparse
 import os
 import pathlib
 import tempfile
+import zipfile
 import torch
 from src.modelCache import ModelCache
+from src.source import get_audio_source_collection
 from src.vadParallel import ParallelContext, ParallelTranscription
 # External programs
             self.vad_cpu_cores = min(os.cpu_count(), MAX_AUTO_CPU_CORES)
             print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
+    def transcribe_webui(self, modelName, languageName, urlData, multipleFiles, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
+            sources = self.__get_source(urlData, multipleFiles, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 model = WhisperContainer(model_name=selectedModel, cache=self.model_cache)
+                # Result
+                download = []
+                zip_file_lookup = {}
+                text = ""
+                vtt = ""
                 # Write result
                 downloadDirectory = tempfile.mkdtemp()
+                source_index = 0
+                # Execute whisper
+                for source in sources:
+                    source_prefix = ""
+                    if (len(sources) > 1):
+                        # Prefix (minimum 2 digits)
+                        source_index += 1
+                        source_prefix = str(source_index).zfill(2) + "_"
+                        print("Transcribing ", source.source_path)
+                    # Transcribe
+                    result = self.transcribe_file(model, source.source_path, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
+                    filePrefix = slugify(source_prefix + source.get_short_name(), allow_unicode=True)
+                    source_download, source_text, source_vtt = self.write_result(result, filePrefix, downloadDirectory)
+                    if len(sources) > 1:
+                        # Add new line separators
+                        if (len(source_text) > 0):
+                            source_text += os.linesep + os.linesep
+                        if (len(source_vtt) > 0):
+                            source_vtt += os.linesep + os.linesep
+                        # Prepend the file name to the source text and VTT too
+                        source_text = source.get_full_name() + ":" + os.linesep + source_text
+                        source_vtt = source.get_full_name() + ":" + os.linesep + source_vtt
+                    # Add to result
+                    download.extend(source_download)
+                    text += source_text
+                    vtt += source_vtt
+                    if (len(sources) > 1):
+                        # Zip files support at least 260 characters, but we'll play it safe and use 200
+                        zipFilePrefix = slugify(source_prefix + source.get_short_name(max_length=200), allow_unicode=True)
+                        # File names in ZIP file can be longer
+                        for source_download_file in source_download:
+                            # Get file postfix (after last -)
+                            filePostfix = os.path.basename(source_download_file).split("-")[-1]
+                            zip_file_name = zipFilePrefix + "-" + filePostfix
+                            zip_file_lookup[source_download_file] = zip_file_name
+                # Create zip file from all sources
+                if len(sources) > 1:
+                    downloadAllPath = os.path.join(downloadDirectory, "All_Output-" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".zip")
+                    with zipfile.ZipFile(downloadAllPath, 'w', zipfile.ZIP_DEFLATED) as zip:
+                        for download_file in download:
+                            # Get file name from lookup
+                            zip_file_name = zip_file_lookup.get(download_file, os.path.basename(download_file))
+                            zip.write(download_file, arcname=zip_file_name)
+                    download.insert(0, downloadAllPath)
                 return download, text, vtt
             finally:
                 # Cleanup source
                 if self.deleteUploadedFiles:
+                    for source in sources:
+                        print("Deleting source file " + source.source_path)
+                        try:
+                            os.remove(source.source_path)
+                        except Exception as e:
+                            # Ignore error - it's just a cleanup
+                            print("Error deleting source file " + source.source_path + ": " + str(e))
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
         self.model_cache.clear()
         self.vad_model = None
+    def __get_source(self, urlData, multipleFiles, microphoneData):
+        return get_audio_source_collection(urlData, multipleFiles, microphoneData, self.inputAudioMaxDuration)
     def __get_max_line_width(self, language: str) -> int:
         if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value=default_model_name, label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
+        gr.File(label="Upload Files", file_count="multiple"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),

cli.py CHANGED Viewed

@@ -5,8 +5,6 @@ from urllib.parse import urlparse
 import warnings
 import numpy as np
-import whisper
 import torch
 from app import LANGUAGES, WhisperTranscriber
 from src.download import download_url
@@ -14,7 +12,6 @@ from src.download import download_url
 from src.utils import optional_float, optional_int, str2bool
 from src.whisperContainer import WhisperContainer
 def cli():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")

 import warnings
 import numpy as np
 import torch
 from app import LANGUAGES, WhisperTranscriber
 from src.download import download_url
 from src.utils import optional_float, optional_int, str2bool
 from src.whisperContainer import WhisperContainer
 def cli():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")

src/download.py CHANGED Viewed

@@ -46,10 +46,16 @@ def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = N
     with YoutubeDL(ydl_opts) as ydl:
         if maxDuration and maxDuration > 0:
             info = ydl.extract_info(url, download=False)
-            duration = info['duration']
-            if duration >= maxDuration:
-                raise ExceededMaximumDuration(videoDuration=duration, maxDuration=maxDuration, message="Video is too long")
         ydl.add_post_processor(filename_collector)
         ydl.download([url])

     with YoutubeDL(ydl_opts) as ydl:
         if maxDuration and maxDuration > 0:
             info = ydl.extract_info(url, download=False)
+            entries = "entries" in info and info["entries"] or [info]
+            total_duration = 0
+            # Compute total duration
+            for entry in entries:
+                total_duration += float(entry["duration"])
+            if total_duration >= maxDuration:
+                raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=maxDuration, message="Video is too long")
         ydl.add_post_processor(filename_collector)
         ydl.download([url])

src/source.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
+import os
+import pathlib
+from typing import List
+import zipfile
+import ffmpeg
+from more_itertools import unzip
+from src.download import ExceededMaximumDuration, download_url
+MAX_FILE_PREFIX_LENGTH = 17
+class AudioSource:
+    def __init__(self, source_path, source_name = None):
+        self.source_path = source_path
+        self.source_name = source_name
+        # Load source name if not provided
+        if (self.source_name is None):
+            file_path = pathlib.Path(self.source_path)
+            self.source_name = file_path.name
+    def get_full_name(self):
+        return self.source_name
+    def get_short_name(self, max_length: int = MAX_FILE_PREFIX_LENGTH):
+        file_path = pathlib.Path(self.source_name)
+        short_name = file_path.stem[:max_length] + file_path.suffix
+        return short_name
+    def __str__(self) -> str:
+        return self.source_path
+class AudioSourceCollection:
+    def __init__(self, sources: List[AudioSource]):
+        self.sources = sources
+    def __iter__(self):
+        return iter(self.sources)
+def get_audio_source_collection(urlData: str, multipleFiles: List, microphoneData: str, input_audio_max_duration: float = -1) -> List[AudioSource]:
+    output: List[AudioSource] = []
+    if urlData:
+        # Download from YouTube. This could also be a playlist or a channel.
+        output.extend([ AudioSource(x) for x in download_url(urlData, input_audio_max_duration, playlistItems=None) ])
+    else:
+        # Add input files
+        if (multipleFiles is not None):
+            output.extend([ AudioSource(x.name) for x in multipleFiles ])
+        if (microphoneData is not None):
+            output.append(AudioSource(microphoneData))
+        total_duration = 0
+        # Calculate total audio length. We do this even if input_audio_max_duration
+        # is disabled to ensure that all the audio files are valid.
+        for source in output:
+            audioDuration = ffmpeg.probe(source.source_path)["format"]["duration"]
+            total_duration += float(audioDuration)
+        # Ensure the total duration of the audio is not too long
+        if input_audio_max_duration > 0:
+            if float(total_duration) > input_audio_max_duration:
+                raise ExceededMaximumDuration(videoDuration=total_duration, maxDuration=input_audio_max_duration, message="Video(s) is too long")
+    # Return a list of audio sources
+    return output