jhj0517 committed
Commit abd7185
1 Parent(s): 78d87d3

initial_commit

.gitignore ADDED
@@ -0,0 +1,7 @@
+ venv/
+ ui/__pycache__/
+ outputs/
+ modules/__pycache__/
+ models/
+ modules/yt_tmp.wav
+ .idea/
Install.bat ADDED
@@ -0,0 +1,22 @@
+ @echo off
+
+ if not exist "%~dp0\venv\Scripts" (
+ echo Creating venv...
+ python -m venv venv
+ )
+
+ echo Checked the venv folder. Now installing requirements..
+ cd /d "%~dp0\venv\Scripts"
+ call activate.bat
+
+ cd /d "%~dp0"
+ pip install -r requirements.txt
+
+ if errorlevel 1 (
+ echo.
+ echo Requirements installation failed. Please remove the venv folder and run Install.bat again.
+ ) else (
+ echo.
+ echo Requirements installed successfully.
+ )
+ pause
Install.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/bash
+
+ if [ ! -d "venv" ]; then
+ echo "Creating virtual environment..."
+ python -m venv venv
+ fi
+
+ source venv/bin/activate
+ pip install -r requirements.txt
+
+ if [ $? -ne 0 ]; then
+ echo ""
+ echo "Requirements installation failed. Please remove the venv folder and run Install.sh again."
+ deactivate
+ exit 1
+ fi
+
+ echo ""
+ echo "Requirements installed successfully."
+
+ deactivate
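
Neither installer launches the app; both only create the venv and install requirements.txt. A minimal usage sketch on Linux/macOS, assuming `python` resolves to Python 3 and the requirements install cleanly:

bash Install.sh
source venv/bin/activate
python app.py

On Windows, Install.bat plays the same role, after which app.py is started from the activated venv in the same way.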
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 jhj0517
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py ADDED
@@ -0,0 +1,241 @@
1
+ import gradio as gr
2
+ import os
3
+ import argparse
4
+
5
+ from modules.whisper_Inference import WhisperInference
6
+ from modules.faster_whisper_inference import FasterWhisperInference
7
+ from modules.nllb_inference import NLLBInference
8
+ from ui.htmls import *
9
+ from modules.youtube_manager import get_ytmetas
10
+ from modules.deepl_api import DeepLAPI
11
+
12
+ class App:
13
+ def __init__(self, args):
14
+ self.args = args
15
+ self.app = gr.Blocks(css=CSS, theme=self.args.theme)
16
+ self.whisper_inf = WhisperInference() if self.args.disable_faster_whisper else FasterWhisperInference()
17
+ if isinstance(self.whisper_inf, FasterWhisperInference):
18
+ print("Use Faster Whisper implementation")
19
+ else:
20
+ print("Use Open AI Whisper implementation")
21
+ print(f"Device \"{self.whisper_inf.device}\" is detected")
22
+ self.nllb_inf = NLLBInference()
23
+ self.deepl_api = DeepLAPI()
24
+
25
+ @staticmethod
26
+ def open_folder(folder_path: str):
27
+ if os.path.exists(folder_path):
28
+ os.system(f"start {folder_path}")
29
+ else:
30
+ print(f"The folder {folder_path} does not exist.")
31
+
32
+ @staticmethod
33
+ def on_change_models(model_size: str):
34
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
35
+ if model_size not in translatable_model:
36
+ return gr.Checkbox(visible=False, value=False, interactive=False)
37
+ else:
38
+ return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
39
+
40
+ def launch(self):
41
+ with self.app:
42
+ with gr.Row():
43
+ with gr.Column():
44
+ gr.Markdown(MARKDOWN, elem_id="md_project")
45
+ with gr.Tabs():
46
+ with gr.TabItem("File"): # tab1
47
+ with gr.Row():
48
+ input_file = gr.Files(type="filepath", label="Upload File here")
49
+ with gr.Row():
50
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
51
+ label="Model")
52
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
53
+ value="Automatic Detection", label="Language")
54
+ dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
55
+ with gr.Row():
56
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
57
+ with gr.Row():
58
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
59
+ with gr.Accordion("Advanced_Parameters", open=False):
60
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
61
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
62
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
63
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
64
+ with gr.Row():
65
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
66
+ with gr.Row():
67
+ tb_indicator = gr.Textbox(label="Output", scale=4)
68
+ files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
69
+ btn_openfolder = gr.Button('📂', scale=1)
70
+
71
+ params = [input_file, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
72
+ advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
73
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
74
+ inputs=params + advanced_params,
75
+ outputs=[tb_indicator, files_subtitles])
76
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
77
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
78
+
79
+ with gr.TabItem("Youtube"): # tab2
80
+ with gr.Row():
81
+ tb_youtubelink = gr.Textbox(label="Youtube Link")
82
+ with gr.Row(equal_height=True):
83
+ with gr.Column():
84
+ img_thumbnail = gr.Image(label="Youtube Thumbnail")
85
+ with gr.Column():
86
+ tb_title = gr.Label(label="Youtube Title")
87
+ tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
88
+ with gr.Row():
89
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
90
+ label="Model")
91
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
92
+ value="Automatic Detection", label="Language")
93
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
94
+ with gr.Row():
95
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
96
+ with gr.Row():
97
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
98
+ interactive=True)
99
+ with gr.Accordion("Advanced_Parameters", open=False):
100
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
101
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
102
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
103
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
104
+ with gr.Row():
105
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
106
+ with gr.Row():
107
+ tb_indicator = gr.Textbox(label="Output", scale=4)
108
+ files_subtitles = gr.Files(label="Downloadable output file", scale=4)
109
+ btn_openfolder = gr.Button('📂', scale=1)
110
+
111
+ params = [tb_youtubelink, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
112
+ advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
113
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
114
+ inputs=params + advanced_params,
115
+ outputs=[tb_indicator, files_subtitles])
116
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
117
+ outputs=[img_thumbnail, tb_title, tb_description])
118
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
119
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
120
+
121
+ with gr.TabItem("Mic"): # tab3
122
+ with gr.Row():
123
+ mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
124
+ with gr.Row():
125
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v3",
126
+ label="Model")
127
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
128
+ value="Automatic Detection", label="Language")
129
+ dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
130
+ with gr.Row():
131
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
132
+ with gr.Accordion("Advanced_Parameters", open=False):
133
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
134
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
135
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
136
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
137
+ with gr.Row():
138
+ btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
139
+ with gr.Row():
140
+ tb_indicator = gr.Textbox(label="Output", scale=4)
141
+ files_subtitles = gr.Files(label="Downloadable output file", scale=4)
142
+ btn_openfolder = gr.Button('📂', scale=1)
143
+
144
+ params = [mic_input, dd_model, dd_lang, dd_file_format, cb_translate]
145
+ advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
146
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
147
+ inputs=params + advanced_params,
148
+ outputs=[tb_indicator, files_subtitles])
149
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
150
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
151
+
152
+ with gr.TabItem("T2T Translation"): # tab 4
153
+ with gr.Row():
154
+ file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
155
+ file_types=['.vtt', '.srt'])
156
+
157
+ with gr.TabItem("DeepL API"): # sub tab1
158
+ with gr.Row():
159
+ tb_authkey = gr.Textbox(label="Your Auth Key (API KEY)",
160
+ value="")
161
+ with gr.Row():
162
+ dd_deepl_sourcelang = gr.Dropdown(label="Source Language", value="Automatic Detection",
163
+ choices=list(
164
+ self.deepl_api.available_source_langs.keys()))
165
+ dd_deepl_targetlang = gr.Dropdown(label="Target Language", value="English",
166
+ choices=list(
167
+ self.deepl_api.available_target_langs.keys()))
168
+ with gr.Row():
169
+ cb_deepl_ispro = gr.Checkbox(label="Pro User?", value=False)
170
+ with gr.Row():
171
+ btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
172
+ with gr.Row():
173
+ tb_indicator = gr.Textbox(label="Output", scale=4)
174
+ files_subtitles = gr.Files(label="Downloadable output file", scale=4)
175
+ btn_openfolder = gr.Button('📂', scale=1)
176
+
177
+ btn_run.click(fn=self.deepl_api.translate_deepl,
178
+ inputs=[tb_authkey, file_subs, dd_deepl_sourcelang, dd_deepl_targetlang,
179
+ cb_deepl_ispro],
180
+ outputs=[tb_indicator, files_subtitles])
181
+
182
+ btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
183
+ inputs=None,
184
+ outputs=None)
185
+
186
+ with gr.TabItem("NLLB"): # sub tab2
187
+ with gr.Row():
188
+ dd_nllb_model = gr.Dropdown(label="Model", value=self.nllb_inf.default_model_size,
189
+ choices=self.nllb_inf.available_models)
190
+ dd_nllb_sourcelang = gr.Dropdown(label="Source Language",
191
+ choices=self.nllb_inf.available_source_langs)
192
+ dd_nllb_targetlang = gr.Dropdown(label="Target Language",
193
+ choices=self.nllb_inf.available_target_langs)
194
+ with gr.Row():
195
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
196
+ interactive=True)
197
+ with gr.Row():
198
+ btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
199
+ with gr.Row():
200
+ tb_indicator = gr.Textbox(label="Output", scale=4)
201
+ files_subtitles = gr.Files(label="Downloadable output file", scale=4)
202
+ btn_openfolder = gr.Button('📂', scale=1)
203
+ with gr.Column():
204
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
205
+
206
+ btn_run.click(fn=self.nllb_inf.translate_file,
207
+ inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, cb_timestamp],
208
+ outputs=[tb_indicator, files_subtitles])
209
+
210
+ btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
211
+ inputs=None,
212
+ outputs=None)
213
+
214
+ # Launch the app with optional gradio settings
215
+ launch_args = {}
216
+ if self.args.share:
217
+ launch_args['share'] = self.args.share
218
+ if self.args.server_name:
219
+ launch_args['server_name'] = self.args.server_name
220
+ if self.args.server_port:
221
+ launch_args['server_port'] = self.args.server_port
222
+ if self.args.username and self.args.password:
223
+ launch_args['auth'] = (self.args.username, self.args.password)
224
+ self.app.queue(api_open=False).launch(**launch_args)
225
+
226
+
227
+ # Create the parser for command-line arguments
228
+ parser = argparse.ArgumentParser()
229
+ parser.add_argument('--disable_faster_whisper', type=bool, default=False, nargs='?', const=True, help='Disable the faster_whisper implementation. faster_whisper is implemented by https://github.com/guillaumekln/faster-whisper')
230
+ parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
231
+ parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
232
+ parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
233
+ parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
234
+ parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
235
+ parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
236
+ parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Whether the app is running on Google Colab')
237
+ _args = parser.parse_args()
238
+
239
+ if __name__ == "__main__":
240
+ app = App(args=_args)
241
+ app.launch()
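
All of the flags registered above are optional; a hedged example invocation (host, port, and credentials below are placeholders, not values from this commit):

python app.py --server_name 0.0.0.0 --server_port 7860 --username admin --password changeme --share

Passing --disable_faster_whisper falls back to the OpenAI Whisper implementation (WhisperInference) selected in App.__init__, and --theme is forwarded to gr.Blocks(theme=...).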
modules/__init__.py ADDED
File without changes
modules/base_interface.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ import torch
+ from typing import List
+
+
+ class BaseInterface:
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def release_cuda_memory():
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             torch.cuda.reset_max_memory_allocated()
+
+     @staticmethod
+     def remove_input_files(file_paths: List[str]):
+         for file_path in file_paths:
+             if file_path is None or not os.path.exists(file_path):
+                 continue
+             os.remove(file_path)
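
Both helpers are static, so subclasses (and ad-hoc scripts) can call them without any model state. A minimal sketch; the file path below is hypothetical:

from modules.base_interface import BaseInterface

BaseInterface.release_cuda_memory()                       # no-op on machines without CUDA
BaseInterface.remove_input_files(["modules/yt_tmp.wav"])  # None or missing paths are skipped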
modules/deepl_api.py ADDED
@@ -0,0 +1,196 @@
1
+ import requests
2
+ import time
3
+ import os
4
+ from datetime import datetime
5
+ import gradio as gr
6
+
7
+ from modules.subtitle_manager import *
8
+
9
+ """
10
+ This is written with reference to the DeepL API documentation.
11
+ If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
12
+ """
13
+
14
+ DEEPL_AVAILABLE_TARGET_LANGS = {
15
+ 'Bulgarian': 'BG',
16
+ 'Czech': 'CS',
17
+ 'Danish': 'DA',
18
+ 'German': 'DE',
19
+ 'Greek': 'EL',
20
+ 'English': 'EN',
21
+ 'English (British)': 'EN-GB',
22
+ 'English (American)': 'EN-US',
23
+ 'Spanish': 'ES',
24
+ 'Estonian': 'ET',
25
+ 'Finnish': 'FI',
26
+ 'French': 'FR',
27
+ 'Hungarian': 'HU',
28
+ 'Indonesian': 'ID',
29
+ 'Italian': 'IT',
30
+ 'Japanese': 'JA',
31
+ 'Korean': 'KO',
32
+ 'Lithuanian': 'LT',
33
+ 'Latvian': 'LV',
34
+ 'Norwegian (Bokmål)': 'NB',
35
+ 'Dutch': 'NL',
36
+ 'Polish': 'PL',
37
+ 'Portuguese': 'PT',
38
+ 'Portuguese (Brazilian)': 'PT-BR',
39
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
40
+ 'Romanian': 'RO',
41
+ 'Russian': 'RU',
42
+ 'Slovak': 'SK',
43
+ 'Slovenian': 'SL',
44
+ 'Swedish': 'SV',
45
+ 'Turkish': 'TR',
46
+ 'Ukrainian': 'UK',
47
+ 'Chinese (simplified)': 'ZH'
48
+ }
49
+
50
+ DEEPL_AVAILABLE_SOURCE_LANGS = {
51
+ 'Automatic Detection': None,
52
+ 'Bulgarian': 'BG',
53
+ 'Czech': 'CS',
54
+ 'Danish': 'DA',
55
+ 'German': 'DE',
56
+ 'Greek': 'EL',
57
+ 'English': 'EN',
58
+ 'Spanish': 'ES',
59
+ 'Estonian': 'ET',
60
+ 'Finnish': 'FI',
61
+ 'French': 'FR',
62
+ 'Hungarian': 'HU',
63
+ 'Indonesian': 'ID',
64
+ 'Italian': 'IT',
65
+ 'Japanese': 'JA',
66
+ 'Korean': 'KO',
67
+ 'Lithuanian': 'LT',
68
+ 'Latvian': 'LV',
69
+ 'Norwegian (Bokmål)': 'NB',
70
+ 'Dutch': 'NL',
71
+ 'Polish': 'PL',
72
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
73
+ 'Romanian': 'RO',
74
+ 'Russian': 'RU',
75
+ 'Slovak': 'SK',
76
+ 'Slovenian': 'SL',
77
+ 'Swedish': 'SV',
78
+ 'Turkish': 'TR',
79
+ 'Ukrainian': 'UK',
80
+ 'Chinese': 'ZH'
81
+ }
82
+
83
+
84
+ class DeepLAPI:
85
+ def __init__(self):
86
+ self.api_interval = 1
87
+ self.max_text_batch_size = 50
88
+ self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
89
+ self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
90
+
91
+ def translate_deepl(self,
92
+ auth_key: str,
93
+ fileobjs: list,
94
+ source_lang: str,
95
+ target_lang: str,
96
+ is_pro: bool,
97
+ progress=gr.Progress()) -> list:
98
+ """
99
+ Translate subtitle files using DeepL API
100
+ Parameters
101
+ ----------
102
+ auth_key: str
103
+ API Key for DeepL from gr.Textbox()
104
+ fileobjs: list
105
+ List of files to transcribe from gr.Files()
106
+ source_lang: str
107
+ Source language of the file to transcribe from gr.Dropdown()
108
+ target_lang: str
109
+ Target language of the file to transcribe from gr.Dropdown()
110
+ is_pro: bool
111
+ Boolean value from gr.Checkbox() indicating whether the user is on a DeepL Pro plan.
112
+ progress: gr.Progress
113
+ Indicator to show progress directly in gradio.
114
+ Returns
115
+ ----------
116
+ A List of
117
+ String to return to gr.Textbox()
118
+ Files to return to gr.Files()
119
+ """
120
+
121
+ files_info = {}
122
+ for fileobj in fileobjs:
123
+ file_path = fileobj.name
124
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
125
+
126
+ if file_ext == ".srt":
127
+ parsed_dicts = parse_srt(file_path=file_path)
128
+
129
+ batch_size = self.max_text_batch_size
130
+ for batch_start in range(0, len(parsed_dicts), batch_size):
131
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
132
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
133
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
134
+ target_lang, is_pro)
135
+ for i, translated_text in enumerate(translated_texts):
136
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
137
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
138
+
139
+ subtitle = get_serialized_srt(parsed_dicts)
140
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
141
+
142
+ file_name = file_name[:-9]
143
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.srt")
144
+ write_file(subtitle, output_path)
145
+
146
+ elif file_ext == ".vtt":
147
+ parsed_dicts = parse_vtt(file_path=file_path)
148
+
149
+ batch_size = self.max_text_batch_size
150
+ for batch_start in range(0, len(parsed_dicts), batch_size):
151
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
152
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
153
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
154
+ target_lang, is_pro)
155
+ for i, translated_text in enumerate(translated_texts):
156
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
157
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
158
+
159
+ subtitle = get_serialized_vtt(parsed_dicts)
160
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
161
+
162
+ file_name = file_name[:-9]
163
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.vtt")
164
+
165
+ write_file(subtitle, output_path)
166
+
167
+ files_info[file_name] = subtitle
168
+ total_result = ''
169
+ for file_name, subtitle in files_info.items():
170
+ total_result += '------------------------------------\n'
171
+ total_result += f'{file_name}\n\n'
172
+ total_result += f'{subtitle}'
173
+
174
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
175
+ return [gr_str, output_path]
176
+
177
+ def request_deepl_translate(self,
178
+ auth_key: str,
179
+ text: list,
180
+ source_lang: str,
181
+ target_lang: str,
182
+ is_pro: bool):
183
+ """Request API response to DeepL server"""
184
+
185
+ url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
186
+ headers = {
187
+ 'Authorization': f'DeepL-Auth-Key {auth_key}'
188
+ }
189
+ data = {
190
+ 'text': text,
191
+ 'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
192
+ 'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
193
+ }
194
+ response = requests.post(url, headers=headers, data=data).json()
195
+ time.sleep(self.api_interval)
196
+ return response["translations"]
modules/faster_whisper_inference.py ADDED
@@ -0,0 +1,438 @@
1
+ import os
2
+
3
+ import tqdm
4
+ import time
5
+ import numpy as np
6
+ from typing import BinaryIO, Union, Tuple
7
+ from datetime import datetime, timedelta
8
+
9
+ import faster_whisper
10
+ import ctranslate2
11
+ import whisper
12
+ import torch
13
+ import gradio as gr
14
+
15
+ from .base_interface import BaseInterface
16
+ from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
17
+ from modules.youtube_manager import get_ytdata, get_ytaudio
18
+
19
+
20
+ class FasterWhisperInference(BaseInterface):
21
+ def __init__(self):
22
+ super().__init__()
23
+ self.current_model_size = None
24
+ self.model = None
25
+ self.available_models = whisper.available_models()
26
+ self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
27
+ self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
28
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
29
+ self.available_compute_types = ctranslate2.get_supported_compute_types("cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
30
+ self.current_compute_type = "float16" if self.device == "cuda" else "float32"
31
+ self.default_beam_size = 1
32
+
33
+ def transcribe_file(self,
34
+ fileobjs: list,
35
+ model_size: str,
36
+ lang: str,
37
+ file_format: str,
38
+ istranslate: bool,
39
+ add_timestamp: bool,
40
+ beam_size: int,
41
+ log_prob_threshold: float,
42
+ no_speech_threshold: float,
43
+ compute_type: str,
44
+ progress=gr.Progress()
45
+ ) -> list:
46
+ """
47
+ Write subtitle file from Files
48
+
49
+ Parameters
50
+ ----------
51
+ fileobjs: list
52
+ List of files to transcribe from gr.Files()
53
+ model_size: str
54
+ Whisper model size from gr.Dropdown()
55
+ lang: str
56
+ Source language of the file to transcribe from gr.Dropdown()
57
+ file_format: str
58
+ File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
59
+ istranslate: bool
60
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
61
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
62
+ add_timestamp: bool
63
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
64
+ beam_size: int
65
+ Int value from gr.Number() that is used for decoding option.
66
+ log_prob_threshold: float
67
+ float value from gr.Number(). If the average log probability over sampled tokens is
68
+ below this value, treat as failed.
69
+ no_speech_threshold: float
70
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
71
+ the average log probability over sampled tokens is below `log_prob_threshold`,
72
+ consider the segment as silent.
73
+ compute_type: str
74
+ compute type from gr.Dropdown().
75
+ see more info : https://opennmt.net/CTranslate2/quantization.html
76
+ progress: gr.Progress
77
+ Indicator to show progress directly in gradio.
78
+
79
+ Returns
80
+ ----------
81
+ A List of
82
+ String to return to gr.Textbox()
83
+ Files to return to gr.Files()
84
+ """
85
+ try:
86
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
87
+
88
+ files_info = {}
89
+ for fileobj in fileobjs:
90
+ transcribed_segments, time_for_task = self.transcribe(
91
+ audio=fileobj.name,
92
+ lang=lang,
93
+ istranslate=istranslate,
94
+ beam_size=beam_size,
95
+ log_prob_threshold=log_prob_threshold,
96
+ no_speech_threshold=no_speech_threshold,
97
+ progress=progress
98
+ )
99
+
100
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
101
+ file_name = safe_filename(file_name)
102
+ subtitle, file_path = self.generate_and_write_file(
103
+ file_name=file_name,
104
+ transcribed_segments=transcribed_segments,
105
+ add_timestamp=add_timestamp,
106
+ file_format=file_format
107
+ )
108
+ files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
109
+
110
+ total_result = ''
111
+ total_time = 0
112
+ for file_name, info in files_info.items():
113
+ total_result += '------------------------------------\n'
114
+ total_result += f'{file_name}\n\n'
115
+ total_result += f'{info["subtitle"]}'
116
+ total_time += info["time_for_task"]
117
+
118
+ gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
119
+ gr_file_path = [info['path'] for info in files_info.values()]
120
+
121
+ return [gr_str, gr_file_path]
122
+
123
+ except Exception as e:
124
+ print(f"Error transcribing file on line {e}")
125
+ finally:
126
+ self.release_cuda_memory()
127
+ self.remove_input_files([fileobj.name for fileobj in fileobjs])
128
+
129
+ def transcribe_youtube(self,
130
+ youtubelink: str,
131
+ model_size: str,
132
+ lang: str,
133
+ file_format: str,
134
+ istranslate: bool,
135
+ add_timestamp: bool,
136
+ beam_size: int,
137
+ log_prob_threshold: float,
138
+ no_speech_threshold: float,
139
+ compute_type: str,
140
+ progress=gr.Progress()
141
+ ) -> list:
142
+ """
143
+ Write subtitle file from Youtube
144
+
145
+ Parameters
146
+ ----------
147
+ youtubelink: str
148
+ Link of Youtube to transcribe from gr.Textbox()
149
+ model_size: str
150
+ Whisper model size from gr.Dropdown()
151
+ lang: str
152
+ Source language of the file to transcribe from gr.Dropdown()
153
+ file_format: str
154
+ File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
155
+ istranslate: bool
156
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
157
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
158
+ add_timestamp: bool
159
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
160
+ beam_size: int
161
+ Int value from gr.Number() that is used for decoding option.
162
+ log_prob_threshold: float
163
+ float value from gr.Number(). If the average log probability over sampled tokens is
164
+ below this value, treat as failed.
165
+ no_speech_threshold: float
166
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
167
+ the average log probability over sampled tokens is below `log_prob_threshold`,
168
+ consider the segment as silent.
169
+ compute_type: str
170
+ compute type from gr.Dropdown().
171
+ see more info : https://opennmt.net/CTranslate2/quantization.html
172
+ progress: gr.Progress
173
+ Indicator to show progress directly in gradio.
174
+
175
+ Returns
176
+ ----------
177
+ A List of
178
+ String to return to gr.Textbox()
179
+ Files to return to gr.Files()
180
+ """
181
+ try:
182
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
183
+
184
+ progress(0, desc="Loading Audio from Youtube..")
185
+ yt = get_ytdata(youtubelink)
186
+ audio = get_ytaudio(yt)
187
+
188
+ transcribed_segments, time_for_task = self.transcribe(
189
+ audio=audio,
190
+ lang=lang,
191
+ istranslate=istranslate,
192
+ beam_size=beam_size,
193
+ log_prob_threshold=log_prob_threshold,
194
+ no_speech_threshold=no_speech_threshold,
195
+ progress=progress
196
+ )
197
+
198
+ progress(1, desc="Completed!")
199
+
200
+ file_name = safe_filename(yt.title)
201
+ subtitle, file_path = self.generate_and_write_file(
202
+ file_name=file_name,
203
+ transcribed_segments=transcribed_segments,
204
+ add_timestamp=add_timestamp,
205
+ file_format=file_format
206
+ )
207
+ gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
208
+
209
+ return [gr_str, file_path]
210
+
211
+ except Exception as e:
212
+ print(f"Error transcribing file on line {e}")
213
+ finally:
214
+ try:
215
+ if 'yt' not in locals():
216
+ yt = get_ytdata(youtubelink)
217
+ file_path = get_ytaudio(yt)
218
+ else:
219
+ file_path = get_ytaudio(yt)
220
+
221
+ self.release_cuda_memory()
222
+ self.remove_input_files([file_path])
223
+ except Exception as cleanup_error:
224
+ pass
225
+
226
+ def transcribe_mic(self,
227
+ micaudio: str,
228
+ model_size: str,
229
+ lang: str,
230
+ file_format: str,
231
+ istranslate: bool,
232
+ beam_size: int,
233
+ log_prob_threshold: float,
234
+ no_speech_threshold: float,
235
+ compute_type: str,
236
+ progress=gr.Progress()
237
+ ) -> list:
238
+ """
239
+ Write subtitle file from microphone
240
+
241
+ Parameters
242
+ ----------
243
+ micaudio: str
244
+ Audio file path from gr.Microphone()
245
+ model_size: str
246
+ Whisper model size from gr.Dropdown()
247
+ lang: str
248
+ Source language of the file to transcribe from gr.Dropdown()
249
+ file_format: str
250
+ File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
251
+ istranslate: bool
252
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
253
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
254
+ beam_size: int
255
+ Int value from gr.Number() that is used for decoding option.
256
+ log_prob_threshold: float
257
+ float value from gr.Number(). If the average log probability over sampled tokens is
258
+ below this value, treat as failed.
259
+ no_speech_threshold: float
260
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
261
+ the average log probability over sampled tokens is below `log_prob_threshold`, consider the segment as silent.
262
+ compute_type: str
263
+ compute type from gr.Dropdown().
264
+ see more info : https://opennmt.net/CTranslate2/quantization.html
266
+ progress: gr.Progress
267
+ Indicator to show progress directly in gradio.
268
+
269
+ Returns
270
+ ----------
271
+ A List of
272
+ String to return to gr.Textbox()
273
+ Files to return to gr.Files()
274
+ """
275
+ try:
276
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
277
+
278
+ progress(0, desc="Loading Audio..")
279
+
280
+ transcribed_segments, time_for_task = self.transcribe(
281
+ audio=micaudio,
282
+ lang=lang,
283
+ istranslate=istranslate,
284
+ beam_size=beam_size,
285
+ log_prob_threshold=log_prob_threshold,
286
+ no_speech_threshold=no_speech_threshold,
287
+ progress=progress
288
+ )
289
+ progress(1, desc="Completed!")
290
+
291
+ subtitle, file_path = self.generate_and_write_file(
292
+ file_name="Mic",
293
+ transcribed_segments=transcribed_segments,
294
+ add_timestamp=True,
295
+ file_format=file_format
296
+ )
297
+
298
+ gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
299
+ return [gr_str, file_path]
300
+ except Exception as e:
301
+ print(f"Error transcribing file on line {e}")
302
+ finally:
303
+ self.release_cuda_memory()
304
+ self.remove_input_files([micaudio])
305
+
306
+ def transcribe(self,
307
+ audio: Union[str, BinaryIO, np.ndarray],
308
+ lang: str,
309
+ istranslate: bool,
310
+ beam_size: int,
311
+ log_prob_threshold: float,
312
+ no_speech_threshold: float,
313
+ progress: gr.Progress
314
+ ) -> Tuple[list, float]:
315
+ """
316
+ transcribe method for faster-whisper.
317
+
318
+ Parameters
319
+ ----------
320
+ audio: Union[str, BinaryIO, np.ndarray]
321
+ Audio path or file binary or Audio numpy array
322
+ lang: str
323
+ Source language of the file to transcribe from gr.Dropdown()
324
+ istranslate: bool
325
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
326
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
327
+ beam_size: int
328
+ Int value from gr.Number() that is used for decoding option.
329
+ log_prob_threshold: float
330
+ float value from gr.Number(). If the average log probability over sampled tokens is
331
+ below this value, treat as failed.
332
+ no_speech_threshold: float
333
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
334
+ the average log probability over sampled tokens is below `log_prob_threshold`,
335
+ consider the segment as silent.
336
+ progress: gr.Progress
337
+ Indicator to show progress directly in gradio.
338
+
339
+ Returns
340
+ ----------
341
+ segments_result: list[dict]
342
+ list of dicts that includes start, end timestamps and transcribed text
343
+ elapsed_time: float
344
+ elapsed time for transcription
345
+ """
346
+ start_time = time.time()
347
+
348
+ if lang == "Automatic Detection":
349
+ lang = None
350
+ else:
351
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
352
+ lang = language_code_dict[lang]
353
+ segments, info = self.model.transcribe(
354
+ audio=audio,
355
+ language=lang,
356
+ task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
357
+ beam_size=beam_size,
358
+ log_prob_threshold=log_prob_threshold,
359
+ no_speech_threshold=no_speech_threshold,
360
+ )
361
+ progress(0, desc="Loading audio..")
362
+
363
+ segments_result = []
364
+ for segment in segments:
365
+ progress(segment.start / info.duration, desc="Transcribing..")
366
+ segments_result.append({
367
+ "start": segment.start,
368
+ "end": segment.end,
369
+ "text": segment.text
370
+ })
371
+
372
+ elapsed_time = time.time() - start_time
373
+ return segments_result, elapsed_time
374
+
375
+ def update_model_if_needed(self,
376
+ model_size: str,
377
+ compute_type: str,
378
+ progress: gr.Progress
379
+ ):
380
+ """
381
+ Initialize model if it doesn't match with current model setting
382
+ """
383
+ if model_size != self.current_model_size or self.model is None or self.current_compute_type != compute_type:
384
+ progress(0, desc="Initializing Model..")
385
+ self.current_model_size = model_size
386
+ self.current_compute_type = compute_type
387
+ self.model = faster_whisper.WhisperModel(
388
+ device=self.device,
389
+ model_size_or_path=model_size,
390
+ download_root=os.path.join("models", "Whisper", "faster-whisper"),
391
+ compute_type=self.current_compute_type
392
+ )
393
+
394
+ @staticmethod
395
+ def generate_and_write_file(file_name: str,
396
+ transcribed_segments: list,
397
+ add_timestamp: bool,
398
+ file_format: str,
399
+ ) -> Tuple[str, str]:
400
+ """
401
+ This method writes the subtitle file and returns its content and output path
402
+ """
403
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
404
+ if add_timestamp:
405
+ output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
406
+ else:
407
+ output_path = os.path.join("outputs", f"{file_name}")
408
+
409
+ if file_format == "SRT":
410
+ content = get_srt(transcribed_segments)
411
+ output_path += '.srt'
412
+ write_file(content, output_path)
413
+
414
+ elif file_format == "WebVTT":
415
+ content = get_vtt(transcribed_segments)
416
+ output_path += '.vtt'
417
+ write_file(content, output_path)
418
+
419
+ elif file_format == "txt":
420
+ content = get_txt(transcribed_segments)
421
+ output_path += '.txt'
422
+ write_file(content, output_path)
423
+ return content, output_path
424
+
425
+ @staticmethod
426
+ def format_time(elapsed_time: float) -> str:
427
+ hours, rem = divmod(elapsed_time, 3600)
428
+ minutes, seconds = divmod(rem, 60)
429
+
430
+ time_str = ""
431
+ if hours:
432
+ time_str += f"{hours} hours "
433
+ if minutes:
434
+ time_str += f"{minutes} minutes "
435
+ seconds = round(seconds)
436
+ time_str += f"{seconds} seconds"
437
+
438
+ return time_str.strip()
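
For reference, transcribe() above reduces to a plain faster-whisper call once the Gradio progress plumbing and model caching are stripped away. A minimal sketch with placeholder model size and audio path; device and compute type mirror the CUDA defaults in __init__:

import faster_whisper

model = faster_whisper.WhisperModel(
    model_size_or_path="large-v3",
    device="cuda",                                  # the class falls back to "cpu"
    compute_type="float16",                         # "float32" is used on CPU above
    download_root="models/Whisper/faster-whisper",
)
segments, info = model.transcribe(
    "sample.wav",                                   # placeholder audio file
    language=None,                                  # None == automatic detection
    beam_size=1,
    log_prob_threshold=-1.0,
    no_speech_threshold=0.6,
)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")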
modules/nllb_inference.py ADDED
@@ -0,0 +1,345 @@
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import gradio as gr
3
+ import torch
4
+ import os
5
+ from datetime import datetime
6
+
7
+ from .base_interface import BaseInterface
8
+ from modules.subtitle_manager import *
9
+
10
+ DEFAULT_MODEL_SIZE = "facebook/nllb-200-1.3B"
11
+ NLLB_MODELS = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
12
+
13
+
14
+ class NLLBInference(BaseInterface):
15
+ def __init__(self):
16
+ super().__init__()
17
+ self.default_model_size = DEFAULT_MODEL_SIZE
18
+ self.current_model_size = None
19
+ self.model = None
20
+ self.tokenizer = None
21
+ self.available_models = NLLB_MODELS
22
+ self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
23
+ self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
24
+ self.device = 0 if torch.cuda.is_available() else -1
25
+ self.pipeline = None
26
+
27
+ def translate_text(self, text):
28
+ result = self.pipeline(text)
29
+ return result[0]['translation_text']
30
+
31
+ def translate_file(self,
32
+ fileobjs: list,
33
+ model_size: str,
34
+ src_lang: str,
35
+ tgt_lang: str,
36
+ add_timestamp: bool,
37
+ progress=gr.Progress()) -> list:
38
+ """
39
+ Translate subtitle file from source language to target language
40
+
41
+ Parameters
42
+ ----------
43
+ fileobjs: list
44
+ List of subtitle files to translate from gr.Files()
45
+ model_size: str
46
+ NLLB model size from gr.Dropdown()
47
+ src_lang: str
48
+ Source language of the file to translate from gr.Dropdown()
49
+ tgt_lang: str
50
+ Target language of the file to translate from gr.Dropdown()
51
+ add_timestamp: bool
52
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
53
+ progress: gr.Progress
54
+ Indicator to show progress directly in gradio.
56
+
57
+ Returns
58
+ ----------
59
+ A List of
60
+ String to return to gr.Textbox()
61
+ Files to return to gr.Files()
62
+ """
63
+ try:
64
+ if model_size != self.current_model_size or self.model is None:
65
+ print("\nInitializing NLLB Model..\n")
66
+ progress(0, desc="Initializing NLLB Model..")
67
+ self.current_model_size = model_size
68
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
69
+ cache_dir=os.path.join("models", "NLLB"))
70
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
71
+ cache_dir=os.path.join("models", "NLLB", "tokenizers"))
72
+
73
+ src_lang = NLLB_AVAILABLE_LANGS[src_lang]
74
+ tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
75
+
76
+ self.pipeline = pipeline("translation",
77
+ model=self.model,
78
+ tokenizer=self.tokenizer,
79
+ src_lang=src_lang,
80
+ tgt_lang=tgt_lang,
81
+ device=self.device)
82
+
83
+ files_info = {}
84
+ for fileobj in fileobjs:
85
+ file_path = fileobj.name
86
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
87
+ if file_ext == ".srt":
88
+ parsed_dicts = parse_srt(file_path=file_path)
89
+ total_progress = len(parsed_dicts)
90
+ for index, dic in enumerate(parsed_dicts):
91
+ progress(index / total_progress, desc="Translating..")
92
+ translated_text = self.translate_text(dic["sentence"])
93
+ dic["sentence"] = translated_text
94
+ subtitle = get_serialized_srt(parsed_dicts)
95
+
96
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
97
+ if add_timestamp:
98
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
99
+ else:
100
+ output_path = os.path.join("outputs", "translations", f"{file_name}")
101
+ output_path += '.srt'
102
+
103
+ write_file(subtitle, output_path)
104
+
105
+ elif file_ext == ".vtt":
106
+ parsed_dicts = parse_vtt(file_path=file_path)
107
+ total_progress = len(parsed_dicts)
108
+ for index, dic in enumerate(parsed_dicts):
109
+ progress(index / total_progress, desc="Translating..")
110
+ translated_text = self.translate_text(dic["sentence"])
111
+ dic["sentence"] = translated_text
112
+ subtitle = get_serialized_vtt(parsed_dicts)
113
+
114
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
115
+ if add_timestamp:
116
+ output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}")
117
+ else:
118
+ output_path = os.path.join("outputs", "translations", f"{file_name}")
119
+ output_path += '.vtt'
120
+
121
+ write_file(subtitle, output_path)
122
+
123
+ files_info[file_name] = subtitle
124
+
125
+ total_result = ''
126
+ for file_name, subtitle in files_info.items():
127
+ total_result += '------------------------------------\n'
128
+ total_result += f'{file_name}\n\n'
129
+ total_result += f'{subtitle}'
130
+
131
+ gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
132
+ return [gr_str, output_path]
133
+ except Exception as e:
134
+ print(f"Error: {str(e)}")
135
+ finally:
136
+ self.release_cuda_memory()
137
+ self.remove_input_files([fileobj.name for fileobj in fileobjs])
138
+
139
+
140
+ NLLB_AVAILABLE_LANGS = {
141
+ "Acehnese (Arabic script)": "ace_Arab",
142
+ "Acehnese (Latin script)": "ace_Latn",
143
+ "Mesopotamian Arabic": "acm_Arab",
144
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
145
+ "Tunisian Arabic": "aeb_Arab",
146
+ "Afrikaans": "afr_Latn",
147
+ "South Levantine Arabic": "ajp_Arab",
148
+ "Akan": "aka_Latn",
149
+ "Amharic": "amh_Ethi",
150
+ "North Levantine Arabic": "apc_Arab",
151
+ "Modern Standard Arabic": "arb_Arab",
152
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
153
+ "Najdi Arabic": "ars_Arab",
154
+ "Moroccan Arabic": "ary_Arab",
155
+ "Egyptian Arabic": "arz_Arab",
156
+ "Assamese": "asm_Beng",
157
+ "Asturian": "ast_Latn",
158
+ "Awadhi": "awa_Deva",
159
+ "Central Aymara": "ayr_Latn",
160
+ "South Azerbaijani": "azb_Arab",
161
+ "North Azerbaijani": "azj_Latn",
162
+ "Bashkir": "bak_Cyrl",
163
+ "Bambara": "bam_Latn",
164
+ "Balinese": "ban_Latn",
165
+ "Belarusian": "bel_Cyrl",
166
+ "Bemba": "bem_Latn",
167
+ "Bengali": "ben_Beng",
168
+ "Bhojpuri": "bho_Deva",
169
+ "Banjar (Arabic script)": "bjn_Arab",
170
+ "Banjar (Latin script)": "bjn_Latn",
171
+ "Standard Tibetan": "bod_Tibt",
172
+ "Bosnian": "bos_Latn",
173
+ "Buginese": "bug_Latn",
174
+ "Bulgarian": "bul_Cyrl",
175
+ "Catalan": "cat_Latn",
176
+ "Cebuano": "ceb_Latn",
177
+ "Czech": "ces_Latn",
178
+ "Chokwe": "cjk_Latn",
179
+ "Central Kurdish": "ckb_Arab",
180
+ "Crimean Tatar": "crh_Latn",
181
+ "Welsh": "cym_Latn",
182
+ "Danish": "dan_Latn",
183
+ "German": "deu_Latn",
184
+ "Southwestern Dinka": "dik_Latn",
185
+ "Dyula": "dyu_Latn",
186
+ "Dzongkha": "dzo_Tibt",
187
+ "Greek": "ell_Grek",
188
+ "English": "eng_Latn",
189
+ "Esperanto": "epo_Latn",
190
+ "Estonian": "est_Latn",
191
+ "Basque": "eus_Latn",
192
+ "Ewe": "ewe_Latn",
193
+ "Faroese": "fao_Latn",
194
+ "Fijian": "fij_Latn",
195
+ "Finnish": "fin_Latn",
196
+ "Fon": "fon_Latn",
197
+ "French": "fra_Latn",
198
+ "Friulian": "fur_Latn",
199
+ "Nigerian Fulfulde": "fuv_Latn",
200
+ "Scottish Gaelic": "gla_Latn",
201
+ "Irish": "gle_Latn",
202
+ "Galician": "glg_Latn",
203
+ "Guarani": "grn_Latn",
204
+ "Gujarati": "guj_Gujr",
205
+ "Haitian Creole": "hat_Latn",
206
+ "Hausa": "hau_Latn",
207
+ "Hebrew": "heb_Hebr",
208
+ "Hindi": "hin_Deva",
209
+ "Chhattisgarhi": "hne_Deva",
210
+ "Croatian": "hrv_Latn",
211
+ "Hungarian": "hun_Latn",
212
+ "Armenian": "hye_Armn",
213
+ "Igbo": "ibo_Latn",
214
+ "Ilocano": "ilo_Latn",
215
+ "Indonesian": "ind_Latn",
216
+ "Icelandic": "isl_Latn",
217
+ "Italian": "ita_Latn",
218
+ "Javanese": "jav_Latn",
219
+ "Japanese": "jpn_Jpan",
220
+ "Kabyle": "kab_Latn",
221
+ "Jingpho": "kac_Latn",
222
+ "Kamba": "kam_Latn",
223
+ "Kannada": "kan_Knda",
224
+ "Kashmiri (Arabic script)": "kas_Arab",
225
+ "Kashmiri (Devanagari script)": "kas_Deva",
226
+ "Georgian": "kat_Geor",
227
+ "Central Kanuri (Arabic script)": "knc_Arab",
228
+ "Central Kanuri (Latin script)": "knc_Latn",
229
+ "Kazakh": "kaz_Cyrl",
230
+ "Kabiyè": "kbp_Latn",
231
+ "Kabuverdianu": "kea_Latn",
232
+ "Khmer": "khm_Khmr",
233
+ "Kikuyu": "kik_Latn",
234
+ "Kinyarwanda": "kin_Latn",
235
+ "Kyrgyz": "kir_Cyrl",
236
+ "Kimbundu": "kmb_Latn",
237
+ "Northern Kurdish": "kmr_Latn",
238
+ "Kikongo": "kon_Latn",
239
+ "Korean": "kor_Hang",
240
+ "Lao": "lao_Laoo",
241
+ "Ligurian": "lij_Latn",
242
+ "Limburgish": "lim_Latn",
243
+ "Lingala": "lin_Latn",
244
+ "Lithuanian": "lit_Latn",
245
+ "Lombard": "lmo_Latn",
246
+ "Latgalian": "ltg_Latn",
247
+ "Luxembourgish": "ltz_Latn",
248
+ "Luba-Kasai": "lua_Latn",
249
+ "Ganda": "lug_Latn",
250
+ "Luo": "luo_Latn",
251
+ "Mizo": "lus_Latn",
252
+ "Standard Latvian": "lvs_Latn",
253
+ "Magahi": "mag_Deva",
254
+ "Maithili": "mai_Deva",
255
+ "Malayalam": "mal_Mlym",
256
+ "Marathi": "mar_Deva",
257
+ "Minangkabau (Arabic script)": "min_Arab",
258
+ "Minangkabau (Latin script)": "min_Latn",
259
+ "Macedonian": "mkd_Cyrl",
260
+ "Plateau Malagasy": "plt_Latn",
261
+ "Maltese": "mlt_Latn",
262
+ "Meitei (Bengali script)": "mni_Beng",
263
+ "Halh Mongolian": "khk_Cyrl",
264
+ "Mossi": "mos_Latn",
265
+ "Maori": "mri_Latn",
266
+ "Burmese": "mya_Mymr",
267
+ "Dutch": "nld_Latn",
268
+ "Norwegian Nynorsk": "nno_Latn",
269
+ "Norwegian Bokmål": "nob_Latn",
270
+ "Nepali": "npi_Deva",
271
+ "Northern Sotho": "nso_Latn",
272
+ "Nuer": "nus_Latn",
273
+ "Nyanja": "nya_Latn",
274
+ "Occitan": "oci_Latn",
275
+ "West Central Oromo": "gaz_Latn",
276
+ "Odia": "ory_Orya",
277
+ "Pangasinan": "pag_Latn",
278
+ "Eastern Panjabi": "pan_Guru",
279
+ "Papiamento": "pap_Latn",
280
+ "Western Persian": "pes_Arab",
281
+ "Polish": "pol_Latn",
282
+ "Portuguese": "por_Latn",
283
+ "Dari": "prs_Arab",
284
+ "Southern Pashto": "pbt_Arab",
285
+ "Ayacucho Quechua": "quy_Latn",
286
+ "Romanian": "ron_Latn",
287
+ "Rundi": "run_Latn",
288
+ "Russian": "rus_Cyrl",
289
+ "Sango": "sag_Latn",
290
+ "Sanskrit": "san_Deva",
291
+ "Santali": "sat_Olck",
292
+ "Sicilian": "scn_Latn",
293
+ "Shan": "shn_Mymr",
294
+ "Sinhala": "sin_Sinh",
295
+ "Slovak": "slk_Latn",
296
+ "Slovenian": "slv_Latn",
297
+ "Samoan": "smo_Latn",
298
+ "Shona": "sna_Latn",
299
+ "Sindhi": "snd_Arab",
300
+ "Somali": "som_Latn",
301
+ "Southern Sotho": "sot_Latn",
302
+ "Spanish": "spa_Latn",
303
+ "Tosk Albanian": "als_Latn",
304
+ "Sardinian": "srd_Latn",
305
+ "Serbian": "srp_Cyrl",
306
+ "Swati": "ssw_Latn",
307
+ "Sundanese": "sun_Latn",
308
+ "Swedish": "swe_Latn",
309
+ "Swahili": "swh_Latn",
310
+ "Silesian": "szl_Latn",
311
+ "Tamil": "tam_Taml",
312
+ "Tatar": "tat_Cyrl",
313
+ "Telugu": "tel_Telu",
314
+ "Tajik": "tgk_Cyrl",
315
+ "Tagalog": "tgl_Latn",
316
+ "Thai": "tha_Thai",
317
+ "Tigrinya": "tir_Ethi",
318
+ "Tamasheq (Latin script)": "taq_Latn",
319
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
320
+ "Tok Pisin": "tpi_Latn",
321
+ "Tswana": "tsn_Latn",
322
+ "Tsonga": "tso_Latn",
323
+ "Turkmen": "tuk_Latn",
324
+ "Tumbuka": "tum_Latn",
325
+ "Turkish": "tur_Latn",
326
+ "Twi": "twi_Latn",
327
+ "Central Atlas Tamazight": "tzm_Tfng",
328
+ "Uyghur": "uig_Arab",
329
+ "Ukrainian": "ukr_Cyrl",
330
+ "Umbundu": "umb_Latn",
331
+ "Urdu": "urd_Arab",
332
+ "Northern Uzbek": "uzn_Latn",
333
+ "Venetian": "vec_Latn",
334
+ "Vietnamese": "vie_Latn",
335
+ "Waray": "war_Latn",
336
+ "Wolof": "wol_Latn",
337
+ "Xhosa": "xho_Latn",
338
+ "Eastern Yiddish": "ydd_Hebr",
339
+ "Yoruba": "yor_Latn",
340
+ "Yue Chinese": "yue_Hant",
341
+ "Chinese (Simplified)": "zho_Hans",
342
+ "Chinese (Traditional)": "zho_Hant",
343
+ "Standard Malay": "zsm_Latn",
344
+ "Zulu": "zul_Latn",
345
+ }
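For context, a minimal sketch of how the mapping above is meant to be combined with the transformers translation pipeline set up earlier in this file. The checkpoint name, the standalone (non-WebUI) usage, and the sample sentence are illustrative assumptions, not part of this commit.

import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Assumed checkpoint; any NLLB-200 checkpoint follows the same pattern.
model_name = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=os.path.join("models", "NLLB"))
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join("models", "NLLB", "tokenizers"))

# Human-readable names are mapped to FLORES-200 codes via NLLB_AVAILABLE_LANGS.
src_lang = "eng_Latn"  # NLLB_AVAILABLE_LANGS["English"]
tgt_lang = "kor_Hang"  # NLLB_AVAILABLE_LANGS["Korean"]

translator = pipeline("translation",
                      model=model,
                      tokenizer=tokenizer,
                      src_lang=src_lang,
                      tgt_lang=tgt_lang,
                      device=-1)  # -1 = CPU; pass a CUDA device index to use the GPU

print(translator("Hello, world!")[0]["translation_text"])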
modules/subtitle_manager.py ADDED
@@ -0,0 +1,135 @@
1
+ import re
2
+
3
+
4
+ def timeformat_srt(time):
5
+ hours = time // 3600
6
+ minutes = (time - hours * 3600) // 60
7
+ seconds = time - hours * 3600 - minutes * 60
8
+ milliseconds = (time - int(time)) * 1000
9
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
10
+
11
+
12
+ def timeformat_vtt(time):
13
+ hours = time // 3600
14
+ minutes = (time - hours * 3600) // 60
15
+ seconds = time - hours * 3600 - minutes * 60
16
+ milliseconds = (time - int(time)) * 1000
17
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
18
+
19
+
20
+ def write_file(subtitle, output_file):
21
+ with open(output_file, 'w', encoding='utf-8') as f:
22
+ f.write(subtitle)
23
+
24
+
25
+ def get_srt(segments):
26
+ output = ""
27
+ for i, segment in enumerate(segments):
28
+ output += f"{i + 1}\n"
29
+ output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
30
+ if segment['text'].startswith(' '):
31
+ segment['text'] = segment['text'][1:]
32
+ output += f"{segment['text']}\n\n"
33
+ return output
34
+
35
+
36
+ def get_vtt(segments):
37
+ output = "WEBVTT\n\n"  # the WebVTT spec requires the header line to be exactly "WEBVTT"
38
+ for i, segment in enumerate(segments):
39
+ output += f"{i + 1}\n"
40
+ output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
41
+ if segment['text'].startswith(' '):
42
+ segment['text'] = segment['text'][1:]
43
+ output += f"{segment['text']}\n\n"
44
+ return output
45
+
46
+
47
+ def get_txt(segments):
48
+ output = ""
49
+ for i, segment in enumerate(segments):
50
+ if segment['text'].startswith(' '):
51
+ segment['text'] = segment['text'][1:]
52
+ output += f"{segment['text']}\n"
53
+ return output
54
+
55
+
56
+ def parse_srt(file_path):
57
+ """Reads SRT file and returns as dict"""
58
+ with open(file_path, 'r', encoding='utf-8') as file:
59
+ srt_data = file.read()
60
+
61
+ data = []
62
+ blocks = srt_data.split('\n\n')
63
+
64
+ for block in blocks:
65
+ if block.strip() != '':
66
+ lines = block.strip().split('\n')
67
+ index = lines[0]
68
+ timestamp = lines[1]
69
+ sentence = ' '.join(lines[2:])
70
+
71
+ data.append({
72
+ "index": index,
73
+ "timestamp": timestamp,
74
+ "sentence": sentence
75
+ })
76
+ return data
77
+
78
+
79
+ def parse_vtt(file_path):
80
+ """Reads WebVTT file and returns as dict"""
81
+ with open(file_path, 'r', encoding='utf-8') as file:
82
+ webvtt_data = file.read()
83
+
84
+ data = []
85
+ blocks = webvtt_data.split('\n\n')
86
+
87
+ for block in blocks:
88
+ if block.strip() != '' and not block.strip().upper().startswith("WEBVTT"):  # skip both "WEBVTT" and legacy "WebVTT" headers
89
+ lines = block.strip().split('\n')
90
+ index = lines[0]
91
+ timestamp = lines[1]
92
+ sentence = ' '.join(lines[2:])
93
+
94
+ data.append({
95
+ "index": index,
96
+ "timestamp": timestamp,
97
+ "sentence": sentence
98
+ })
99
+
100
+ return data
101
+
102
+
103
+ def get_serialized_srt(dicts):
104
+ output = ""
105
+ for dic in dicts:
106
+ output += f'{dic["index"]}\n'
107
+ output += f'{dic["timestamp"]}\n'
108
+ output += f'{dic["sentence"]}\n\n'
109
+ return output
110
+
111
+
112
+ def get_serialized_vtt(dicts):
113
+ output = "WEBVTT\n\n"
114
+ for dic in dicts:
115
+ output += f'{dic["index"]}\n'
116
+ output += f'{dic["timestamp"]}\n'
117
+ output += f'{dic["sentence"]}\n\n'
118
+ return output
119
+
120
+
121
+ def safe_filename(name):
122
+ from app import _args  # imported inside the function so app.py has already defined _args when this is called
123
+ INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
124
+ safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
125
+ if not _args.colab:
126
+ return safe_name
127
+ # Truncate the filename if it exceeds the max_length (20)
128
+ if len(safe_name) > 20:
129
+ file_extension = safe_name.split('.')[-1]
130
+ if len(file_extension) + 1 < 20:
131
+ truncated_name = safe_name[:20 - len(file_extension) - 1]
132
+ safe_name = truncated_name + '.' + file_extension
133
+ else:
134
+ safe_name = safe_name[:20]
135
+ return safe_name
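A small, hypothetical round trip with the helpers above (the file paths are made up): parse an existing SRT file, edit a cue, serialize it back, and format a timestamp.

from modules.subtitle_manager import parse_srt, get_serialized_srt, write_file, timeformat_srt

# parse_srt() returns one {"index", "timestamp", "sentence"} dict per subtitle block.
cues = parse_srt(file_path="outputs/example.srt")  # assumed to exist
if cues:
    cues[0]["sentence"] = cues[0]["sentence"].upper()

# get_serialized_srt() re-joins the dicts into SRT text; write_file() saves it as UTF-8.
write_file(get_serialized_srt(cues), "outputs/example-edited.srt")

# timeformat_srt() turns seconds (float) into the "HH:MM:SS,mmm" form used by get_srt().
print(timeformat_srt(3661.5))  # -> 01:01:01,500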
modules/whisper_Inference.py ADDED
@@ -0,0 +1,424 @@
1
+ import whisper
2
+ import gradio as gr
3
+ import time
4
+ import os
5
+ from typing import BinaryIO, Union, Tuple
6
+ import numpy as np
7
+ from datetime import datetime
8
+ import torch
9
+
10
+ from .base_interface import BaseInterface
11
+ from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
+ from modules.youtube_manager import get_ytdata, get_ytaudio
13
+
14
+ DEFAULT_MODEL_SIZE = "large-v3"
15
+
16
+
17
+ class WhisperInference(BaseInterface):
18
+ def __init__(self):
19
+ super().__init__()
20
+ self.current_model_size = None
21
+ self.model = None
22
+ self.available_models = whisper.available_models()
23
+ self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
24
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
+ self.available_compute_types = ["float16", "float32"]
26
+ self.current_compute_type = "float16" if self.device == "cuda" else "float32"
27
+ self.default_beam_size = 1
28
+
29
+ def transcribe_file(self,
30
+ fileobjs: list,
31
+ model_size: str,
32
+ lang: str,
33
+ file_format: str,
34
+ istranslate: bool,
35
+ add_timestamp: bool,
36
+ beam_size: int,
37
+ log_prob_threshold: float,
38
+ no_speech_threshold: float,
39
+ compute_type: str,
40
+ progress=gr.Progress()) -> list:
41
+ """
42
+ Write subtitle file from Files
43
+
44
+ Parameters
45
+ ----------
46
+ fileobjs: list
47
+ List of files to transcribe from gr.Files()
48
+ model_size: str
49
+ Whisper model size from gr.Dropdown()
50
+ lang: str
51
+ Source language of the file to transcribe from gr.Dropdown()
52
+ file_format: str
53
+ File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
54
+ istranslate: bool
55
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
56
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
57
+ add_timestamp: bool
58
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
59
+ beam_size: int
60
+ Int value from gr.Number() that is used for decoding option.
61
+ log_prob_threshold: float
62
+ float value from gr.Number(). If the average log probability over sampled tokens is
63
+ below this value, treat as failed.
64
+ no_speech_threshold: float
65
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
66
+ the average log probability over sampled tokens is below `log_prob_threshold`,
67
+ consider the segment as silent.
68
+ compute_type: str
69
+ compute type from gr.Dropdown().
70
+ progress: gr.Progress
71
+ Indicator to show progress directly in gradio.
72
+ I use a forked version of whisper for this. For more info, see https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
73
+
74
+ Returns
75
+ ----------
76
+ A List of
77
+ String to return to gr.Textbox()
78
+ Files to return to gr.Files()
79
+ """
80
+ try:
81
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
82
+
83
+ files_info = {}
84
+ for fileobj in fileobjs:
85
+ progress(0, desc="Loading Audio..")
86
+ audio = whisper.load_audio(fileobj.name)
87
+
88
+ result, elapsed_time = self.transcribe(audio=audio,
89
+ lang=lang,
90
+ istranslate=istranslate,
91
+ beam_size=beam_size,
92
+ log_prob_threshold=log_prob_threshold,
93
+ no_speech_threshold=no_speech_threshold,
94
+ compute_type=compute_type,
95
+ progress=progress
96
+ )
97
+ progress(1, desc="Completed!")
98
+
99
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
100
+ file_name = safe_filename(file_name)
101
+ subtitle, file_path = self.generate_and_write_file(
102
+ file_name=file_name,
103
+ transcribed_segments=result,
104
+ add_timestamp=add_timestamp,
105
+ file_format=file_format
106
+ )
107
+ files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
108
+
109
+ total_result = ''
110
+ total_time = 0
111
+ for file_name, info in files_info.items():
112
+ total_result += '------------------------------------\n'
113
+ total_result += f'{file_name}\n\n'
114
+ total_result += f"{info['subtitle']}"
115
+ total_time += info["elapsed_time"]
116
+
117
+ gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
118
+ gr_file_path = [info['path'] for info in files_info.values()]
119
+
120
+ return [gr_str, gr_file_path]
121
+ except Exception as e:
122
+ print(f"Error transcribing file: {str(e)}")
123
+ finally:
124
+ self.release_cuda_memory()
125
+ self.remove_input_files([fileobj.name for fileobj in fileobjs])
126
+
127
+ def transcribe_youtube(self,
128
+ youtubelink: str,
129
+ model_size: str,
130
+ lang: str,
131
+ file_format: str,
132
+ istranslate: bool,
133
+ add_timestamp: bool,
134
+ beam_size: int,
135
+ log_prob_threshold: float,
136
+ no_speech_threshold: float,
137
+ compute_type: str,
138
+ progress=gr.Progress()) -> list:
139
+ """
140
+ Write subtitle file from Youtube
141
+
142
+ Parameters
143
+ ----------
144
+ youtubelink: str
145
+ Link of Youtube to transcribe from gr.Textbox()
146
+ model_size: str
147
+ Whisper model size from gr.Dropdown()
148
+ lang: str
149
+ Source language of the file to transcribe from gr.Dropdown()
150
+ file_format: str
151
+ File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
152
+ istranslate: bool
153
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
154
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
155
+ add_timestamp: bool
156
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
157
+ beam_size: int
158
+ Int value from gr.Number() that is used for decoding option.
159
+ log_prob_threshold: float
160
+ float value from gr.Number(). If the average log probability over sampled tokens is
161
+ below this value, treat as failed.
162
+ no_speech_threshold: float
163
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
164
+ the average log probability over sampled tokens is below `log_prob_threshold`,
165
+ consider the segment as silent.
166
+ compute_type: str
167
+ compute type from gr.Dropdown().
168
+ progress: gr.Progress
169
+ Indicator to show progress directly in gradio.
170
+ I use a forked version of whisper for this. For more info, see https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
171
+
172
+ Returns
173
+ ----------
174
+ A List of
175
+ String to return to gr.Textbox()
176
+ Files to return to gr.Files()
177
+ """
178
+ try:
179
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
180
+
181
+ progress(0, desc="Loading Audio from Youtube..")
182
+ yt = get_ytdata(youtubelink)
183
+ audio = whisper.load_audio(get_ytaudio(yt))
184
+
185
+ result, elapsed_time = self.transcribe(audio=audio,
186
+ lang=lang,
187
+ istranslate=istranslate,
188
+ beam_size=beam_size,
189
+ log_prob_threshold=log_prob_threshold,
190
+ no_speech_threshold=no_speech_threshold,
191
+ compute_type=compute_type,
192
+ progress=progress)
193
+ progress(1, desc="Completed!")
194
+
195
+ file_name = safe_filename(yt.title)
196
+ subtitle, file_path = self.generate_and_write_file(
197
+ file_name=file_name,
198
+ transcribed_segments=result,
199
+ add_timestamp=add_timestamp,
200
+ file_format=file_format
201
+ )
202
+
203
+ gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
204
+ return [gr_str, file_path]
205
+ except Exception as e:
206
+ print(f"Error transcribing youtube video: {str(e)}")
207
+ finally:
208
+ try:
209
+ if 'yt' not in locals():
210
+ yt = get_ytdata(youtubelink)
211
+ file_path = get_ytaudio(yt)
212
+ else:
213
+ file_path = get_ytaudio(yt)
214
+
215
+ self.release_cuda_memory()
216
+ self.remove_input_files([file_path])
217
+ except Exception as cleanup_error:
218
+ pass
219
+
220
+ def transcribe_mic(self,
221
+ micaudio: str,
222
+ model_size: str,
223
+ lang: str,
224
+ file_format: str,
225
+ istranslate: bool,
226
+ beam_size: int,
227
+ log_prob_threshold: float,
228
+ no_speech_threshold: float,
229
+ compute_type: str,
230
+ progress=gr.Progress()) -> list:
231
+ """
232
+ Write subtitle file from microphone
233
+
234
+ Parameters
235
+ ----------
236
+ micaudio: str
237
+ Audio file path from gr.Microphone()
238
+ model_size: str
239
+ Whisper model size from gr.Dropdown()
240
+ lang: str
241
+ Source language of the file to transcribe from gr.Dropdown()
242
+ file_format: str
243
+ Subtitle format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
244
+ istranslate: bool
245
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
246
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
247
+ beam_size: int
248
+ Int value from gr.Number() that is used for decoding option.
249
+ log_prob_threshold: float
250
+ float value from gr.Number(). If the average log probability over sampled tokens is
251
+ below this value, treat as failed.
252
+ no_speech_threshold: float
253
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
254
+ the average log probability over sampled tokens is below `log_prob_threshold`,
255
+ consider the segment as silent.
256
+ compute_type: str
257
+ compute type from gr.Dropdown().
258
+ progress: gr.Progress
259
+ Indicator to show progress directly in gradio.
260
+ I use a forked version of whisper for this. For more info, see https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
261
+
262
+ Returns
263
+ ----------
264
+ A List of
265
+ String to return to gr.Textbox()
266
+ Files to return to gr.Files()
267
+ """
268
+ try:
269
+ self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
270
+
271
+ result, elapsed_time = self.transcribe(audio=micaudio,
272
+ lang=lang,
273
+ istranslate=istranslate,
274
+ beam_size=beam_size,
275
+ log_prob_threshold=log_prob_threshold,
276
+ no_speech_threshold=no_speech_threshold,
277
+ compute_type=compute_type,
278
+ progress=progress)
279
+ progress(1, desc="Completed!")
280
+
281
+ subtitle, file_path = self.generate_and_write_file(
282
+ file_name="Mic",
283
+ transcribed_segments=result,
284
+ add_timestamp=True,
285
+ file_format=file_format
286
+ )
287
+
288
+ gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
289
+ return [gr_str, file_path]
290
+ except Exception as e:
291
+ print(f"Error transcribing mic: {str(e)}")
292
+ finally:
293
+ self.release_cuda_memory()
294
+ self.remove_input_files([micaudio])
295
+
296
+ def transcribe(self,
297
+ audio: Union[str, np.ndarray, torch.Tensor],
298
+ lang: str,
299
+ istranslate: bool,
300
+ beam_size: int,
301
+ log_prob_threshold: float,
302
+ no_speech_threshold: float,
303
+ compute_type: str,
304
+ progress: gr.Progress
305
+ ) -> Tuple[list[dict], float]:
306
+ """
307
+ transcribe method for OpenAI's Whisper implementation.
308
+
309
+ Parameters
310
+ ----------
311
+ audio: Union[str, np.ndarray, torch.Tensor]
312
+ Audio file path, audio waveform as a numpy array, or audio as a torch tensor
313
+ lang: str
314
+ Source language of the file to transcribe from gr.Dropdown()
315
+ istranslate: bool
316
+ Boolean value from gr.Checkbox() that determines whether to translate to English.
317
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
318
+ beam_size: int
319
+ Int value from gr.Number() that is used for decoding option.
320
+ log_prob_threshold: float
321
+ float value from gr.Number(). If the average log probability over sampled tokens is
322
+ below this value, treat as failed.
323
+ no_speech_threshold: float
324
+ float value from gr.Number(). If the no_speech probability is higher than this value AND
325
+ the average log probability over sampled tokens is below `log_prob_threshold`,
326
+ consider the segment as silent.
327
+ compute_type: str
328
+ compute type from gr.Dropdown().
329
+ progress: gr.Progress
330
+ Indicator to show progress directly in gradio.
331
+
332
+ Returns
333
+ ----------
334
+ segments_result: list[dict]
335
+ list of dicts that includes start, end timestamps and transcribed text
336
+ elapsed_time: float
337
+ elapsed time for transcription
338
+ """
339
+ start_time = time.time()
340
+
341
+ def progress_callback(progress_value):
342
+ progress(progress_value, desc="Transcribing..")
343
+
344
+ if lang == "Automatic Detection":
345
+ lang = None
346
+
347
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
348
+ segments_result = self.model.transcribe(audio=audio,
349
+ language=lang,
350
+ verbose=False,
351
+ beam_size=beam_size,
352
+ logprob_threshold=log_prob_threshold,
353
+ no_speech_threshold=no_speech_threshold,
354
+ task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
355
+ fp16=True if compute_type == "float16" else False,
356
+ progress_callback=progress_callback)["segments"]
357
+ elapsed_time = time.time() - start_time
358
+
359
+ return segments_result, elapsed_time
360
+
361
+ def update_model_if_needed(self,
362
+ model_size: str,
363
+ compute_type: str,
364
+ progress: gr.Progress,
365
+ ):
366
+ """
367
+ Initialize model if it doesn't match with current model setting
368
+ """
369
+ if compute_type != self.current_compute_type:
370
+ self.current_compute_type = compute_type
371
+ if model_size != self.current_model_size or self.model is None:
372
+ progress(0, desc="Initializing Model..")
373
+ self.current_model_size = model_size
374
+ self.model = whisper.load_model(
375
+ name=model_size,
376
+ device=self.device,
377
+ download_root=os.path.join("models", "Whisper")
378
+ )
379
+
380
+ @staticmethod
381
+ def generate_and_write_file(file_name: str,
382
+ transcribed_segments: list,
383
+ add_timestamp: bool,
384
+ file_format: str,
385
+ ) -> Tuple[str, str]:
386
+ """
387
+ This method writes the subtitle file and returns the subtitle text (for gr.Textbox) and the output file path
388
+ """
389
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
390
+ if add_timestamp:
391
+ output_path = os.path.join("outputs", f"{file_name}-{timestamp}")
392
+ else:
393
+ output_path = os.path.join("outputs", f"{file_name}")
394
+
395
+ if file_format == "SRT":
396
+ content = get_srt(transcribed_segments)
397
+ output_path += '.srt'
398
+ write_file(content, output_path)
399
+
400
+ elif file_format == "WebVTT":
401
+ content = get_vtt(transcribed_segments)
402
+ output_path += '.vtt'
403
+ write_file(content, output_path)
404
+
405
+ elif file_format == "txt":
406
+ content = get_txt(transcribed_segments)
407
+ output_path += '.txt'
408
+ write_file(content, output_path)
409
+ return content, output_path
410
+
411
+ @staticmethod
412
+ def format_time(elapsed_time: float) -> str:
413
+ hours, rem = divmod(elapsed_time, 3600)
414
+ minutes, seconds = divmod(rem, 60)
415
+
416
+ time_str = ""
417
+ if hours:
418
+ time_str += f"{int(hours)} hours "  # divmod() returns floats, so cast for display
419
+ if minutes:
420
+ time_str += f"{int(minutes)} minutes "
421
+ seconds = round(seconds)
422
+ time_str += f"{seconds} seconds"
423
+
424
+ return time_str.strip()
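For orientation, a rough sketch of what WhisperInference.transcribe() reduces to when called against stock openai-whisper (the forked package referenced in the docstrings additionally accepts a progress_callback argument). The model size, audio path, and option values below are placeholders, not the WebUI defaults.

import os
import torch
import whisper

device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model(name="base",  # placeholder size; the UI defaults to large-v3
                           device=device,
                           download_root=os.path.join("models", "Whisper"))

# The same decoding options the class forwards from the Gradio inputs.
result = model.transcribe(audio="sample.wav",      # placeholder path
                          language=None,           # None corresponds to "Automatic Detection"
                          verbose=False,
                          beam_size=1,
                          logprob_threshold=-1.0,
                          no_speech_threshold=0.6,
                          task="transcribe",       # "translate" for speech -> English
                          fp16=(device == "cuda"))
segments = result["segments"]                      # list of dicts with "start", "end", "text"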
modules/youtube_manager.py ADDED
@@ -0,0 +1,15 @@
1
+ from pytube import YouTube
2
+ import os
3
+
4
+
5
+ def get_ytdata(link):
6
+ return YouTube(link)
7
+
8
+
9
+ def get_ytmetas(link):
10
+ yt = YouTube(link)
11
+ return yt.thumbnail_url, yt.title, yt.description
12
+
13
+
14
+ def get_ytaudio(ytdata: YouTube):
15
+ return ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
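A short sketch of how these helpers chain together; the link is a placeholder. get_ytaudio() downloads the audio-only stream to modules/yt_tmp.wav and returns that path.

from modules.youtube_manager import get_ytdata, get_ytmetas, get_ytaudio

link = "https://www.youtube.com/watch?v=XXXXXXXXXXX"  # placeholder URL

thumbnail_url, title, description = get_ytmetas(link)  # metadata only, nothing downloaded

yt = get_ytdata(link)          # pytube.YouTube object
audio_path = get_ytaudio(yt)   # downloads the audio-only stream to modules/yt_tmp.wav
print(title, audio_path)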
notebook/whisper-webui.ipynb ADDED
@@ -0,0 +1,125 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "---\n",
7
+ "\n",
8
+ "📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
9
+ "\n",
10
+ "🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
11
+ "\n",
12
+ "😎 **Support the Project**:\n",
13
+ "\n",
14
+ "If you find this project useful, please consider supporting it:\n",
15
+ "\n",
16
+ "<a href=\"https://www.buymeacoffee.com/jhj0517\" target=\"_blank\" style=\"margin-right: 10px;\">\n",
17
+ " <img src=\"https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png\" alt=\"Buy Me A Coffee\" width=\"158\" height=\"36\">\n",
18
+ "</a>\n",
19
+ "<a href=\"https://ko-fi.com/A0A7JSQRJ\" target=\"_blank\">\n",
20
+ " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
21
+ "</a>\n",
22
+ "\n",
23
+ "---"
24
+ ],
25
+ "metadata": {
26
+ "id": "doKhBBXIfS21"
27
+ }
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "source": [
32
+ "#@title #(Optional) Check GPU\n",
33
+ "#@markdown Some models may not function correctly on a CPU runtime.\n",
34
+ "\n",
35
+ "#@markdown so you should check your GPU setup before running.\n",
36
+ "!nvidia-smi"
37
+ ],
38
+ "metadata": {
39
+ "id": "23yZvUlagEsx"
40
+ },
41
+ "execution_count": null,
42
+ "outputs": []
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {
48
+ "id": "kNbSbsctxahq",
49
+ "cellView": "form"
50
+ },
51
+ "outputs": [],
52
+ "source": [
53
+ "#@title #Installation\n",
54
+ "#@markdown This cell will install dependencies for Whisper-WebUI!\n",
55
+ "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
56
+ "%cd Whisper-WebUI\n",
57
+ "!pip install -r requirements.txt"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "source": [
63
+ "#@title # (Optional) Configure arguments\n",
64
+ "#@markdown This section is used to configure some command line arguments.\n",
65
+ "\n",
66
+ "#@markdown You can simply ignore this section and the default values will be used.\n",
67
+ "\n",
68
+ "USERNAME = '' #@param {type: \"string\"}\n",
69
+ "PASSWORD = '' #@param {type: \"string\"}\n",
70
+ "DISABLE_FASTER_WHISPER = False #@param {type: \"boolean\"}\n",
71
+ "THEME = '' #@param {type: \"string\"}\n",
72
+ "\n",
73
+ "arguments = \"\"\n",
74
+ "if USERNAME:\n",
75
+ " arguments += f\" --username {USERNAME}\"\n",
76
+ "if PASSWORD:\n",
77
+ " arguments += f\" --password {PASSWORD}\"\n",
78
+ "if THEME:\n",
79
+ " arguments += f\" --theme {THEME}\"\n",
80
+ "if DISABLE_FASTER_WHISPER:\n",
81
+ " arguments += f\" --disable_faster_whisper\"\n",
82
+ "\n",
83
+ "\n",
84
+ "#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
85
+ ],
86
+ "metadata": {
87
+ "id": "Qosz9BFlGui3"
88
+ },
89
+ "execution_count": null,
90
+ "outputs": []
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {
96
+ "id": "PQroYRRZzQiN"
97
+ },
98
+ "outputs": [],
99
+ "source": [
100
+ "#@title #Run\n",
101
+ "#@markdown Once the installation is complete, you can use the public URL that is displayed.\n",
102
+ "if 'arguments' in locals():\n",
103
+ " !python app.py --share --colab{arguments}\n",
104
+ "else:\n",
105
+ " !python app.py --share --colab"
106
+ ]
107
+ }
108
+ ],
109
+ "metadata": {
110
+ "colab": {
111
+ "provenance": [],
112
+ "gpuType": "T4"
113
+ },
114
+ "kernelspec": {
115
+ "display_name": "Python 3",
116
+ "name": "python3"
117
+ },
118
+ "language_info": {
119
+ "name": "python"
120
+ },
121
+ "accelerator": "GPU"
122
+ },
123
+ "nbformat": 4,
124
+ "nbformat_minor": 0
125
+ }
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu118
2
+ torch
3
+ git+https://github.com/jhj0517/jhj0517-whisper.git
4
+ faster-whisper
5
+ transformers
6
+ gradio==4.14.0
7
+ pytube
screenshot.png ADDED
start-webui.bat ADDED
@@ -0,0 +1,18 @@
1
+ @echo off
2
+
3
+ goto :activate_venv
4
+
5
+ :launch
6
+ %PYTHON% app.py %*
7
+ pause
8
+
9
+ :activate_venv
10
+ set PYTHON="%~dp0\venv\Scripts\Python.exe"
11
+ echo venv %PYTHON%
12
+ goto :launch
13
+
14
+ :endofscript
15
+
16
+ echo.
17
+ echo Launch unsuccessful. Exiting.
18
+ pause
start-webui.sh ADDED
@@ -0,0 +1,12 @@
1
+ #!/bin/bash
2
+
3
+ source venv/bin/activate
4
+
5
+ PYTHON="venv/bin/python"
6
+ echo "venv ${PYTHON}"
7
+ echo ""
8
+
9
+ python app.py $*
10
+
11
+ deactivate
12
+
ui/__init__.py ADDED
File without changes
ui/htmls.py ADDED
@@ -0,0 +1,97 @@
1
+ CSS = """
2
+ .bmc-button {
3
+ padding: 2px 5px;
4
+ border-radius: 5px;
5
+ background-color: #FF813F;
6
+ color: white;
7
+ box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
8
+ text-decoration: none;
9
+ display: inline-block;
10
+ font-size: 20px;
11
+ margin: 2px;
12
+ cursor: pointer;
13
+ -webkit-transition: background-color 0.3s ease;
14
+ -ms-transition: background-color 0.3s ease;
15
+ transition: background-color 0.3s ease;
16
+ }
17
+ .bmc-button:hover,
18
+ .bmc-button:active,
19
+ .bmc-button:focus {
20
+ background-color: #FF5633;
21
+ }
22
+ .markdown {
23
+ margin-bottom: 0;
24
+ padding-bottom: 0;
25
+ }
26
+ .tabs {
27
+ margin-top: 0;
28
+ padding-top: 0;
29
+ }
30
+
31
+ #md_project a {
32
+ color: black;
33
+ text-decoration: none;
34
+ }
35
+ #md_project a:hover {
36
+ text-decoration: underline;
37
+ }
38
+ """
39
+
40
+ MARKDOWN = """
41
+ ### [Whisper Web-UI](https://github.com/jhj0517/Whisper-WebUI)
42
+ """
43
+
44
+
45
+ NLLB_VRAM_TABLE = """
46
+ <!DOCTYPE html>
47
+ <html lang="en">
48
+ <head>
49
+ <meta charset="UTF-8">
50
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
51
+ <style>
52
+ table {
53
+ border-collapse: collapse;
54
+ width: 100%;
55
+ }
56
+ th, td {
57
+ border: 1px solid #dddddd;
58
+ text-align: left;
59
+ padding: 8px;
60
+ }
61
+ th {
62
+ background-color: #f2f2f2;
63
+ }
64
+ </style>
65
+ </head>
66
+ <body>
67
+
68
+ <details>
69
+ <summary>VRAM usage for each model</summary>
70
+ <table>
71
+ <thead>
72
+ <tr>
73
+ <th>Model name</th>
74
+ <th>Required VRAM</th>
75
+ </tr>
76
+ </thead>
77
+ <tbody>
78
+ <tr>
79
+ <td>nllb-200-3.3B</td>
80
+ <td>~16GB</td>
81
+ </tr>
82
+ <tr>
83
+ <td>nllb-200-1.3B</td>
84
+ <td>~8GB</td>
85
+ </tr>
86
+ <tr>
87
+ <td>nllb-200-distilled-600M</td>
88
+ <td>~4GB</td>
89
+ </tr>
90
+ </tbody>
91
+ </table>
92
+ <p><strong>Note:</strong> Be mindful of your VRAM! The table above shows approximate VRAM usage for each model.</p>
93
+ </details>
94
+
95
+ </body>
96
+ </html>
97
+ """
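As a rough idea of how these constants might be consumed from app.py (a sketch under assumptions; the actual wiring may differ): CSS styles the Blocks app, MARKDOWN is the header link, and NLLB_VRAM_TABLE is the collapsible HTML table for the NLLB tab.

import gradio as gr
from ui.htmls import CSS, MARKDOWN, NLLB_VRAM_TABLE

# Hypothetical layout; shown only to illustrate where each constant plugs in.
with gr.Blocks(css=CSS) as demo:
    gr.Markdown(MARKDOWN, elem_id="md_project")  # picked up by the "#md_project a" CSS rules
    gr.HTML(NLLB_VRAM_TABLE)                     # the collapsible VRAM table defined above

demo.launch()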
user-start-webui.bat ADDED
@@ -0,0 +1,41 @@
1
+ @echo off
2
+ :: This batch file is for launching with command line args
3
+ :: See the wiki for a guide to command line arguments: https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments
4
+ :: Set the values here to whatever you want. See the wiki above for how to set this.
5
+ set SERVER_NAME=
6
+ set SERVER_PORT=
7
+ set USERNAME=
8
+ set PASSWORD=
9
+ set SHARE=
10
+ set THEME=
11
+ set DISABLE_FASTER_WHISPER=
12
+
13
+
14
+
15
+
16
+ :: Set args accordingly
17
+ if not "%SERVER_NAME%"=="" (
18
+ set SERVER_NAME_ARG=--server_name %SERVER_NAME%
19
+ )
20
+ if not "%SERVER_PORT%"=="" (
21
+ set SERVER_PORT_ARG=--server_port %SERVER_PORT%
22
+ )
23
+ if not "%USERNAME%"=="" (
24
+ set USERNAME_ARG=--username %USERNAME%
25
+ )
26
+ if not "%PASSWORD%"=="" (
27
+ set PASSWORD_ARG=--password %PASSWORD%
28
+ )
29
+ if /I "%SHARE%"=="true" (
30
+ set SHARE_ARG=--share
31
+ )
32
+ if not "%THEME%"=="" (
33
+ set THEME_ARG=--theme %THEME%
34
+ )
35
+ if /I "%DISABLE_FASTER_WHISPER%"=="true" (
36
+ set DISABLE_FASTER_WHISPER_ARG=--disable_faster_whisper
37
+ )
38
+
39
+ :: Call the original .bat script with optional arguments
40
+ start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %DISABLE_FASTER_WHISPER_ARG%
41
+ pause