badal committed

Commit 2f2406a · 0 Parent(s)

feat: initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
+ .idea
+ .ipynb_checkpoints
+ .mypy_cache
+ .vscode
+ __pycache__
+ .pytest_cache
+ htmlcov
+ dist
+ site
+ .coverage
+ coverage.xml
+ .netlify
+ test.db
+ log.txt
+ Pipfile.lock
+ env3.*
+ env
+ docs_build
+ site_build
+ venv
+ docs.zip
+ archive.zip
+ openssl-1.1.1u
+ logs
+ run.sh
+ # vim temporary files
+ *~
+ .*.sw?
+ .cache
Dockerfile ADDED
@@ -0,0 +1,41 @@
+ FROM python:latest
+
+ RUN mkdir -p /code
+ RUN chmod 777 /code
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN apt-get update && apt-get upgrade -y
+ RUN apt-get install ffmpeg -y
+ RUN apt-get install git -y
+
+ RUN apt-get install -y \
+     build-essential \
+     libssl-dev \
+     ca-certificates \
+     libasound2 \
+     wget
+
+ # Download OpenSSL source, compile, and install it
+ RUN wget -O - https://www.openssl.org/source/openssl-1.1.1u.tar.gz | tar zxf -
+ WORKDIR openssl-1.1.1u
+ RUN ./config --prefix=/usr/local
+ RUN make -j $(nproc)
+ RUN make install_sw install_ssldirs
+ RUN ldconfig -v
+
+ # Set environment variables
+ ENV SSL_CERT_DIR=/etc/ssl/certs
+ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+ WORKDIR /code
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY ./app /code/app
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Vidverse
+ emoji: 🚀
+ colorFrom: green
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/SSML_Customization/Examples.xlsx ADDED
Binary file (11.6 kB)
app/SSML_Customization/Manual_Translations.csv ADDED
@@ -0,0 +1 @@
+ Original Text,Translated Text,Language Code
app/SSML_Customization/Phoneme_Pronunciation.csv ADDED
@@ -0,0 +1 @@
+ Text,Phonetic Pronunciation,Case Sensitive (True/False),Phonetic Alphabet
app/SSML_Customization/READ THIS.txt ADDED
@@ -0,0 +1,46 @@
+ This folder contains the following pronunciation customization files by default.
+
+ • dont_translate_phrases.txt
+     - You can add a list of phrases or words you do not want to be translated.
+     - This will work for both Google Translate and DeepL.
+
+ • interpret-as.csv (Azure Only)
+     - You can use SSML parameters to customize how specific words or phrases are pronounced.
+     - See this article for documentation: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#say-as-element
+     - Note: The script will match the phrases in the TRANSLATED text. You may therefore wish to also add these phrases to 'dont_translate_phrases.txt'.
+     - The first row contains the titles of each column - Do not change anything in the first row!
+     - Descriptions of each column:
+         • Text: The word or phrase that will be pronounced how you specify, if it is found in the text to be spoken
+         • interpret-as Type: The way in which the word/phrase will be pronounced. See documentation link above. (Some examples include: characters, cardinal, ordinal)
+         • Case Sensitive (True/False): Whether to only modify the pronunciation if the word/phrase matches exactly, including upper/lower case
+         • Format (Optional): Only applicable to some types, such as 'date', 'time', and others. Otherwise leave blank. See documentation link above for details
+     - See 'Example - interpret-as.csv' for an example of how to use this file
+     - This will only apply if using Azure TTS, not Google
+
+ • aliases.csv (Azure Only)
+     - Lets you effectively change what should be spoken instead of a certain word or phrase
+     - Example: If the text to be spoken contains "BTW", you can have it say "by the way"
+     - Note: It does NOT actually replace the text; it only changes how the voice will pronounce it
+     - The first row contains the titles of each column - Do not change anything in the first row!
+     - Description of each column:
+         - Original Text: The original word or phrase to match
+         - Alias: The word or phrase to speak instead of the original text
+         - Case Sensitive (True/False): Whether it must be an exact match, including upper/lower case. If nothing is entered, defaults to False
+     - This will only apply if using Azure TTS, not Google
+
+ • Manual_Translations.csv
+     - If you know you are going to use a word that gets incorrectly interpreted or translated, you can enter manual translations for any words in any language
+     - In Manual_Translations.csv, put the original text in the first column, your translation in the second, and the 2-letter language code for that entry in the third column
+
+
+ • url_list.txt
+     - If you have any URLs in the original text, you can put them as a list in this file
+     - This ensures the URL will not be translated, and also improves its pronunciation in the TTS stage
+     - It will only work on basic URLs, such as "example.com/test"; if a URL contains anything other than slashes, periods, and colons, it won't work
+     - See the notes at the top of the url_list.txt file for more details
+
+ • Phoneme_Pronunciation.csv
+     - Allows you to specify the exact phonetic pronunciation of words or phrases
+     - Note: This is different from 'aliases'. Using this requires using special phonetic alphabets (see links below)
+     - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-ssml-phonetic-sets
+     - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#phoneme-element
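
To make the interpret-as.csv format concrete, here is a minimal sketch of how such a file could be loaded and applied; it mirrors the approach used in app/scripts/TTS.py later in this commit, and the "NDA" row is a hypothetical example, not part of the repository.

import csv
import re

def load_interpret_as(path="interpret-as.csv"):
    # Each row: Text, interpret-as Type, Case Sensitive (True/False), Format (Optional)
    with open(path, newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))

def apply_interpret_as(text, entries):
    for entry in entries:
        attrs = f'interpret-as="{entry["interpret-as Type"]}"'
        if entry["Format (Optional)"]:
            attrs += f' format="{entry["Format (Optional)"]}"'
        flags = 0 if entry["Case Sensitive (True/False)"].lower() == "true" else re.IGNORECASE
        # Wrap whole-word matches of the phrase in a say-as tag.
        pattern = rf'(\b{re.escape(entry["Text"])}\b)'
        text = re.sub(pattern, rf'<say-as {attrs}>\1</say-as>', text, flags=flags)
    return text

# Hypothetical row: spell out "NDA" letter by letter.
entries = [{"Text": "NDA", "interpret-as Type": "characters",
            "Case Sensitive (True/False)": "True", "Format (Optional)": ""}]
print(apply_interpret_as("Please sign the NDA today.", entries))
# Please sign the <say-as interpret-as="characters">NDA</say-as> today.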
app/SSML_Customization/aliases.csv ADDED
@@ -0,0 +1 @@
+ Original Text,Alias,Case Sensitive (True/False)
app/SSML_Customization/dont_translate_phrases.txt ADDED
@@ -0,0 +1,3 @@
+ # Add one word or phrase per line that you do not want to be translated. The original word will be left as-is in the translated srt files.
+ # Don't include punctuation. This list will NOT be case sensitive
+ # Lines beginning with a # will be ignored
app/SSML_Customization/interpret-as.csv ADDED
@@ -0,0 +1 @@
+ Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
app/SSML_Customization/url_list.txt ADDED
@@ -0,0 +1,4 @@
+ # List any URLs that may appear in the original text, such as "google.com/example"
+ # This ensures they will not be translated, and will be spoken as words in the TTS stage
+ # Example: "google.com/example" becomes "google dot com slash example", which spoken in Spanish would be "google punto c o m diagonal example"
+ # The actual text in the subtitles will remain as "google.com/example", and only the spoken audio will change
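
For illustration, the snippet below runs the same say-as tagging regex that app/scripts/TTS.py (later in this commit) applies to entries from this file, so you can see what the TTS engine actually receives for a basic URL.

import re

# Wrap the TLD, dots, slashes, and colons in say-as "characters" tags so the
# voice spells them out instead of reading the URL as one word.
punctuation_regex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
url = "example.com/test"
tagged = re.sub(punctuation_regex, r'<say-as interpret-as="characters">\1</say-as>', url)
print(tagged)
# example<say-as interpret-as="characters">.com/</say-as>test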
app/__init__.py ADDED
@@ -0,0 +1 @@
+ from . import scripts
app/captioning/__init__.py ADDED
@@ -0,0 +1 @@
+ from .captioning import generate_sub
app/captioning/caption_helper.py ADDED
@@ -0,0 +1,156 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ from datetime import date, datetime, time, timedelta
+ from typing import List, Optional, Tuple
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import helper
+
+ class Caption(object) :
+     def __init__(self, language : Optional[str], sequence : int, begin : time, end : time, text : str) :
+         self.language = language
+         self.sequence = sequence
+         self.begin = begin
+         self.end = end
+         self.text = text
+
+ def get_captions(language : Optional[str], max_width : int, max_height : int, results : List[dict]) -> List[Caption] :
+     caption_helper = CaptionHelper(language, max_width, max_height, results)
+     return caption_helper.get_captions()
+
+ class CaptionHelper(object) :
+     def __init__(self, language : Optional[str], max_width : int, max_height : int, results : List[speechsdk.RecognitionResult]) :
+         self._language = language
+         self._max_width = max_width
+         self._max_height = max_height
+         self._results = results
+
+         self._first_pass_terminators = ["?", "!", ",", ";"]
+         self._second_pass_terminators = [" ", "."]
+
+         self._captions : List[Caption] = []
+
+         # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
+         if self._language is not None :
+             iso639 = self._language.split('-')[0]
+             if "zh" == iso639.lower() :
+                 self._first_pass_terminators = ["，", "、", "；", "？", "！", "?", "!", ",", ";"]
+                 self._second_pass_terminators = ["。", " "]
+                 if (helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width) :
+                     self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS
+
+     def get_captions(self) -> List[Caption] :
+         self.ensure_captions()
+         return self._captions
+
+     def ensure_captions(self) -> None :
+         if not self._captions :
+             self.add_captions_for_all_results()
+
+     def add_captions_for_all_results(self) -> None :
+         for result in self._results :
+             if result.offset <= 0 or not self.is_final_result(result) :
+                 continue
+             text = self.get_text_or_translation(result)
+             if not text :
+                 continue
+             self.add_captions_for_final_result(result, text)
+
+     def get_text_or_translation(self, result : speechsdk.RecognitionResult) -> Optional[str] :
+         return result.text
+
+         # 20220921 We do not use this for now because this sample
+         # does not handle TranslationRecognitionResults.
+         #if not self._language :
+         #    return result.text
+         #if type(result) is speechsdk.TranslationRecognitionResult and self._language in result.Translations :
+         #    return result.Translations[self._language]
+         #else :
+         #    return None
+
+     def add_captions_for_final_result(self, result : speechsdk.RecognitionResult, text : str) -> None :
+         caption_starts_at = 0
+         caption_lines : List[str] = []
+         index = 0
+         while (index < len(text)) :
+             index = self.skip_skippable(text, index)
+
+             line_length = self.get_best_width(text, index)
+             caption_lines.append(text[index:index + line_length].strip())
+             index += line_length
+
+             is_last_caption = index >= len(text)
+             max_caption_lines = len(caption_lines) >= self._max_height
+
+             add_caption = is_last_caption or max_caption_lines
+
+             if add_caption :
+                 caption_text = '\n'.join(caption_lines)
+                 caption_lines.clear()
+
+                 caption_sequence = len(self._captions) + 1
+                 is_first_caption = 0 == caption_starts_at
+
+                 caption_begin_and_end : Tuple[time, time]
+                 if is_first_caption and is_last_caption :
+                     caption_begin_and_end = self.get_full_caption_result_timing(result)
+                 else :
+                     caption_begin_and_end = self.get_partial_result_caption_timing(result, text, caption_text, caption_starts_at, index - caption_starts_at)
+
+                 self._captions.append(Caption(self._language, caption_sequence, caption_begin_and_end[0], caption_begin_and_end[1], caption_text))
+
+                 caption_starts_at = index
+
+     def get_best_width(self, text : str, start_index : int) -> int :
+         remaining = len(text) - start_index
+         best_width = remaining if remaining < self._max_width else self.find_best_width(self._first_pass_terminators, text, start_index)
+         if (best_width < 0) :
+             best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
+         if best_width < 0 :
+             best_width = self._max_width
+         return best_width
+
+     def find_best_width(self, terminators : List[str], text : str, start_at : int) -> int :
+         remaining = len(text) - start_at
+         check_chars = min(remaining, self._max_width)
+         best_width = -1
+         for terminator in terminators :
+             index = text.rfind(terminator, start_at, start_at + check_chars)
+             width = index - start_at
+             if width > best_width :
+                 best_width = width + len(terminator)
+         return best_width
+
+     def skip_skippable(self, text : str, start_index : int) -> int :
+         index = start_index
+         while len(text) > index and ' ' == text[index] :
+             index += 1
+         return index
+
+     def get_full_caption_result_timing(self, result : speechsdk.RecognitionResult) -> Tuple[time, time] :
+         begin = helper.time_from_ticks(result.offset)
+         end = helper.time_from_ticks(result.offset + result.duration)
+         return (begin, end)
+
+     def get_partial_result_caption_timing(self, result : speechsdk.RecognitionResult, text : str, caption_text : str, caption_starts_at : int, caption_length : int) -> Tuple[time, time] :
+         (result_begin, result_end) = self.get_full_caption_result_timing(result)
+         result_duration = helper.subtract_times(result_end, result_begin)
+         text_length = len(text)
+         partial_begin = helper.add_time_and_timedelta(result_begin, result_duration * caption_starts_at / text_length)
+         partial_end = helper.add_time_and_timedelta(result_begin, result_duration * (caption_starts_at + caption_length) / text_length)
+         return (partial_begin, partial_end)
+
+     def is_final_result(self, result : speechsdk.RecognitionResult) -> bool :
+         return speechsdk.ResultReason.RecognizedSpeech == result.reason or speechsdk.ResultReason.RecognizedIntent == result.reason or speechsdk.ResultReason.TranslatedSpeech == result.reason
+
+     def lines_from_text(self, text : str) -> List[str] :
+         retval : List[str] = []
+         index = 0
+         while (index < len(text)) :
+             index = self.skip_skippable(text, index)
+             line_length = self.get_best_width(text, index)
+             retval.append(text[index:index + line_length].strip())
+             index += line_length
+         return retval
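
The line-breaking logic can be exercised in isolation: lines_from_text only needs the language, width, height, and an empty results list, so no recognition session is required. A minimal sketch, assuming the app package and the azure-cognitiveservices-speech dependency are importable:

from app.captioning.caption_helper import CaptionHelper

# No recognition results are needed just to split text into caption lines.
helper = CaptionHelper(language="en-US", max_width=37, max_height=2, results=[])
text = "This is a long sentence that will not fit on a single caption line, so it gets split."
for line in helper.lines_from_text(text):
    print(line)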
app/captioning/captioning.py ADDED
@@ -0,0 +1,370 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ # Notes:
+ # - Install the Speech SDK. Run:
+ #   pip install azure-cognitiveservices-speech
+ # - The Python Speech SDK on Windows requires the Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017, 2019, or 2022 on the system. See:
+ #   https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstarts/setup-platform
+ # - Install gstreamer:
+ #   https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-use-codec-compressed-audio-input-streams
+
+ from datetime import datetime, time, timezone, timedelta
+ from itertools import groupby, pairwise
+ from os import linesep, remove, environ
+ from os.path import exists
+ from pathlib import Path
+ from sys import argv
+ from time import sleep
+ from typing import Any, List, Optional
+ import wave
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import caption_helper
+ from . import helper
+ from . import user_config_helper
+
+ USAGE = """Usage: python captioning.py [...]
+
+   HELP
+     --help                      Show this help and stop.
+
+   CONNECTION
+     --key KEY                   Your Azure Speech service resource key.
+                                 Overrides the SPEECH_KEY environment variable. You must set the environment variable (recommended) or use the `--key` option.
+     --region REGION             Your Azure Speech service region.
+                                 Overrides the SPEECH_REGION environment variable. You must set the environment variable (recommended) or use the `--region` option.
+                                 Examples: westus, eastus
+
+   LANGUAGE
+     --language LANG1            Specify language. This is used when breaking captions into lines.
+                                 Default value is en-US.
+                                 Examples: en-US, ja-JP
+
+   INPUT
+     --input FILE                Input audio from file (default input is the microphone.)
+     --format FORMAT             Use compressed audio format.
+                                 If this is not present, uncompressed format (wav) is assumed.
+                                 Valid only with --input.
+                                 Valid values: alaw, any, flac, mp3, mulaw, ogg_opus
+
+   MODE
+     --offline                   Output offline results.
+                                 Overrides --realTime.
+     --realTime                  Output real-time results.
+                                 Default output mode is offline.
+
+   ACCURACY
+     --phrases "PHRASE1;PHRASE2" Example: "Contoso;Jessie;Rehaan"
+
+   OUTPUT
+     --output FILE               Output captions to FILE.
+     --srt                       Output captions in SubRip Text format (default format is WebVTT.)
+     --maxLineLength LENGTH      Set the maximum number of characters per line for a caption to LENGTH.
+                                 Minimum is 20. Default is 37 (30 for Chinese).
+     --lines LINES               Set the number of lines for a caption to LINES.
+                                 Minimum is 1. Default is 2.
+     --delay MILLISECONDS        How many MILLISECONDS to delay the appearance of each caption.
+                                 Minimum is 0. Default is 1000.
+     --remainTime MILLISECONDS   How many MILLISECONDS a caption should remain on screen if it is not replaced by another.
+                                 Minimum is 0. Default is 1000.
+     --quiet                     Suppress console output, except errors.
+     --profanity OPTION          Valid values: raw, remove, mask
+                                 Default is mask.
+     --threshold NUMBER          Set stable partial result threshold.
+                                 Default is 3.
+ """
+
+ class Captioning(object) :
+     def __init__(self, language, input_audio, output) :
+         # self._user_config = user_config_helper.user_config_from_args(USAGE)
+         self._user_config = {
+             "language": language,
+             "captioning_mode": user_config_helper.CaptioningMode.OFFLINE, # or REALTIME if you prefer real-time mode
+             "input_file": input_audio,
+             "output_file": output,
+             "use_sub_rip_text_caption_format": True,
+             "use_compressed_audio": False,
+             "compressed_audio_format": speechsdk.AudioStreamContainerFormat.ANY,
+             "subscription_key" : environ.get("SPEECH_KEY"),
+             "region" : environ.get("SPEECH_REGION"),
+             "profanity_option" : speechsdk.ProfanityOption.Masked,
+             "phrases" : ["Contoso", "Jessie", "Rehaan"], # a list, so each entry is added to the phrase list grammar as a whole phrase
+             "suppress_console_output" : True,
+             "remain_time" : timedelta(milliseconds=1000),
+             "delay" : timedelta(milliseconds=1000),
+             "max_line_length" : helper.DEFAULT_MAX_LINE_LENGTH_SBCS,
+             "lines" : 2,
+             "stable_partial_result_threshold" : "3",
+         }
+         self._srt_sequence_number = 1
+         self._previous_caption : Optional[caption_helper.Caption] = None
+         self._previous_end_time : Optional[time] = None
+         self._previous_result_is_recognized = False
+         self._recognized_lines : List[str] = []
+         self._offline_results : List[speechsdk.SpeechRecognitionResult] = []
+
+     def get_timestamp(self, start : time, end : time) -> str :
+         time_format = ""
+         if self._user_config["use_sub_rip_text_caption_format"] :
+             # SRT format requires ',' as decimal separator rather than '.'.
+             time_format = "%H:%M:%S,%f"
+         else :
+             time_format = "%H:%M:%S.%f"
+         # Truncate microseconds to milliseconds.
+         return "{} --> {}".format(start.strftime(time_format)[:-3], end.strftime(time_format)[:-3])
+
+     def string_from_caption(self, caption : caption_helper.Caption) -> str :
+         retval = ""
+         if self._user_config["use_sub_rip_text_caption_format"] :
+             retval += str(caption.sequence) + linesep
+         retval += self.get_timestamp(caption.begin, caption.end) + linesep
+         retval += caption.text + linesep + linesep
+         return retval
+
+     def adjust_real_time_caption_text(self, text : str, is_recognized_result : bool) -> str :
+         # Split the caption text into multiple lines based on max_line_length and lines.
+         temp_caption_helper = caption_helper.CaptionHelper(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], [])
+         lines = temp_caption_helper.lines_from_text(text)
+
+         # Recognizing results can change with each new result, so we do not save previous Recognizing results.
+         # Recognized results are final, so we save them in a member value.
+         recognizing_lines : List[str] = []
+         if is_recognized_result :
+             self._recognized_lines = self._recognized_lines + lines
+         else :
+             recognizing_lines = lines
+
+         caption_lines = self._recognized_lines + recognizing_lines
+         return '\n'.join(caption_lines[-self._user_config["lines"]:])
+
+     def caption_from_real_time_result(self, result : speechsdk.SpeechRecognitionResult, is_recognized_result : bool) -> Optional[str] :
+         retval : Optional[str] = None
+
+         start_time = helper.time_from_ticks(result.offset)
+         end_time = helper.time_from_ticks(result.offset + result.duration)
+
+         # If the end timestamp for the previous result is later
+         # than the end timestamp for this result, drop the result.
+         # This sometimes happens when we receive a lot of Recognizing results close together.
+         if self._previous_end_time is not None and self._previous_end_time > end_time :
+             pass
+         else :
+             # Record the end timestamp for this result.
+             self._previous_end_time = end_time
+
+             # Convert the SpeechRecognitionResult to a caption.
+             # We are not ready to set the text for this caption.
+             # First we need to determine whether to clear _recognizedLines.
+             caption = caption_helper.Caption(self._user_config["language"], self._srt_sequence_number, helper.add_time_and_timedelta(start_time, self._user_config["delay"]), helper.add_time_and_timedelta(end_time, self._user_config["delay"]), "")
+             # Increment the sequence number.
+             self._srt_sequence_number += 1
+
+             # If we have a previous caption...
+             if self._previous_caption is not None :
+                 # If the previous result was type Recognized...
+                 if self._previous_result_is_recognized :
+                     # Set the end timestamp for the previous caption to the earliest of:
+                     # - The end timestamp for the previous caption plus the remain time.
+                     # - The start timestamp for the current caption.
+                     previous_end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
+                     self._previous_caption.end = previous_end if previous_end < caption.begin else caption.begin
+                     # If the gap between the original end timestamp for the previous caption
+                     # and the start timestamp for the current caption is larger than remainTime,
+                     # clear the cached recognized lines.
+                     # Note this needs to be done before we call AdjustRealTimeCaptionText
+                     # for the current caption, because it uses _recognizedLines.
+                     if previous_end < caption.begin :
+                         self._recognized_lines.clear()
+                 # If the previous result was type Recognizing, simply set the start timestamp
+                 # for the current caption to the end timestamp for the previous caption.
+                 # Note this presumes there will not be a large gap between Recognizing results,
+                 # because such a gap would cause the previous Recognizing result to be succeeded
+                 # by a Recognized result.
+                 else :
+                     caption.begin = self._previous_caption.end
+
+                 retval = self.string_from_caption(self._previous_caption)
+
+             # Break the caption text into lines if needed.
+             caption.text = self.adjust_real_time_caption_text(result.text, is_recognized_result)
+             # Save the current caption as the previous caption.
+             self._previous_caption = caption
+             # Save the result type as the previous result type.
+             self._previous_result_is_recognized = is_recognized_result
+
+         return retval
+
+     def captions_from_offline_results(self) -> List[caption_helper.Caption] :
+         captions = caption_helper.get_captions(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], list(self._offline_results))
+         # Save the last caption.
+         last_caption = captions[-1]
+         last_caption.end = helper.add_time_and_timedelta(last_caption.end, self._user_config["remain_time"])
+         # In offline mode, all captions come from RecognitionResults of type Recognized.
+         # Set the end timestamp for each caption to the earliest of:
+         # - The end timestamp for this caption plus the remain time.
+         # - The start timestamp for the next caption.
+         captions_2 : List[caption_helper.Caption] = []
+         for (caption_1, caption_2) in pairwise(captions) :
+             end = helper.add_time_and_timedelta(caption_1.end, self._user_config["remain_time"])
+             caption_1.end = end if end < caption_2.begin else caption_2.begin
+             captions_2.append(caption_1)
+         # Re-add the last caption.
+         captions_2.append(last_caption)
+         return captions_2
+
+     def finish(self) -> None :
+         if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
+             for caption in self.captions_from_offline_results() :
+                 helper.write_to_console_or_file(text=self.string_from_caption(caption), user_config=self._user_config)
+         elif user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
+             # Show the last "previous" caption, which is actually the last caption.
+             if self._previous_caption is not None :
+                 self._previous_caption.end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
+                 helper.write_to_console_or_file(text=self.string_from_caption(self._previous_caption), user_config=self._user_config)
+
+     def initialize(self) :
+         if self._user_config["output_file"] is not None and exists(self._user_config["output_file"]) :
+             remove(self._user_config["output_file"])
+         if not self._user_config["use_sub_rip_text_caption_format"] :
+             helper.write_to_console_or_file(text="WEBVTT{}{}".format(linesep, linesep), user_config=self._user_config)
+         return
+
+     def audio_config_from_user_config(self) -> helper.Read_Only_Dict :
+         if self._user_config["input_file"] is None :
+             return helper.Read_Only_Dict({
+                 "audio_config" : speechsdk.AudioConfig(use_default_microphone=True),
+                 "audio_stream_format" : None,
+                 "pull_input_audio_stream_callback" : None,
+                 "pull_input_audio_stream" : None
+             })
+         else :
+             audio_stream_format = None
+             if not self._user_config["use_compressed_audio"] :
+                 reader = wave.open(self._user_config["input_file"], mode=None)
+                 audio_stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=reader.getframerate(), bits_per_sample=reader.getsampwidth() * 8, channels=reader.getnchannels())
+                 reader.close()
+             else :
+                 audio_stream_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=self._user_config["compressed_audio_format"])
+             callback = helper.BinaryFileReaderCallback(filename=self._user_config["input_file"])
+             stream = speechsdk.audio.PullAudioInputStream(pull_stream_callback=callback, stream_format=audio_stream_format)
+             # We return the BinaryFileReaderCallback, AudioStreamFormat, and PullAudioInputStream
+             # because we need to keep them in scope until they are actually used.
+             return helper.Read_Only_Dict({
+                 "audio_config" : speechsdk.audio.AudioConfig(stream=stream),
+                 "audio_stream_format" : audio_stream_format,
+                 "pull_input_audio_stream_callback" : callback,
+                 "pull_input_audio_stream" : stream,
+             })
+
+     def speech_config_from_user_config(self) -> speechsdk.SpeechConfig :
+         speech_config = speechsdk.SpeechConfig(subscription=self._user_config["subscription_key"], region=self._user_config["region"])
+
+         speech_config.set_profanity(self._user_config["profanity_option"])
+
+         if self._user_config["stable_partial_result_threshold"] is not None :
+             speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_StablePartialResultThreshold, value=self._user_config["stable_partial_result_threshold"])
+
+         speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_PostProcessingOption, value="TrueText")
+         speech_config.speech_recognition_language = self._user_config["language"]
+
+         return speech_config
+
+     def speech_recognizer_from_user_config(self) -> helper.Read_Only_Dict :
+         audio_config_data = self.audio_config_from_user_config()
+         speech_config = self.speech_config_from_user_config()
+         speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config_data["audio_config"])
+
+         if len(self._user_config["phrases"]) > 0 :
+             grammar = speechsdk.PhraseListGrammar.from_recognizer(recognizer=speech_recognizer)
+             for phrase in self._user_config["phrases"] :
+                 grammar.addPhrase(phrase)
+
+         return helper.Read_Only_Dict({
+             "speech_recognizer" : speech_recognizer,
+             "audio_stream_format" : audio_config_data["audio_stream_format"],
+             "pull_input_audio_stream_callback" : audio_config_data["pull_input_audio_stream_callback"],
+             "pull_input_audio_stream" : audio_config_data["pull_input_audio_stream"],
+         })
+
+     def recognize_continuous(self, speech_recognizer : speechsdk.SpeechRecognizer, format : speechsdk.audio.AudioStreamFormat, callback : helper.BinaryFileReaderCallback, stream : speechsdk.audio.PullAudioInputStream) :
+         done = False
+
+         def recognizing_handler(e : speechsdk.SpeechRecognitionEventArgs) :
+             if speechsdk.ResultReason.RecognizingSpeech == e.result.reason and len(e.result.text) > 0 :
+                 # This seems to be the only way we can get information about
+                 # exceptions raised inside an event handler.
+                 try :
+                     caption = self.caption_from_real_time_result(e.result, False)
+                     if caption is not None :
+                         helper.write_to_console_or_file(text=caption, user_config=self._user_config)
+                 except Exception as ex :
+                     print('Exception in recognizing_handler: {}'.format(ex))
+             elif speechsdk.ResultReason.NoMatch == e.result.reason :
+                 helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)
+
+         def recognized_handler(e : speechsdk.SpeechRecognitionEventArgs) :
+             if speechsdk.ResultReason.RecognizedSpeech == e.result.reason and len(e.result.text) > 0 :
+                 try :
+                     if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
+                         self._offline_results.append(e.result)
+                     else :
+                         caption = self.caption_from_real_time_result(e.result, True)
+                         if caption is not None :
+                             helper.write_to_console_or_file(text=caption, user_config=self._user_config)
+                 except Exception as ex :
+                     print('Exception in recognized_handler: {}'.format(ex))
+             elif speechsdk.ResultReason.NoMatch == e.result.reason :
+                 helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)
+
+         def canceled_handler(e : speechsdk.SpeechRecognitionCanceledEventArgs) :
+             nonlocal done
+             # Notes:
+             # SpeechRecognitionCanceledEventArgs inherits the result property from SpeechRecognitionEventArgs. See:
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioncanceledeventargs
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioneventargs
+             # result is type SpeechRecognitionResult, which inherits the reason property from RecognitionResult. See:
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitionresult
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult
+             # e.result.reason is ResultReason.Canceled. To get the cancellation reason, see e.cancellation_details.reason.
+             if speechsdk.CancellationReason.EndOfStream == e.cancellation_details.reason :
+                 helper.write_to_console(text="End of stream reached.{}".format(linesep), user_config=self._user_config)
+                 done = True
+             elif speechsdk.CancellationReason.CancelledByUser == e.cancellation_details.reason :
+                 helper.write_to_console(text="User canceled request.{}".format(linesep), user_config=self._user_config)
+                 done = True
+             elif speechsdk.CancellationReason.Error == e.cancellation_details.reason :
+                 # Error output should not be suppressed, even if suppress output flag is set.
+                 print("Encountered error. Cancellation details: {}{}".format(e.cancellation_details, linesep))
+                 done = True
+             else :
+                 print("Request was cancelled for an unrecognized reason. Cancellation details: {}{}".format(e.cancellation_details, linesep))
+                 done = True
+
+         def stopped_handler(e : speechsdk.SessionEventArgs) :
+             nonlocal done
+             helper.write_to_console(text="Session stopped.{}".format(linesep), user_config=self._user_config)
+             done = True
+
+         # We only use Recognizing results in real-time mode.
+         if user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
+             speech_recognizer.recognizing.connect(recognizing_handler)
+         speech_recognizer.recognized.connect(recognized_handler)
+         speech_recognizer.session_stopped.connect(stopped_handler)
+         speech_recognizer.canceled.connect(canceled_handler)
+
+         speech_recognizer.start_continuous_recognition()
+
+         while not done :
+             sleep(5)
+         speech_recognizer.stop_continuous_recognition()
+
+         return
+
+ def generate_sub(language, input_file, output_file) :
+     captioning = Captioning(language=language, input_audio=input_file, output=output_file)
+     captioning.initialize()
+     speech_recognizer_data = captioning.speech_recognizer_from_user_config()
+     captioning.recognize_continuous(speech_recognizer=speech_recognizer_data["speech_recognizer"], format=speech_recognizer_data["audio_stream_format"], callback=speech_recognizer_data["pull_input_audio_stream_callback"], stream=speech_recognizer_data["pull_input_audio_stream"])
+     captioning.finish()
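
End to end, the module is driven through generate_sub. A minimal usage sketch, assuming the SPEECH_KEY and SPEECH_REGION environment variables are set and that audio.wav is an existing WAV file (both names are placeholders):

from app.captioning import generate_sub

# Transcribes audio.wav with Azure Speech and writes SRT captions to captions.srt.
# Console output is suppressed by the hardcoded config; only the file is written.
generate_sub("en-US", "audio.wav", "captions.srt")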
app/captioning/helper.py ADDED
@@ -0,0 +1,83 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ # Note: abc = abstract base classes
+ from collections.abc import Mapping
+ from datetime import date, datetime, time, timedelta
+ from sys import argv
+ from typing import Optional
+ from pathlib import Path
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+
+ DEFAULT_MAX_LINE_LENGTH_SBCS = 37
+ DEFAULT_MAX_LINE_LENGTH_MBCS = 30
+
+ # See speech_recognize_once_compressed_input() in:
+ # https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/python/console/speech_sample.py
+ class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
+     def __init__(self, filename: str):
+         super().__init__()
+         self._file_h = open(filename, "rb")
+
+     def read(self, buffer: memoryview) -> int:
+         try:
+             size = buffer.nbytes
+             frames = self._file_h.read(size)
+             buffer[:len(frames)] = frames
+             return len(frames)
+         except Exception as ex:
+             print('Exception in `read`: {}'.format(ex))
+             raise
+
+     def close(self) -> None:
+         print('closing file')
+         try:
+             self._file_h.close()
+         except Exception as ex:
+             print('Exception in `close`: {}'.format(ex))
+             raise
+
+ class Read_Only_Dict(Mapping):
+     def __init__(self, data):
+         self._data = data
+     def __getitem__(self, key):
+         return self._data[key]
+     def __len__(self):
+         return len(self._data)
+     def __iter__(self):
+         return iter(self._data)
+
+ # See:
+ # https://stackoverflow.com/a/12448721
+ # https://stackoverflow.com/a/39651061
+ def add_time_and_timedelta(t1 : time, t2 : timedelta) -> time :
+     return (datetime.combine(date.min, t1) + t2).time()
+
+ def subtract_times(t1 : time, t2 : time) -> timedelta :
+     return datetime.combine(date.min, t1) - datetime.combine(date.min, t2)
+
+ # We cannot simply create time with ticks.
+ def time_from_ticks(ticks) -> time :
+     microseconds_1 = ticks / 10
+     microseconds_2 = microseconds_1 % 1000000
+     seconds_1 = microseconds_1 / 1000000
+     seconds_2 = seconds_1 % 60
+     minutes_1 = seconds_1 / 60
+     minutes_2 = minutes_1 % 60
+     hours = minutes_1 / 60
+     return time(int(hours), int(minutes_2), int(seconds_2), int(microseconds_2))
+
+ def write_to_console(text : str, user_config : Read_Only_Dict) :
+     if not user_config["suppress_console_output"] :
+         print(text, end = "", flush = True)
+     return
+
+ def write_to_console_or_file(text : str, user_config : Read_Only_Dict) :
+     write_to_console(text = text, user_config = user_config)
+     if user_config["output_file"] is not None :
+         file_path = Path(user_config["output_file"])
+         with open(file_path, mode = "a", newline = "", encoding='utf-8') as f :
+             f.write(text)
+     return
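
The ticks in time_from_ticks are the Speech SDK's 100-nanosecond offset units (the code divides by 10 to get microseconds), so 10,000,000 ticks is one second. A quick check, assuming the package imports cleanly:

from datetime import timedelta
from app.captioning.helper import time_from_ticks, add_time_and_timedelta

t = time_from_ticks(10_000_000)   # 10,000,000 ticks * 100 ns = 1 s
print(t)                          # 00:00:01
print(add_time_and_timedelta(t, timedelta(milliseconds=500)))  # 00:00:01.500000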
app/captioning/user_config_helper.py ADDED
@@ -0,0 +1,133 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ from datetime import timedelta
+ from enum import Enum
+ from os import linesep, environ
+ from sys import argv
+ from typing import List, Optional
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import helper
+
+ class CaptioningMode(Enum):
+     OFFLINE = 1
+     REALTIME = 2
+
+ def get_cmd_option(option : str) -> Optional[str] :
+     argc = len(argv)
+     # Match the option case-insensitively, and look up its index the same way,
+     # so index() cannot raise when the user typed a different case.
+     lowered_args = list(map(lambda arg : arg.lower(), argv))
+     if option.lower() in lowered_args :
+         index = lowered_args.index(option.lower())
+         if index < argc - 1 :
+             # We found the option (for example, "--output"), so advance from that to the value (for example, "filename").
+             return argv[index + 1]
+         else :
+             return None
+     else :
+         return None
+
+ def cmd_option_exists(option : str) -> bool :
+     return option.lower() in list(map(lambda arg : arg.lower(), argv))
+
+ def get_language() -> str :
+     retval = "en-US"
+     language = get_cmd_option("--language")
+     if language is not None :
+         retval = language
+     return retval
+
+ def get_phrases() -> List[str] :
+     retval : List[str] = []
+     phrases = get_cmd_option("--phrases")
+     if phrases is not None :
+         retval = list(map(lambda phrase : phrase.strip(), phrases.split(';')))
+     return retval
+
+ def get_compressed_audio_format() -> speechsdk.AudioStreamContainerFormat :
+     value = get_cmd_option("--format")
+     if value is None :
+         return speechsdk.AudioStreamContainerFormat.ANY
+     else :
+         value = value.lower()
+         if "alaw" == value : return speechsdk.AudioStreamContainerFormat.ALAW
+         elif "flac" == value : return speechsdk.AudioStreamContainerFormat.FLAC
+         elif "mp3" == value : return speechsdk.AudioStreamContainerFormat.MP3
+         elif "mulaw" == value : return speechsdk.AudioStreamContainerFormat.MULAW
+         elif "ogg_opus" == value : return speechsdk.AudioStreamContainerFormat.OGG_OPUS
+         else : return speechsdk.AudioStreamContainerFormat.ANY
+
+ def get_profanity_option() -> speechsdk.ProfanityOption :
+     value = get_cmd_option("--profanity")
+     if value is None :
+         return speechsdk.ProfanityOption.Masked
+     else :
+         value = value.lower()
+         if "raw" == value : return speechsdk.ProfanityOption.Raw
+         elif "remove" == value : return speechsdk.ProfanityOption.Removed
+         else : return speechsdk.ProfanityOption.Masked
+
+ def user_config_from_args(usage : str) -> helper.Read_Only_Dict :
+     keyEnv = environ["SPEECH_KEY"] if "SPEECH_KEY" in environ else None
+     keyOption = get_cmd_option("--key")
+     key = keyOption if keyOption is not None else keyEnv
+     if key is None :
+         raise RuntimeError("Please set the SPEECH_KEY environment variable or provide a Speech resource key with the --key option.{}{}".format(linesep, usage))
+
+     regionEnv = environ["SPEECH_REGION"] if "SPEECH_REGION" in environ else None
+     regionOption = get_cmd_option("--region")
+     region = regionOption if regionOption is not None else regionEnv
+     if region is None :
+         raise RuntimeError("Please set the SPEECH_REGION environment variable or provide a Speech resource region with the --region option.{}{}".format(linesep, usage))
+
+     captioning_mode = CaptioningMode.REALTIME if cmd_option_exists("--realtime") and not cmd_option_exists("--offline") else CaptioningMode.OFFLINE
+
+     td_remain_time = timedelta(milliseconds=1000)
+     s_remain_time = get_cmd_option("--remainTime")
+     if s_remain_time is not None :
+         int_remain_time = float(s_remain_time)
+         if int_remain_time < 0 :
+             int_remain_time = 1000
+         td_remain_time = timedelta(milliseconds=int_remain_time)
+
+     td_delay = timedelta(milliseconds=1000)
+     s_delay = get_cmd_option("--delay")
+     if s_delay is not None :
+         int_delay = float(s_delay)
+         if int_delay < 0 :
+             int_delay = 1000
+         td_delay = timedelta(milliseconds=int_delay)
+
+     int_max_line_length = helper.DEFAULT_MAX_LINE_LENGTH_SBCS
+     s_max_line_length = get_cmd_option("--maxLineLength")
+     if s_max_line_length is not None :
+         int_max_line_length = int(s_max_line_length)
+         if int_max_line_length < 20 :
+             int_max_line_length = 20
+
+     int_lines = 2
+     s_lines = get_cmd_option("--lines")
+     if s_lines is not None :
+         int_lines = int(s_lines)
+         if int_lines < 1 :
+             int_lines = 2
+
+     return helper.Read_Only_Dict({
+         "use_compressed_audio" : cmd_option_exists("--format"),
+         "compressed_audio_format" : get_compressed_audio_format(),
+         "profanity_option" : get_profanity_option(),
+         "language" : get_language(),
+         "input_file" : get_cmd_option("--input"),
+         "output_file" : get_cmd_option("--output"),
+         "phrases" : get_phrases(),
+         "suppress_console_output" : cmd_option_exists("--quiet"),
+         "captioning_mode" : captioning_mode,
+         "remain_time" : td_remain_time,
+         "delay" : td_delay,
+         "use_sub_rip_text_caption_format" : cmd_option_exists("--srt"),
+         "max_line_length" : int_max_line_length,
+         "lines" : int_lines,
+         "stable_partial_result_threshold" : get_cmd_option("--threshold"),
+         "subscription_key" : key,
+         "region" : region,
+     })
app/constants.py ADDED
@@ -0,0 +1,29 @@
+ MALE_LANGUAGES = {
+     "hi": ["hi-IN", "hi-IN-MadhurNeural"],    # hindi
+     "bn": ["bn-IN", "bn-IN-BashkarNeural"],   # bengali
+     "en": ["en-IN", "en-IN-PrabhatNeural"],   # english
+     "gu": ["gu-IN", "gu-IN-NiranjanNeural"],  # gujarati
+     "kn": ["kn-IN", "kn-IN-GaganNeural"],     # kannada
+     "ml": ["ml-IN", "ml-IN-MidhunNeural"],    # malayalam
+     "mr": ["mr-IN", "mr-IN-ManoharNeural"],   # marathi
+     "ta": ["ta-IN", "ta-IN-ValluvarNeural"],  # tamil
+     "te": ["te-IN", "te-IN-MohanNeural"],     # telugu
+     "ur": ["ur-IN", "ur-IN-SalmanNeural"],    # urdu
+     "de": ["de-DE", "de-DE-ConradNeural"],    # german
+     "ja": ["ja-JP", "ja-JP-KeitaNeural"],     # japanese
+ }
+
+ FEMALE_LANGUAGES = {
+     "hi": ["hi-IN", "hi-IN-SwaraNeural"],     # hindi
+     "bn": ["bn-IN", "bn-IN-TanishaaNeural"],  # bengali
+     "en": ["en-IN", "en-IN-NeerjaNeural"],    # english
+     "gu": ["gu-IN", "gu-IN-DhwaniNeural"],    # gujarati
+     "kn": ["kn-IN", "kn-IN-SapnaNeural"],     # kannada
+     "ml": ["ml-IN", "ml-IN-SobhanaNeural"],   # malayalam
+     "mr": ["mr-IN", "mr-IN-AarohiNeural"],    # marathi
+     "ta": ["ta-IN", "ta-IN-PallaviNeural"],   # tamil
+     "te": ["te-IN", "te-IN-ShrutiNeural"],    # telugu
+     "ur": ["ur-IN", "ur-IN-GulNeural"],       # urdu
+     "de": ["de-DE", "de-DE-AmalaNeural"],     # german
+     "ja": ["ja-JP", "ja-JP-NanamiNeural"],    # japanese
+ }
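
Each entry is a [locale, Azure neural voice name] pair, and this is exactly how the URL handler below resolves a voice. A quick lookup sketch:

from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES

def voice_for(lang_code, gender):
    # Pick the table by gender, then unpack the [locale, voice name] pair.
    table = MALE_LANGUAGES if gender.lower() == "male" else FEMALE_LANGUAGES
    locale, voice_name = table[lang_code]
    return locale, voice_name

print(voice_for("ta", "female"))  # ('ta-IN', 'ta-IN-PallaviNeural')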
app/functions/__init__.py ADDED
File without changes
app/functions/helper.py ADDED
@@ -0,0 +1,25 @@
+ from subprocess import run, DEVNULL
+ from app.captioning import generate_sub
+
+ def download_video(link, output):
+     command = ["yt-dlp", "-f", "bv*[ext=mp4]", "-o", output, link]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+
+ def download_audio(link, output):
+     command = ["yt-dlp", "-f", "ba*[ext=m4a]", "-o", output, link]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+
+ def m4a_to_wav(input_video, output):
+     command = ["ffmpeg", "-i", input_video, output]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+     print(f"m4a to wav converted, Input: {input_video}, Output: {output}")
+
+
+ def audio_to_srt(language, audio_file, output):
+     generate_sub(language, audio_file, output)
+     print("audio to srt converted")
+
+ def merge_video_audio(video_file, audio_file, output):
+     command = ["ffmpeg", "-i", video_file, "-i", audio_file, "-c:v", "copy", "-c:a", "copy", output]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+     print(f"video and audio merged, Input: {video_file}, {audio_file}, Output: {output}")
app/functions/model.py ADDED
@@ -0,0 +1,18 @@
+ from pydantic import BaseModel
+ from fastapi import UploadFile
+
+ class VideoURL(BaseModel):
+     url: str
+     from_lang: str = "en"
+     to_lang: str = "hi"
+     gender: str = "MALE"
+
+
+ class VideoFile(BaseModel):
+     video: UploadFile
+     from_lang: str = "en"
+     to_lang: str = "hi"
+     gender: str = "MALE"
+
+ class YoutubeURL(BaseModel):
+     url: str
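
Because of the pydantic defaults, a request body only needs url; the other fields fall back to English source, Hindi target, and a male voice:

from app.functions.model import VideoURL

req = VideoURL(url="https://example.com/video.mp4")  # placeholder URL
print(req.from_lang, req.to_lang, req.gender)        # en hi MALE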
app/functions/s3_handler.py ADDED
@@ -0,0 +1,25 @@
+ import logging
+ import boto3
+ from botocore.exceptions import ClientError
+ import os
+
+ ACCESS_KEY_ID = os.environ.get("ACCESS_KEY_ID")
+ SECRET_ACCESS_KEY = os.environ.get("SECRET_ACCESS_KEY")
+
+ session = boto3.Session(ACCESS_KEY_ID, SECRET_ACCESS_KEY)
+
+ def upload_file(file_name, bucket, folder, object_name=None):
+     # If S3 object_name was not specified, use file_name
+     if object_name is None:
+         object_name = os.path.basename(file_name)
+
+     # Upload the file
+     s3_client = session.client('s3')
+     try:
+         response = s3_client.upload_file(file_name, bucket, f"{folder}/" + object_name)
+     except ClientError as e:
+         logging.error(e)
+         return False
+
+     url = f'{os.environ.get("RESULT_URL")}{folder}/{object_name}'
+     return url
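
Usage mirrors the call sites in video_url_handler.py below; the credentials come from the ACCESS_KEY_ID, SECRET_ACCESS_KEY, and RESULT_URL environment variables, and the folder name here is a placeholder:

from app.functions.s3_handler import upload_file

# Uploads subtitle.srt under the given bucket/folder and returns its public URL
# built from RESULT_URL, or False if the upload fails.
url = upload_file("subtitle.srt", "expressapi", "2024-01-01/abc123", "subtitle.srt")
print(url)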
app/functions/video_url_handler.py ADDED
@@ -0,0 +1,60 @@
+ from datetime import datetime
+ from uuid import uuid4
+ from tempfile import TemporaryDirectory
+ from .s3_handler import upload_file
+ from app.scripts import synthesise_audio
+ from .helper import download_audio, download_video, m4a_to_wav, audio_to_srt, merge_video_audio
+ from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES
+
+ def handler_video_url(url, from_lang, to_lang, gender):
+     with TemporaryDirectory(dir=".") as tempdir:
+         srt_file = f"{tempdir}/audio.srt"
+         video_file = f"{tempdir}/video.mp4"
+         audio_file = f"{tempdir}/audio.m4a"
+         audio_wav_file = f"{tempdir}/audio.wav"
+         translated_video = f"{tempdir}/translated_video.mp4"
+         download_audio(url, audio_file)
+         download_video(url, video_file)
+         m4a_to_wav(audio_file, audio_wav_file)
+         language_code = MALE_LANGUAGES[from_lang][0]
+         audio_to_srt(language_code, audio_wav_file, srt_file)
+
+         if gender.lower() == "male":
+             language_code = MALE_LANGUAGES[to_lang][0]
+             voice_name = MALE_LANGUAGES[to_lang][1]
+         else:
+             language_code = FEMALE_LANGUAGES[to_lang][0]
+             voice_name = FEMALE_LANGUAGES[to_lang][1]
+
+         result = synthesise_audio(
+             srt_file=srt_file,
+             video_file=video_file,
+             output_folder=tempdir,
+             language_code=language_code,
+             voice_name=voice_name,
+             from_lang=from_lang,
+             to_lang=to_lang,
+             gender=gender,
+         )
+         translated_srt = result["translated_subtitle"]
+         translated_audio = result["translated_audio"]
+         merge_video_audio(video_file, translated_audio, translated_video)
+
+         now = datetime.now()
+         today = now.strftime("%Y-%m-%d")
+         id = f"{today}/{str(uuid4()).replace('-', '')[:15]}"
+         srt_url = upload_file(srt_file, "expressapi", id, "subtitle.srt")
+         translated_srt_url = upload_file(
+             translated_srt, "expressapi", id, "translated_subtitle.srt"
+         )
+         translated_audio_url = upload_file(
+             translated_audio, "expressapi", id, "translated_audio.mp3"
+         )
+         translated_video_url = upload_file(translated_video, "expressapi", id, "translated_video.mp4")
+         return {
+             "srt_url": srt_url,
+             "video_url": translated_video_url,
+             "translated_srt_url": translated_srt_url,
+             "translated_audio_url": translated_audio_url,
+         }
+
app/functions/youtube_summarizer.py ADDED
@@ -0,0 +1,27 @@
+ import re, os
+ import requests
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+ API_URL = os.environ.get("SUMMARIZE_API_URL")
+ API_TOKEN = os.environ.get("SUMMARIZE_API_TOKEN")
+ headers = {"Authorization": f"Bearer {API_TOKEN}"}
+
+
+ def extract_video_id(youtube_url):
+     video_id_pattern = r"(?:/shorts/|v=)([a-zA-Z0-9_-]+)(?:&|\?|$)"
+     match = re.search(video_id_pattern, youtube_url)
+     if match:
+         video_id = match.group(1)
+         return video_id
+     else:
+         return None
+
+
+ def youtube_summarizer_handler(link):
+     video_id = extract_video_id(link)
+     subs = YouTubeTranscriptApi.get_transcript(video_id)
+     texts = " ".join([sub["text"] for sub in subs])
+     payload = {"inputs": texts}
+     response = requests.post(API_URL, headers=headers, json=payload)
+     summary = response.json()[0]
+     return summary
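
The regex accepts both watch and shorts URLs. A quick check with placeholder video IDs, assuming the module's dependencies are installed:

from app.functions.youtube_summarizer import extract_video_id

print(extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(extract_video_id("https://www.youtube.com/shorts/abc123XYZ"))     # abc123XYZ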
app/main.py ADDED
@@ -0,0 +1,24 @@
+ from fastapi import FastAPI
+ from .functions.video_url_handler import handler_video_url
+ from .functions.youtube_summarizer import youtube_summarizer_handler
+ from .functions.model import VideoURL, VideoFile, YoutubeURL
+
+
+ app = FastAPI()
+
+
+ @app.get("/")
+ async def home():
+     return {"health_check": "OK"}
+
+
+ @app.post("/synthesise_video_url")
+ async def synthesise_video_url(req: VideoURL):
+     response = handler_video_url(req.url, req.from_lang, req.to_lang, req.gender)
+     return response
+
+
+ @app.post("/youtube_summarizer")
+ async def youtube_summarizer(req: YoutubeURL):
+     response = youtube_summarizer_handler(req.url)
+     return response
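
A client-side sketch of calling the service once it is running (the Dockerfile serves it on port 7860; the host and source URL here are placeholders):

import requests

payload = {
    "url": "https://example.com/video.mp4",  # placeholder source video
    "from_lang": "en",
    "to_lang": "hi",
    "gender": "MALE",
}
resp = requests.post("http://localhost:7860/synthesise_video_url", json=payload)
print(resp.json())  # URLs for the subtitles, translated audio, and translated video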
app/scripts/TTS.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import json
+ import os
+ import time
+ import azure.cognitiveservices.speech as speechsdk
+ import datetime
+ import zipfile
+ import io
+ import copy
+ import re
+ from urllib.request import urlopen
+ from pathlib import Path
+
+ from . import azure_batch
+ from . import utils
+ from .utils import parseBool
+ # Get variables from config
+
+ # Get Azure variables if applicable
+ AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
+ AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')
+
+ azure_sentence_pause = 80
+ azure_comma_pause = 50
+ debug_mode = False
+ tts_service = 'azure'
+
+
+ # ======================================== Pronunciation Correction Functions ================================================
+ BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'
+
+ interpretAsOverrideFile = os.path.join(BASE_DIR, 'interpret-as.csv')
+ interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)
+
+ aliasOverrideFile = os.path.join(BASE_DIR, 'aliases.csv')
+ aliasEntries = utils.csv_to_dict(aliasOverrideFile)
+
+ urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
+ urlList = utils.txt_to_list(urlListFile)
+
+ phonemeFile = os.path.join(BASE_DIR, 'Phoneme_Pronunciation.csv')
+ phonemeEntries = utils.csv_to_dict(phonemeFile)
+
+ def add_all_pronunciation_overrides(text):
+     text = add_interpretas_tags(text)
+     text = add_alias_tags(text)
+     text = add_phoneme_tags(text)
+     return text
+
+ def add_interpretas_tags(text):
+     # Add interpret-as tags from interpret-as.csv
+     for entryDict in interpretAsEntries:
+         # Get entry info
+         entryText = entryDict['Text']
+         entryInterpretAsType = entryDict['interpret-as Type']
+         isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+         entryFormat = entryDict['Format (Optional)']
+
+         # Create the say-as tag
+         if entryFormat == "":
+             sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
+         else:
+             sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'
+
+         # Find and replace the word
+         findWordRegex = rf'(\b["\']?{entryText}[.,!?]?["\']?\b)' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text) # Uses a group reference, so the regex must be in parentheses
+         else:
+             text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=re.IGNORECASE)
+
+     # Add interpret-as tags from url_list.txt
+     for url in urlList:
+         # This regex matches the top-level domain extension and the punctuation before/after it, plus any periods, slashes, or colons
+         # It then wraps all matches in a say-as characters tag
+         punctuationRegex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
+         taggedURL = re.sub(punctuationRegex, r'<say-as interpret-as="characters">\1</say-as>', url)
+         # Replace any instances of the URL with the tagged version
+         text = text.replace(url, taggedURL)
+
+     return text
+
+ def add_alias_tags(text):
+     for entryDict in aliasEntries:
+         # Get entry info
+         entryText = entryDict['Original Text']
+         entryAlias = entryDict['Alias']
+         if entryDict['Case Sensitive (True/False)'] == "":
+             isCaseSensitive = False
+         else:
+             isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+
+         # Find and replace the word
+         findWordRegex = rf'\b["\'()]?{entryText}[.,!?()]?["\']?\b' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             text = re.sub(findWordRegex, rf'{entryAlias}', text)
+         else:
+             text = re.sub(findWordRegex, rf'{entryAlias}', text, flags=re.IGNORECASE)
+     return text
+
+
+ # Uses the phoneme pronunciation file to add phoneme tags to the text
+ def add_phoneme_tags(text):
+     for entryDict in phonemeEntries:
+         # Get entry info
+         entryText = entryDict['Text']
+         entryPhoneme = entryDict['Phonetic Pronunciation']
+         entryAlphabet = entryDict['Phonetic Alphabet']
+
+         if entryDict['Case Sensitive (True/False)'] == "":
+             isCaseSensitive = False
+         else:
+             isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+
+         # Find and replace the word
+         findWordRegex = rf'(\b["\'()]?{entryText}[.,!?()]?["\']?\b)' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             # Note: this branch hardcodes the "ipa" alphabet, while the case-insensitive branch below uses the entry's own alphabet
+             text = re.sub(findWordRegex, rf'<phoneme alphabet="ipa" ph="{entryPhoneme}">\1</phoneme>', text)
+         else:
+             text = re.sub(findWordRegex, rf'<phoneme alphabet="{entryAlphabet}" ph="{entryPhoneme}">\1</phoneme>', text, flags=re.IGNORECASE)
+     return text
+
+ # ================================================== Azure Functions =========================================================
+
+ def synthesize_text_azure(text, duration, voiceName, languageCode):
+
+     # Create a tag for the desired duration of the clip
+     durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'
+
+     # Create the string for sentence pauses, if not default
+     if not azure_sentence_pause == 'default':
+         sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
+     else:
+         sentencePauseTag = ''
+
+     # Create the string for comma pauses, if not default
+     if not azure_comma_pause == 'default':
+         commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
+     else:
+         commaPauseTag = ''
+
+     # Set the strings for the tags that zero out leading and trailing silence
+     leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
+     tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'
+
+     # Process the text using the pronunciation customizations set by the user
+     text = add_all_pronunciation_overrides(text)
+
+     # Create the SSML syntax for Azure TTS
+     ssml = f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' " \
+         "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
+         f"<voice name='{voiceName}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
+         f"{text}</voice></speak>"
+
+     speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
+     # For Azure voices, see: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts
+     speech_config.speech_synthesis_voice_name = voiceName
+     # For audio outputs, see: https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechsynthesisoutputformat?view=azure-python
+     speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
+     synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+
+     #result = synthesizer.speak_text_async(text).get()
+     result = synthesizer.speak_ssml_async(ssml).get()
+
+     stream = speechsdk.AudioDataStream(result)
+     return stream
+
+ def format_percentage_change(speedFactor):
+     # Determine the speedFactor value for Azure TTS. It should be either 'default' or a relative change.
+     if speedFactor == 1.0:
+         rate = 'default'
+     else:
+         # Determine whether to add a plus sign for the relative change; a negative sign is added automatically
+         if speedFactor >= 1.0:
+             percentSign = '+'
+         else:
+             percentSign = ''
+         # Convert the speedFactor float value to a relative percentage
+         rate = percentSign + str(round((speedFactor - 1.0) * 100, 5)) + '%'
+     return rate
+
+ def synthesize_text_azure_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
+
+     def create_request_payload(remainingEntriesDict):
+         # Create SSML for all subtitles
+         ssmlJson = []
+         payloadSizeInBytes = 0
+         payload = None # Initialized so the early return below cannot hit an unbound name if the very first entry is already too large
+         tempDict = dict(remainingEntriesDict) # Need to do this to avoid changing the original dict, which would mess with the loop
+
+         for key, value in tempDict.items():
+             text = tempDict[key]['translated_text']
+             duration = tempDict[key]['duration_ms_buffered']
+             language = langDict['languageCode']
+             voice = langDict['voiceName']
+
+             # Create a tag for the desired duration of the clip
+             durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'
+
+             # Create the string for sentence pauses, if not default
+             if not azure_sentence_pause == 'default':
+                 sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
+             else:
+                 sentencePauseTag = ''
+
+             # Create the string for comma pauses, if not default
+             if not azure_comma_pause == 'default':
+                 commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
+             else:
+                 commaPauseTag = ''
+
+             # Set the strings for the tags that zero out leading and trailing silence
+             leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
+             tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'
+
+             # Process the text using the pronunciation customizations set by the user
+             text = add_all_pronunciation_overrides(text)
+
+             # Create the SSML for each subtitle
+             ssml = f"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' " \
+                 "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
+                 f"<voice name='{voice}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
+                 f"{text}</voice></speak>"
+             ssmlJson.append({"text": ssml})
+
+             # Construct the request payload with the SSML
+             # Reconstruct the payload on every loop with the new SSML so that the payload size stays accurate
+             now = datetime.datetime.now()
+             pendingPayload = {
+                 'displayName': langDict['languageCode'] + '-' + now.strftime("%Y-%m-%d %H:%M:%S"),
+                 'description': 'Batch synthesis of ' + langDict['languageCode'] + ' subtitles',
+                 "textType": "SSML",
+                 # To use a custom voice, see the original example code script linked from azure_batch.py
+                 "inputs": ssmlJson,
+                 "properties": {
+                     "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
+                     "wordBoundaryEnabled": False,
+                     "sentenceBoundaryEnabled": False,
+                     "concatenateResult": False,
+                     "decompressOutputFiles": False
+                 },
+             }
+             # Azure TTS batch requests must be under 500 kilobytes, so check that the payload is under 500,000 bytes. Not sure if they actually mean kibibytes, so assume the worst case.
+             # The payload will be sent as json, so account for that with json.dumps(), otherwise the calculated size will be inaccurate
+             payloadSizeInBytes = len(str(json.dumps(pendingPayload)).encode('utf-8'))
+
+             if payloadSizeInBytes > 495000 or len(ssmlJson) > 995: # Leave some room for anything unexpected. Also, the number of inputs must be below 1000
+                 # If the payload would be too large, ignore the last entry and break out of the loop
+                 return payload, remainingEntriesDict
+             else:
+                 payload = copy.deepcopy(pendingPayload) # Must make a deepcopy, otherwise ssmlJson would be updated in both instead of just pendingPayload
+                 # Remove the entry from remainingEntriesDict once it has been added to the payload
+                 remainingEntriesDict.pop(key)
+
+
+         # If all the remaining entries fit, return the payload
+         return payload, remainingEntriesDict
+     # ------------------------- End create_request_payload() -----------------------------------
+
+
+     # Create payloads, split into multiple if necessary
+     payloadList = []
+     remainingPayloadEntriesDict = dict(subsDict) # Entries are removed as they are added to payloads
+     while len(remainingPayloadEntriesDict) > 0:
+         payloadToAppend, remainingPayloadEntriesDict = create_request_payload(remainingPayloadEntriesDict)
+         payloadList.append(payloadToAppend)
+
+     # Tell the user if the request will be broken up into multiple payloads
+     if len(payloadList) > 1:
+         print(f'Payload will be broken up into {len(payloadList)} requests (due to Azure size limitations).')
+
+     # Used to keep track of the filenames downloaded via separate zip files. Entries are removed as they are downloaded
+     remainingDownloadedEntriesList = list(subsDict.keys())
+
+     # Clear out workingFolder
+     for filename in os.listdir('workingFolder'):
+         if not debug_mode:
+             os.remove(os.path.join('workingFolder', filename))
+
+     # Loop through the payloads and submit them to Azure
+     for payload in payloadList:
+         # Reset job_id from previous loops
+         job_id = None
+
+         # Send the request to Azure
+         job_id = azure_batch.submit_synthesis(payload)
+
+         # Wait for the job to finish
+         if job_id is not None:
+             status = "Running"
+             resultDownloadLink = None
+
+             while True: # Must use break to exit the loop
+                 # Get the status
+                 response = azure_batch.get_synthesis(job_id)
+                 status = response.json()['status']
+                 if status == 'Succeeded':
+                     print('Batch synthesis job succeeded')
+                     resultDownloadLink = azure_batch.get_synthesis(job_id).json()['outputs']['result']
+                     break
+                 elif status == 'Failed':
+                     print('ERROR: Batch synthesis job failed!')
+                     print("Reason: " + response.reason)
+                     break
+                 else:
+                     print(f'Waiting for Azure batch synthesis job to finish. Status: [{status}]')
+                     time.sleep(5)
+
+             # Download the resulting zip file
+             if resultDownloadLink is not None:
+                 # Download the zip file
+                 urlResponse = urlopen(resultDownloadLink)
+
+                 # If debug mode, save the zip file to disk
+                 if debug_mode:
+                     if secondPass == False:
+                         zipName = 'azureBatch.zip'
+                     else:
+                         zipName = 'azureBatchPass2.zip'
+
+                     zipPath = os.path.join('workingFolder', zipName)
+                     with open(zipPath, 'wb') as f:
+                         f.write(urlResponse.read())
+                     # Reset urlResponse so it can be read again
+                     urlResponse = urlopen(resultDownloadLink)
+
+                 # Process the zip file
+                 virtualResultZip = io.BytesIO(urlResponse.read())
+                 zipdata = zipfile.ZipFile(virtualResultZip)
+                 zipinfos = zipdata.infolist()
+
+                 # Reorder zipinfos so the file names are in alphanumeric order
+                 zipinfos.sort(key=lambda x: x.filename)
+
+                 # Only extract the necessary files, and rename them while doing so
+                 for file in zipinfos:
+                     if file.filename == "summary.json":
+                         #zipdata.extract(file, 'workingFolder') # For debugging
+                         pass
+                     elif "json" not in file.filename:
+                         # Rename the file to match the first entry in remainingDownloadedEntriesList, then extract it
+                         currentFileNum = remainingDownloadedEntriesList[0]
+                         file.filename = str(currentFileNum) + '.mp3'
+                         #file.filename = file.filename.lstrip('0')
+
+                         # Add the file path to subsDict, then remove the entry from remainingDownloadedEntriesList
+                         subsDict[currentFileNum]['TTS_FilePath'] = os.path.join('workingFolder', str(currentFileNum)) + '.mp3'
+                         # Extract the file
+                         zipdata.extract(file, 'workingFolder')
+                         # Remove the entry from remainingDownloadedEntriesList
+                         remainingDownloadedEntriesList.pop(0)
+
+
+     return subsDict
+
+
+ def synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
+     if not skipSynthesize:
+         subsDict = synthesize_text_azure_batch(subsDict, langDict, skipSynthesize, secondPass)
+     return subsDict
+
+ def synthesize_dictionary(subsDict, langDict, outputFolder, skipSynthesize=False, secondPass=False):
+     for key, value in subsDict.items():
+         # TTS each subtitle text, write it to file, and write the filename into the dictionary
+         workingFolder = os.path.join(outputFolder, 'workingFolder')
+         filePath = os.path.join(workingFolder, f'{str(key)}.mp3')
+         filePathStem = os.path.join(workingFolder, f'{str(key)}')
+         if not skipSynthesize:
+
+             duration = value['duration_ms_buffered']
+
+             if secondPass:
+                 # Get the speed factor from subsDict
+                 speedFactor = subsDict[key]['speed_factor']
+             else:
+                 speedFactor = float(1.0)
+
+             # Prepare the output location. If the folder doesn't exist, create it
+             if not os.path.exists(os.path.dirname(filePath)):
+                 try:
+                     os.makedirs(os.path.dirname(filePath))
+                 except OSError:
+                     print("Error creating directory")
+
+
+             # If Azure TTS, use the Azure API
+             if tts_service == "azure":
+                 # The audio variable is an AudioDataStream object
+                 audio = synthesize_text_azure(value['translated_text'], duration, langDict['voiceName'], langDict['languageCode'])
+                 # Save to file using the save_to_wav_file method of the stream (despite the name, it writes the stream's bytes as-is, which are MP3-encoded here)
+                 audio.save_to_wav_file(filePath)
+
+                 # If debug mode, also write per-pass files after synthesis
+                 if debug_mode and secondPass == False:
+                     audio.save_to_wav_file(filePathStem + "_p1.mp3")
+                 elif debug_mode and secondPass == True:
+                     audio.save_to_wav_file(filePathStem + "_p2.mp3")
+
+         subsDict[key]['TTS_FilePath'] = filePath
+
+         # Get the key index
+         keyIndex = list(subsDict.keys()).index(key)
+         # Print progress, overwriting the line next time
+         if not secondPass:
+             print(f" Synthesizing TTS Line: {keyIndex+1} of {len(subsDict)}", end="\r")
+         else:
+             print(f" Synthesizing TTS Line (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+     print(" ") # Clear the line
+     return subsDict
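To make the SSML assembly in synthesize_text_azure concrete, here is roughly the document it builds for illustrative inputs (text="Hello there.", duration=2000, voiceName="hi-IN-MadhurNeural", languageCode="hi-IN", with the module defaults azure_sentence_pause=80 and azure_comma_pause=50):

    # <speak version='1.0' xml:lang='hi-IN' xmlns='http://www.w3.org/2001/10/synthesis'
    #        xmlns:mstts='http://www.w3.org/2001/mstts'>
    #   <voice name='hi-IN-MadhurNeural'>
    #     <mstts:silence type="Sentenceboundary-exact" value="80ms"/>
    #     <mstts:silence type="Comma-exact" value="50ms"/>
    #     <mstts:audioduration value="2000ms"/>
    #     <mstts:silence type="Leading-exact" value="0ms"/>
    #     <mstts:silence type="Tailing-exact" value="0ms"/>
    #     Hello there.
    #   </voice>
    # </speak>

Similarly, format_percentage_change maps speed factors to the relative rate strings Azure expects, e.g. format_percentage_change(1.25) -> '+25.0%' and format_percentage_change(0.9) -> '-10.0%'.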
app/scripts/__init__.py ADDED
@@ -0,0 +1,55 @@
+ from .audio import process_language
+ from .srt import parse_srt_file, get_duration
+ import langcodes
+ import pathlib
+ import os
+
+ def synthesise_audio(
+     srt_file,
+     video_file,
+     output_folder,
+     language_code="hi-IN",
+     voice_name="hi-IN-MadhurNeural",
+     from_lang="en",
+     to_lang="hi",
+     gender="MALE",
+ ):
+     langData = {
+         "synth_language_code": language_code,
+         "synth_voice_name": voice_name,
+         "translation_source_language": from_lang,
+         "translation_target_language": to_lang,
+         "synth_voice_gender": gender,
+         "translate_service": "azure",
+         "formality": None,
+     }
+
+     with open(srt_file, "r", encoding="utf-8-sig") as f:
+         originalSubLines = f.readlines()
+
+     originalLanguageSubsDict = parse_srt_file(originalSubLines)
+
+     totalAudioLength = get_duration(video_file)
+
+     # Use the video file name in the name of the translated srt file, along with the language's display name
+     lang = langcodes.get(to_lang).display_name()
+     translatedSrtFileName = pathlib.Path(video_file).stem + f" - {lang} - {to_lang}.srt"
+     # Set the path to save the translated srt file
+     translatedSrtFileName = f"{output_folder}/{translatedSrtFileName}"
+
+     # Resolve the display name of the synth language via its alpha-3 code
+     lang = langcodes.get(langData['synth_language_code'])
+     langName = langcodes.get(langData['synth_language_code']).get(lang.to_alpha3()).display_name()
+
+     outputFileName = pathlib.Path(video_file).stem + f" - {langName} - {langData['synth_language_code']}."
+     # Set the output path
+     outputFileName = os.path.join(output_folder, outputFileName)
+
+     process_language(
+         langData,
+         originalLanguageSubsDict,
+         totalAudioLength,
+         translatedSrtFileName,
+         outputFileName,
+         output_folder
+     )
+     return {"translated_subtitle": translatedSrtFileName, "translated_audio": outputFileName + "mp3"}
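A hedged usage sketch for this entry point (the paths are hypothetical placeholders):

    from app.scripts import synthesise_audio

    result = synthesise_audio(
        srt_file="input/lecture.srt",      # hypothetical path
        video_file="input/lecture.mp4",    # hypothetical path
        output_folder="output",
        from_lang="en",
        to_lang="hi",
    )
    # Expected shape: {"translated_subtitle": "output/....srt", "translated_audio": "output/....mp3"}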
app/scripts/audio.py ADDED
@@ -0,0 +1,62 @@
+ import copy
+ from .TTS import synthesize_dictionary_batch, synthesize_dictionary
+ from .translate import translate_dictionary
+ from .audio_builder import build_audio
+
+
+ original_language = "en"
+ batch_tts_synthesize = False
+ skip_translation = False
+ stop_after_translation = False
+ skip_synthesize = False
+
+ two_pass_voice_synth = False # Azure doesn't need two-pass voice synth, so disable it
+
+
+ def manually_prepare_dictionary(dictionaryToPrep):
+     ### Do additional processing to match the format produced by the translation function
+     # Create a new key 'translated_text' and set it to the value of 'text'
+     for key, value in dictionaryToPrep.items():
+         dictionaryToPrep[key]['translated_text'] = value['text']
+
+     # Convert the keys to integers and return the dictionary
+     return {int(k): v for k, v in dictionaryToPrep.items()}
+
+
+ # Process a language: Translate, Synthesize, and Build Audio
+ def process_language(langData, originalLanguageSubsDict, totalAudioLength, translatedSrtFileName, outputFileName, outputFolder):
+     langDict = {
+         'targetLanguage': langData['translation_target_language'],
+         'sourceLanguage': langData['translation_source_language'],
+         'voiceName': langData['synth_voice_name'],
+         'languageCode': langData['synth_language_code'],
+         'voiceGender': langData['synth_voice_gender'],
+         'translateService': langData['translate_service'],
+         'formality': langData['formality']
+     }
+
+     individualLanguageSubsDict = copy.deepcopy(originalLanguageSubsDict)
+
+     # Check for the special case where the original language is the same as the target language
+     if langDict['languageCode'].lower() == original_language.lower():
+         print("Original language is the same as the target language. Skipping translation.")
+         individualLanguageSubsDict = manually_prepare_dictionary(individualLanguageSubsDict)
+
+     elif skip_translation == False:
+         # Translate
+         individualLanguageSubsDict = translate_dictionary(individualLanguageSubsDict, langDict, translatedSrtFileName, skipTranslation=skip_translation)
+         if stop_after_translation:
+             print("Stopping at translation is enabled. Skipping TTS and building audio.")
+             return
+
+     # Synthesize
+     if batch_tts_synthesize == True:
+         individualLanguageSubsDict = synthesize_dictionary_batch(individualLanguageSubsDict, langDict, skipSynthesize=skip_synthesize)
+     else:
+         individualLanguageSubsDict = synthesize_dictionary(individualLanguageSubsDict, langDict, outputFolder, skipSynthesize=skip_synthesize)
+     print(individualLanguageSubsDict)  # Debug output of the subtitle dictionary after synthesis
+
+     # Build audio
+     individualLanguageSubsDict = build_audio(individualLanguageSubsDict, langDict, totalAudioLength, outputFileName, two_pass_voice_synth)
+
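A quick sketch of what manually_prepare_dictionary does when translation is skipped (values illustrative):

    before = {'1': {'text': 'Hello'}, '2': {'text': 'World'}}
    # manually_prepare_dictionary(before) ->
    # {1: {'text': 'Hello', 'translated_text': 'Hello'},
    #  2: {'text': 'World', 'translated_text': 'World'}}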
app/scripts/audio_builder.py ADDED
@@ -0,0 +1,181 @@
+ import soundfile
+ import pyrubberband
+ import pathlib
+ import os
+ import io
+
+
+ from . import TTS
+
+ from pydub import AudioSegment
+ from pydub.silence import detect_leading_silence
+ import langcodes
+
+
+ # Set working folder
+ workingFolder = "workingFolder"
+
+ synth_sample_rate = 24000
+ debug_mode = False
+ tts_service = "azure"
+ batch_tts_synthesize = False
+ skip_translation = False
+ stop_after_translation = False
+ skip_synthesize = False
+ force_stretch_with_twopass = False
+ output_format = "mp3"
+
+
+ def trim_clip(inputSound):
+     trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x):]
+     trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
+     strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))
+     strippedSound = strip_silence(inputSound)
+     return strippedSound
+
+ # Function to insert audio into the canvas at a specific point
+ def insert_audio(canvas, audioToOverlay, startTimeMs):
+     # Create a copy of the canvas
+     canvasCopy = canvas
+     # Overlay the audio onto the copy
+     canvasCopy = canvasCopy.overlay(audioToOverlay, position=int(startTimeMs))
+     # Return the copy
+     return canvasCopy
+
+ # Function to create a canvas of a specific duration in milliseconds
+ def create_canvas(canvasDuration, frame_rate=int(synth_sample_rate)):
+     canvas = AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)
+     return canvas
+
+ def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
+     virtualTempFile = AudioSegment.from_file(trimmedAudio, format="wav")
+     rawDuration = virtualTempFile.duration_seconds
+     trimmedAudio.seek(0) # This MUST be done to reset the file pointer to the start of the file, otherwise errors occur the next time the virtual file is accessed
+     # Calculate the speed factor and put it into the dictionary
+     desiredDuration = float(desiredDuration)
+     speedFactor = (rawDuration * 1000) / desiredDuration
+     subsDict[num]['speed_factor'] = speedFactor
+     return subsDict
+
+ def stretch_audio(audioFileToStretch, speedFactor, num):
+     virtualTempAudioFile = io.BytesIO()
+     # Read the raw audio data to stretch
+     y, sampleRate = soundfile.read(audioFileToStretch)
+
+     stretched_audio = pyrubberband.time_stretch(y, sampleRate, speedFactor, rbargs={'--fine': '--fine'}) # rbargs must be passed in this odd way because it demands a dictionary of two values
+     #soundfile.write(f'{workingFolder}\\temp_stretched.wav', stretched_audio, sampleRate)
+     soundfile.write(virtualTempAudioFile, stretched_audio, sampleRate, format='wav')
+     if debug_mode:
+         soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), stretched_audio, sampleRate) # For debugging, saves the stretched audio files
+     #return AudioSegment.from_file(f'{workingFolder}\\temp_stretched.wav', format="wav")
+     return AudioSegment.from_file(virtualTempAudioFile, format="wav")
+
+
+ def build_audio(subsDict, langDict, totalAudioLength, outputFileName, twoPassVoiceSynth=False):
+     if tts_service == 'azure':
+         twoPassVoiceSynth = False # Azure doesn't need two-pass voice synth, so disable it
+
+     virtualTrimmedFileDict = {}
+     # First trim silence off the audio files
+     for key, value in subsDict.items():
+         filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
+         subsDict[key]['TTS_FilePath_Trimmed'] = filePathTrimmed
+
+         # Trim the clip and re-write the file
+         rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
+         trimmedClip = trim_clip(rawClip)
+         if debug_mode:
+             trimmedClip.export(filePathTrimmed, format="wav")
+
+         # Create a virtual file in the dictionary with audio to be read later
+         tempTrimmedFile = io.BytesIO()
+         trimmedClip.export(tempTrimmedFile, format="wav")
+         virtualTrimmedFileDict[key] = tempTrimmedFile
+         keyIndex = list(subsDict.keys()).index(key)
+         print(f" Trimmed Audio: {keyIndex+1} of {len(subsDict)}", end="\r")
+     print("\n")
+
+     # Calculate speed factors if necessary. Azure doesn't need this, so skip it
+     if not tts_service == 'azure':
+         # Calculate speed factors for each clip, aka how much to stretch the audio
+         for key, value in subsDict.items():
+             #subsDict = get_speed_factor(subsDict, value['TTS_FilePath_Trimmed'], value['duration_ms'], num=key)
+             subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
+             keyIndex = list(subsDict.keys()).index(key)
+             print(f" Calculated Speed Factor: {keyIndex+1} of {len(subsDict)}", end="\r")
+         print("\n")
+
+     # If two-pass voice synth is enabled, have the API re-synthesize the clips at the new speed
+     # Azure allows direct specification of audio duration, so no need to re-synthesize
+     if twoPassVoiceSynth == True and not tts_service == 'azure':
+         if batch_tts_synthesize == True and tts_service == 'azure':
+             subsDict = TTS.synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)
+         else:
+             subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True) # Note: TTS.synthesize_dictionary also takes an outputFolder argument that is not passed here; this non-Azure path appears unused in this app
+
+         for key, value in subsDict.items():
+             # Trim the clip and re-write the file
+             rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
+             trimmedClip = trim_clip(rawClip)
+             if debug_mode:
+                 # Remove '.wav' from the end of the file path
+                 secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
+                 trimmedClip.export(secondPassTrimmedFile, format="wav")
+             trimmedClip.export(virtualTrimmedFileDict[key], format="wav")
+             keyIndex = list(subsDict.keys()).index(key)
+             print(f" Trimmed Audio (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+         print("\n")
+
+         if force_stretch_with_twopass == True:
+             for key, value in subsDict.items():
+                 subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
+                 keyIndex = list(subsDict.keys()).index(key)
+                 print(f" Calculated Speed Factor (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+             print("\n")
+
+     # Create the canvas to overlay audio onto
+     canvas = create_canvas(totalAudioLength)
+
+     # Stretch audio and insert it into the canvas
+     for key, value in subsDict.items():
+         if (not twoPassVoiceSynth or force_stretch_with_twopass == True) and not tts_service == 'azure': # Don't stretch if Azure is used
+             #stretchedClip = stretch_audio(value['TTS_FilePath_Trimmed'], speedFactor=subsDict[key]['speed_factor'], num=key)
+             stretchedClip = stretch_audio(virtualTrimmedFileDict[key], speedFactor=subsDict[key]['speed_factor'], num=key)
+         else:
+             #stretchedClip = AudioSegment.from_file(value['TTS_FilePath_Trimmed'], format="wav")
+             stretchedClip = AudioSegment.from_file(virtualTrimmedFileDict[key], format="wav")
+             virtualTrimmedFileDict[key].seek(0) # Not 100% sure if this is necessary, but it was in the other place it is used
+
+         canvas = insert_audio(canvas, stretchedClip, value['start_ms'])
+         keyIndex = list(subsDict.keys()).index(key)
+         print(f" Final Audio Processed: {keyIndex+1} of {len(subsDict)}", end="\r")
+     print("\n")
+
+
+     # Determine the string to use for the output format and file extension based on the config setting
+     outputFormat = output_format.lower()
+     if outputFormat == "mp3":
+         outputFileName += "mp3"
+         formatString = "mp3"
+     elif outputFormat == "wav":
+         outputFileName += "wav"
+         formatString = "wav"
+     elif outputFormat == "aac":
+         #outputFileName += "m4a"
+         #formatString = "mp4"
+         outputFileName += "aac"
+         formatString = "adts" # Pydub doesn't accept "aac" as a format, so use "adts" with the file extension "aac" (alternatively "mp4" with the extension "m4a")
+
+     canvas = canvas.set_channels(2) # Change from mono to stereo
+     try:
+         print("\nExporting audio file...")
+         canvas.export(outputFileName, format=formatString, bitrate="192k")
+     except Exception:
+         outputFileName = outputFileName + ".bak"
+         canvas.export(outputFileName, format=formatString, bitrate="192k")
+         print("\nThere was an issue exporting the audio; it might be a permission error. The file was saved as a backup with the extension .bak")
+         print("Try removing the .bak extension, then listen to the file to see if it worked.\n")
+         input("Press Enter to exit...")
+
+     return subsDict
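A worked example of the speed-factor arithmetic in get_speed_factor (numbers illustrative): a trimmed clip measuring 2.5 s that must fit a 2000 ms subtitle slot needs to play 1.25x faster.

    rawDuration = 2.5          # seconds, measured from the trimmed clip
    desiredDuration = 2000.0   # milliseconds, the subtitle slot
    speedFactor = (rawDuration * 1000) / desiredDuration  # -> 1.25
    # pyrubberband.time_stretch then compresses the clip by that factor (non-Azure services only)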
app/scripts/azure_batch.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # Based on Microsoft Azure sample code found here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch-synthesis/python/synthesis.py
+ # Original License Info Below:
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #--------------------------------------------------------------------------------------------------------
+ import os
+ import json
+ import logging
+ import sys
+
+ import requests
+
+
+ logging.basicConfig(stream=sys.stdout, level=logging.ERROR,
+                     format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")
+ logger = logging.getLogger(__name__)
+
+ # Your Speech resource key and region
+ # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
+
+ AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
+ AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')
+
+ NAME = "Simple synthesis"
+ DESCRIPTION = "Simple synthesis description"
+
+ # The service host suffix.
+ # For azure.cn the host suffix is "customvoice.api.speech.azure.cn"
+ SERVICE_HOST = "customvoice.api.speech.microsoft.com"
+
+
+ def submit_synthesis(payload):
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY,
+         'Content-Type': 'application/json'
+     }
+
+     response = requests.post(url, json.dumps(payload), headers=header)
+     if response.status_code < 400:
+         logger.info('Batch synthesis job submitted successfully')
+         logger.info(f'Job ID: {response.json()["id"]}')
+         return response.json()["id"]
+     else:
+         logger.error(f'Failed to submit batch synthesis job: {response.text}')
+
+
+ def get_synthesis(job_id):
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/{job_id}'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
+     }
+     response = requests.get(url, headers=header)
+     if response.status_code < 400:
+         logger.info('Got batch synthesis job successfully')
+         logger.info(response.json())
+         #return response.json()['status']
+         return response
+     else:
+         logger.error(f'Failed to get batch synthesis job: {response.text}')
+
+
+ def list_synthesis_jobs(skip: int = 0, top: int = 100):
+     """List all batch synthesis jobs in the subscription"""
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis?skip={skip}&top={top}'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
+     }
+     response = requests.get(url, headers=header)
+     if response.status_code < 400:
+         logger.info(f'Listed batch synthesis jobs successfully, got {len(response.json()["values"])} jobs')
+         logger.info(response.json())
+     else:
+         logger.error(f'Failed to list batch synthesis jobs: {response.text}')
+
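A minimal sketch of how these helpers are driven (this mirrors the polling loop in TTS.py; the payload is the batch request dict built there):

    import time

    job_id = submit_synthesis(payload)  # payload: dict with 'inputs', 'textType', 'properties', ...
    while job_id:
        status = get_synthesis(job_id).json()['status']
        if status in ('Succeeded', 'Failed'):
            break
        time.sleep(5)  # poll until the batch job settles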
app/scripts/azure_translate.py ADDED
@@ -0,0 +1,28 @@
+ import requests, uuid, json, os
+
+
+ def azure_translate_text(text_list, from_lang="en", to_lang="hi"):
+     TRANSLATE_API_ENDPOINT = os.environ.get("TRANSLATE_API_ENDPOINT")
+     url = f"{TRANSLATE_API_ENDPOINT}/translate"
+
+     params = {
+         'api-version': '3.0',
+         'from': from_lang,
+         'to': [to_lang]
+     }
+
+     TRANSLATE_KEY = os.environ.get("TRANSLATE_KEY")
+     LOCATION = os.environ.get("SPEECH_REGION")
+
+     headers = {
+         'Ocp-Apim-Subscription-Key': TRANSLATE_KEY,
+         'Ocp-Apim-Subscription-Region': LOCATION,
+         'Content-type': 'application/json',
+         'X-ClientTraceId': str(uuid.uuid4())
+     }
+     body = [{"text": text} for text in text_list]
+
+     response = requests.post(url, params=params, headers=headers, json=body)
+     result = response.json()
+     # Keep only the first translation of each item
+     result = [{"text": item["translations"][0]["text"]} for item in result]
+     return result
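For orientation, the Translator v3 wire shapes this function assumes (translation content illustrative):

    # Request body sent:  [{"text": "Good morning"}]
    # Raw JSON response:  [{"translations": [{"text": "सुप्रभात", "to": "hi"}]}]
    # Value returned:     [{"text": "सुप्रभात"}]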
app/scripts/srt.py ADDED
@@ -0,0 +1,86 @@
+ import re
+
+ def parse_srt_file(srtFileLines, preTranslated=False):
+     # Matches timestamp lines like the following example: 00:00:20,130 --> 00:00:23,419
+     subtitleTimeLineRegex = re.compile(r'\d\d:\d\d:\d\d,\d\d\d --> \d\d:\d\d:\d\d,\d\d\d')
+
+     # Create a dictionary
+     subsDict = {}
+
+     # Will add this many milliseconds of extra silence before and after each audio clip / spoken subtitle line
+     addBufferMilliseconds = 0
+
+     # Enumerate the lines; when a line contains only an integer, use that number as the key and a dictionary as the value
+     # The dictionary contains the start, end, and duration of the subtitle as well as its text
+     # The next line uses the syntax HH:MM:SS,MMM --> HH:MM:SS,MMM; the difference between the two times goes into the dictionary
+     # The line after that holds the subtitle text
+     for lineNum, line in enumerate(srtFileLines):
+         line = line.strip()
+         if line.isdigit() and subtitleTimeLineRegex.match(srtFileLines[lineNum + 1]):
+             lineWithTimestamps = srtFileLines[lineNum + 1].strip()
+             lineWithSubtitleText = srtFileLines[lineNum + 2].strip()
+
+             # If there are more lines after the subtitle text, add them to the text
+             count = 3
+             while True:
+                 # Check if the next line is blank or not
+                 if (lineNum + count) < len(srtFileLines) and srtFileLines[lineNum + count].strip():
+                     lineWithSubtitleText += ' ' + srtFileLines[lineNum + count].strip()
+                     count += 1
+                 else:
+                     break
+
+             # Create an empty dictionary with keys for the start and end times and the subtitle text
+             subsDict[line] = {'start_ms': '', 'end_ms': '', 'duration_ms': '', 'text': '', 'break_until_next': '', 'srt_timestamps_line': lineWithTimestamps}
+
+             time = lineWithTimestamps.split(' --> ')
+             time1 = time[0].split(':')
+             time2 = time[1].split(':')
+
+             # Convert the times to milliseconds
+             processedTime1 = int(time1[0]) * 3600000 + int(time1[1]) * 60000 + int(time1[2].split(',')[0]) * 1000 + int(time1[2].split(',')[1]) #/ 1000 #Uncomment to turn into seconds
+             processedTime2 = int(time2[0]) * 3600000 + int(time2[1]) * 60000 + int(time2[2].split(',')[0]) * 1000 + int(time2[2].split(',')[1]) #/ 1000 #Uncomment to turn into seconds
+             timeDifferenceMs = str(processedTime2 - processedTime1)
+
+             # Adjust the times with the buffer
+             if addBufferMilliseconds > 0 and not preTranslated:
+                 subsDict[line]['start_ms_buffered'] = str(processedTime1 + addBufferMilliseconds)
+                 subsDict[line]['end_ms_buffered'] = str(processedTime2 - addBufferMilliseconds)
+                 subsDict[line]['duration_ms_buffered'] = str((processedTime2 - addBufferMilliseconds) - (processedTime1 + addBufferMilliseconds))
+             else:
+                 subsDict[line]['start_ms_buffered'] = str(processedTime1)
+                 subsDict[line]['end_ms_buffered'] = str(processedTime2)
+                 subsDict[line]['duration_ms_buffered'] = str(processedTime2 - processedTime1)
+
+             # Set the keys in the dictionary to the values
+             subsDict[line]['start_ms'] = str(processedTime1)
+             subsDict[line]['end_ms'] = str(processedTime2)
+             subsDict[line]['duration_ms'] = timeDifferenceMs
+             subsDict[line]['text'] = lineWithSubtitleText
+             if lineNum > 0:
+                 # Go back to the previous subtitle's dictionary and write the gap before the current line
+                 subsDict[str(int(line) - 1)]['break_until_next'] = processedTime1 - int(subsDict[str(int(line) - 1)]['end_ms'])
+             else:
+                 subsDict[line]['break_until_next'] = 0
+
+
+     # Apply the buffer to the start and end times by copying the buffered values over the main values
+     if addBufferMilliseconds > 0 and not preTranslated:
+         for key, value in subsDict.items():
+             subsDict[key]['start_ms'] = value['start_ms_buffered']
+             subsDict[key]['end_ms'] = value['end_ms_buffered']
+             subsDict[key]['duration_ms'] = value['duration_ms_buffered']
+
+     return subsDict
+
+
+ def get_duration(filename):
+     import subprocess, json
+     # Ask ffprobe for the container duration as JSON
+     result = subprocess.check_output(f'ffprobe -i {filename} -show_entries format=duration -v quiet -of json', shell=True).decode()
+
+     try:
+         duration = json.loads(result)['format']["duration"]
+     except KeyError:
+         print("Error: Could not get duration of video file. Please check the file path and try again.")
+         raise
+     durationMS = round(float(duration) * 1000) # Convert to milliseconds
+     return durationMS
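A worked example of the timestamp conversion in parse_srt_file: "00:01:05,250" becomes 0*3600000 + 1*60000 + 5*1000 + 250 = 65250 ms.

    h, m, s_ms = "00:01:05,250".split(':')
    s, ms = s_ms.split(',')
    total_ms = int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)  # -> 65250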
app/scripts/translate.py ADDED
@@ -0,0 +1,402 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: UTF-8 -*-
3
+
4
+ # Imports
5
+ import re, regex
6
+ from . import utils
7
+ from .azure_translate import azure_translate_text
8
+
9
+
10
+ from operator import itemgetter
11
+ import sys
12
+ import copy
13
+ import os
14
+ import html
15
+ from pathlib import Path
16
+
17
+
18
+ combine_subtitles_max_chars = 200
19
+ translate_service = 'azure'
20
+ # -------------------------------- No Translate and Manual Translation Functions -----------------------------------
21
+ BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'
22
+
23
+ # Import files and put into dictionaries
24
+ noTranslateOverrideFile = os.path.join(BASE_DIR, 'dont_translate_phrases.txt')
25
+ dontTranslateList = utils.txt_to_list(noTranslateOverrideFile)
26
+ manualTranslationOverrideFile = os.path.join(BASE_DIR, 'Manual_Translations.csv')
27
+ manualTranslationsDict = utils.csv_to_dict(manualTranslationOverrideFile)
28
+ urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
29
+ urlList = utils.txt_to_list(urlListFile)
30
+
31
+ # Add span tags around certain words to exclude them from being translated
32
+ def add_notranslate_tags_from_notranslate_file(text, phraseList):
33
+ for word in phraseList:
34
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{word}[.,!?()]?["\']?)(\p{{Z}}|$)' #\p ensures it works with unicode characters
35
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
36
+ # Find the word, with optional punctuation after, and optional quotes before or after
37
+ text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
38
+ return text
39
+
40
+ def remove_notranslate_tags(text):
41
+ text = text.replace('<span class="notranslate">', '').replace('</span>', '')
42
+ return text
43
+
44
+ def add_notranslate_tags_for_manual_translations(text, langcode):
45
+ for manualTranslatedText in manualTranslationsDict:
46
+ # Only replace text if the language matches the entry in the manual translations file
47
+ if manualTranslatedText['Language Code'] == langcode:
48
+ originalText = manualTranslatedText['Original Text']
49
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
50
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
51
+ text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
52
+ return text
53
+
54
+ # Replace certain words or phrases with their manual translation
55
+ def replace_manual_translations(text, langcode):
56
+ for manualTranslatedText in manualTranslationsDict:
57
+ # Only replace text if the language matches the entry in the manual translations file
58
+ if manualTranslatedText['Language Code'] == langcode:
59
+ originalText = manualTranslatedText['Original Text']
60
+ translatedText = manualTranslatedText['Translated Text']
61
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
62
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
63
+ # Substitute the matched word with the translated text
64
+ text = findWordRegexCompiled.sub(rf'\1{translatedText}\3', text)
65
+ return text
66
+
67
+
68
+
69
+ #======================================== Translate Text ================================================
70
+ # Note: This function was almost entirely written by GPT-3 after feeding it my original code and asking it to change it so it
71
+ # would break up the text into chunks if it was too long. It appears to work
72
+
73
+ def process_response_text(text, targetLanguage):
74
+ text = html.unescape(text)
75
+ text = remove_notranslate_tags(text)
76
+ text = replace_manual_translations(text, targetLanguage)
77
+ return text
78
+
79
+ def split_transcript_chunks(text, max_length=5000):
80
+ # Calculate the total number of utf-8 codepoints
81
+ #totalCodepoints = len(text.encode("utf-8"))
82
+
83
+ # Split the transcript into sentences
84
+ sentences = re.split(r'(?<=[.!?])\s+', text)
85
+
86
+ # Initialize a list to store the chunks of text
87
+ chunks = []
88
+
89
+ # Initialize a string to store a chunk of text
90
+ chunk = ""
91
+
92
+ # For each sentence in the list of sentences
93
+ for sentence in sentences:
94
+ # If adding the sentence to the chunk would keep it within the maximum length
95
+ if len(chunk.encode("utf-8")) + len(sentence.encode("utf-8")) + 1 <= max_length: # Adding 1 to account for space
96
+ # Add the sentence to the chunk
97
+ chunk += sentence + " "
98
+ else:
99
+ # If adding the sentence would exceed the maximum length and chunk is not empty
100
+ if chunk:
101
+ # Add the chunk to the list of chunks
102
+ chunks.append(chunk.strip())
103
+ # Start a new chunk with the current sentence
104
+ chunk = sentence + " "
105
+
106
+ # Add the last chunk to the list of chunks (if it's not empty)
107
+ if chunk:
108
+ chunks.append(chunk.strip())
109
+
110
+ # Return the list of chunks
111
+ return chunks
112
+
113
+ def convertChunkListToCompatibleDict(chunkList):
114
+ # Create dictionary with numbers as keys and chunks as values
115
+ chunkDict = {}
116
+ for i, chunk in enumerate(chunkList, 1):
117
+ chunkDict[i] = {'text': chunk}
118
+ return chunkDict
119
+
120
+
121
+ # Translate the text entries of the dictionary
122
+ def translate_dictionary(inputSubsDict, langDict, translatedSrtFileName, skipTranslation=False, ):
123
+ targetLanguage = langDict['targetLanguage']
124
+ sourceLanguage = langDict['sourceLanguage']
125
+ translateService = langDict['translateService']
126
+
127
+ # Create a container for all the text to be translated
128
+ textToTranslate = []
129
+
130
+ for key in inputSubsDict:
131
+ originalText = inputSubsDict[key]['text']
132
+ # Add any 'notranslate' tags to the text
133
+ processedText = add_notranslate_tags_from_notranslate_file(originalText, dontTranslateList)
134
+ processedText = add_notranslate_tags_from_notranslate_file(processedText, urlList)
135
+ processedText = add_notranslate_tags_for_manual_translations(processedText, targetLanguage)
136
+
137
+ # Add the text to the list of text to be translated
138
+ textToTranslate.append(processedText)
139
+
140
+ # Calculate the total number of utf-8 codepoints
141
+ codepoints = 0
142
+ for text in textToTranslate:
143
+ codepoints += len(text.encode("utf-8"))
144
+
145
+ # If the codepoints are greater than 28000, split the request into multiple
146
+ # Google's API limit is 30000 Utf-8 codepoints per request, while DeepL's is 130000, but we leave some room just in case
147
+ if skipTranslation == False:
148
+ if translateService == 'azure':
149
+ print("Translating text using Azure...")
150
+ result = azure_translate_text(textToTranslate, sourceLanguage, targetLanguage)
151
+
152
+ # Add the translated texts to the dictionary
153
+ for i, key in enumerate(inputSubsDict):
154
+ inputSubsDict[key]['translated_text'] = process_response_text(result[i]["text"], targetLanguage)
155
+ # Print progress, overwrite the same line
156
+ print(f' Translated: {key} of {len(inputSubsDict)}', end='\r')
157
+ else:
158
+ print("Error: Invalid translate_service setting. Only 'Azure' is supported.")
159
+ sys.exit()
160
+ else:
161
+ for key in inputSubsDict:
162
+ inputSubsDict[key]['translated_text'] = process_response_text(inputSubsDict[key]['text'], targetLanguage) # Skips translating, such as for testing
163
+ print(" ")
164
+
165
+
166
+ combinedProcessedDict = combine_subtitles_advanced(inputSubsDict, int(combine_subtitles_max_chars))
167
+
168
+ if skipTranslation == False:
169
+ # Write new srt file with translated text
170
+ with open(translatedSrtFileName, 'w', encoding='utf-8-sig') as f:
171
+ for key in combinedProcessedDict:
172
+ f.write(str(key) + '\n')
173
+ f.write(combinedProcessedDict[key]['srt_timestamps_line'] + '\n')
174
+ f.write(combinedProcessedDict[key]['translated_text'] + '\n')
175
+ f.write('\n')
176
+
177
+ return combinedProcessedDict
178
+
179
+
180
+ ##### Add additional info to the dictionary for each language #####
181
+ def set_translation_info(languageBatchDict):
182
+ newBatchSettingsDict = copy.deepcopy(languageBatchDict)
183
+
184
+ # If using Azure, set all languages to use Azure in dictionary
185
+ if translate_service == 'azure':
186
+ for langNum, langInfo in languageBatchDict.items():
187
+ newBatchSettingsDict[langNum]['translate_service'] = 'azure'
188
+ newBatchSettingsDict[langNum]['formality'] = None
189
+
190
+ else:
191
+ print("Error: No valid translation service selected. Please choose a valid service or enable 'skip_translation' in config.")
192
+ sys.exit()
193
+
194
+ return newBatchSettingsDict
195
+
196
+
197
+ #======================================== Combine Subtitle Lines ================================================
198
+ def combine_subtitles_advanced(inputDict, maxCharacters=200):
199
+ charRateGoal = 20 #20
200
+ gapThreshold = 100 # The maximum gap between subtitles to combine
201
+ noMorePossibleCombines = False
202
+ # Convert dictionary to list of dictionaries of the values
203
+ entryList = []
204
+
205
+ for key, value in inputDict.items():
206
+ value['originalIndex'] = int(key)-1
207
+ entryList.append(value)
208
+
209
+ while not noMorePossibleCombines:
210
+ entryList, noMorePossibleCombines = combine_single_pass(entryList, charRateGoal, gapThreshold, maxCharacters)
211
+
212
+ # Convert the list back to a dictionary then return it
213
+ return dict(enumerate(entryList, start=1))
214
+
215
+ def combine_single_pass(entryListLocal, charRateGoal, gapThreshold, maxCharacters):
216
+ # Want to restart the loop if a change is made, so use this variable, otherwise break only if the end is reached
217
+ reachedEndOfList = False
218
+ noMorePossibleCombines = True # Will be set to False if a combination is made
219
+
220
+ # Use while loop because the list is being modified
221
+ while not reachedEndOfList:
222
+
223
+ # Need to update original index in here
224
+ for entry in entryListLocal:
225
+ entry['originalIndex'] = entryListLocal.index(entry)
226
+
227
+ # Will use later to check if an entry is the last one in the list, because the last entry will have originalIndex equal to the length of the list - 1
228
+ originalNumberOfEntries = len(entryListLocal)
229
+
230
+ # Need to calculate the char_rate for each entry, any time something changes, so put it at the top of this loop
231
+ entryListLocal = calc_list_speaking_rates(entryListLocal, charRateGoal)
232
+
233
+ # Sort the list by the difference in speaking speed from charRateGoal
234
+ priorityOrderedList = sorted(entryListLocal, key=itemgetter('char_rate_diff'), reverse=True)
235
+
236
+ # Iterates through the list in order of priority, and uses that index to operate on entryListLocal
237
+ # For loop is broken after a combination is made, so that the list can be re-sorted and re-iterated
238
+ for progress, data in enumerate(priorityOrderedList):
239
+ i = data['originalIndex']
240
+ # Check if last entry, and therefore will end loop when done with this iteration
241
+ if progress == len(priorityOrderedList) - 1:
242
+ reachedEndOfList = True
243
+
244
+ # Check if the current entry is outside the upper and lower bounds
245
+ if (data['char_rate'] > charRateGoal or data['char_rate'] < charRateGoal):
246
+
247
+ # Check if the entry is the first in entryListLocal, if so do not consider the previous entry
248
+ if data['originalIndex'] == 0:
249
+ considerPrev = False
250
+ else:
251
+ considerPrev = True
252
+
253
+ # Check if the entry is the last in entryListLocal, if so do not consider the next entry
254
+ if data['originalIndex'] == originalNumberOfEntries - 1:
255
+ considerNext = False
256
+ else:
257
+ considerNext = True
258
+
259
+ # Check if current entry is still in the list - if it has been combined with another entry, it will not be
260
+
261
+
262
+ # Get the char_rate of the next and previous entries, if they exist, and calculate the difference
263
+ # If the diff is positive, then it is lower than the current char_rate
264
+ try:
265
+ nextCharRate = entryListLocal[i+1]['char_rate']
266
+ nextDiff = data['char_rate'] - nextCharRate
267
+ except IndexError:
268
+ considerNext = False
269
+ nextCharRate = None
270
+ nextDiff = None
271
+ try:
272
+ prevCharRate = entryListLocal[i-1]['char_rate']
273
+ prevDiff = data['char_rate'] - prevCharRate
274
+ except IndexError:
275
+ considerPrev = False
276
+ prevCharRate = None
277
+ prevDiff = None
278
+
279
+ else:
280
+ continue
281
+
282
+ # Define functions for combining with previous or next entries - Generated with copilot, it's possible this isn't perfect
283
+ def combine_with_next():
284
+ entryListLocal[i]['text'] = entryListLocal[i]['text'] + ' ' + entryListLocal[i+1]['text']
285
+ entryListLocal[i]['translated_text'] = entryListLocal[i]['translated_text'] + ' ' + entryListLocal[i+1]['translated_text']
286
+ entryListLocal[i]['end_ms'] = entryListLocal[i+1]['end_ms']
287
+ entryListLocal[i]['end_ms_buffered'] = entryListLocal[i+1]['end_ms_buffered']
288
+ entryListLocal[i]['duration_ms'] = int(entryListLocal[i+1]['end_ms']) - int(entryListLocal[i]['start_ms'])
289
+ entryListLocal[i]['duration_ms_buffered'] = int(entryListLocal[i+1]['end_ms_buffered']) - int(entryListLocal[i]['start_ms_buffered'])
290
+ entryListLocal[i]['srt_timestamps_line'] = entryListLocal[i]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i+1]['srt_timestamps_line'].split(' --> ')[1]
291
+ del entryListLocal[i+1]
292
+
293
+ def combine_with_prev():
294
+ entryListLocal[i-1]['text'] = entryListLocal[i-1]['text'] + ' ' + entryListLocal[i]['text']
295
+ entryListLocal[i-1]['translated_text'] = entryListLocal[i-1]['translated_text'] + ' ' + entryListLocal[i]['translated_text']
296
+ entryListLocal[i-1]['end_ms'] = entryListLocal[i]['end_ms']
297
+ entryListLocal[i-1]['end_ms_buffered'] = entryListLocal[i]['end_ms_buffered']
298
+ entryListLocal[i-1]['duration_ms'] = int(entryListLocal[i]['end_ms']) - int(entryListLocal[i-1]['start_ms'])
299
+ entryListLocal[i-1]['duration_ms_buffered'] = int(entryListLocal[i]['end_ms_buffered']) - int(entryListLocal[i-1]['start_ms_buffered'])
300
+ entryListLocal[i-1]['srt_timestamps_line'] = entryListLocal[i-1]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i]['srt_timestamps_line'].split(' --> ')[1]
301
+ del entryListLocal[i]
302
+
+ # Choose whether to consider the next and previous entries; if neither qualifies, continue to the next loop iteration
+ if data['char_rate'] > charRateGoal:
+     # Check that the next/previous rates are lower than the current rate, that the combined entry is not too long, and that the gap between entries is not too large.
+     # The considerNext/considerPrev checks must come first, because running the other checks when there is no next/prev value to check would throw an error.
+     if not considerNext or not nextDiff or nextDiff < 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
+         considerNext = False
+     try:
+         if not considerPrev or not prevDiff or prevDiff < 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
+             considerPrev = False
+     except TypeError:
+         considerPrev = False
+
+ elif data['char_rate'] < charRateGoal:
+     # Check that the next/previous rates are higher than the current rate
+     if not considerNext or not nextDiff or nextDiff > 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
+         considerNext = False
+     try:
+         if not considerPrev or not prevDiff or prevDiff > 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
+             considerPrev = False
+     except TypeError:
+         considerPrev = False
+ else:
+     continue
+
+ # Continue to the next loop iteration if neither neighbor is considered
+ if not considerNext and not considerPrev:
+     continue
+
+ # Should only reach this point if two entries are to be combined
+ if data['char_rate'] > charRateGoal:
+     # If both are to be considered, choose the one with the lower char_rate
+     if considerNext and considerPrev:
+         if nextDiff < prevDiff:
+             combine_with_next()
+         else:
+             combine_with_prev()
+     # If only one is to be considered, combine with that one
+     elif considerNext:
+         combine_with_next()
+     elif considerPrev:
+         combine_with_prev()
+     else:
+         print(f"Error U: Should not reach this point! Current entry = {i}")
+         print(f"Current Entry Text = {data['text']}")
+         continue
+     noMorePossibleCombines = False
+     break
+
+ elif data['char_rate'] < charRateGoal:
+     # If both are to be considered, choose the one with the higher char_rate
+     if considerNext and considerPrev:
+         if nextDiff > prevDiff:
+             combine_with_next()
+         else:
+             combine_with_prev()
+     # If only one is to be considered, combine with that one
+     elif considerNext:
+         combine_with_next()
+     elif considerPrev:
+         combine_with_prev()
+     else:
+         print(f"Error L: Should not reach this point! Index = {i}")
+         print(f"Current Entry Text = {data['text']}")
+         continue
+     noMorePossibleCombines = False
+     break
+
+ return entryListLocal, noMorePossibleCombines
+
+ #-- End of combine_single_pass --
+
+ #----------------------------------------------------------------------
+
+ # Calculate the characters-per-second speaking rate for each subtitle entry
+ def calc_dict_speaking_rates(inputDict, dictKey='translated_text'):
+     tempDict = copy.deepcopy(inputDict)
+     for key, value in tempDict.items():
+         tempDict[key]['char_rate'] = round(len(value[dictKey]) / (int(value['duration_ms']) / 1000), 2)
+     return tempDict
+
+ def calc_list_speaking_rates(inputList, charRateGoal, dictKey='translated_text'):
+     tempList = copy.deepcopy(inputList)
+     for i in range(len(tempList)):
+         # Characters per second, based on the duration of the entry
+         tempList[i]['char_rate'] = round(len(tempList[i][dictKey]) / (int(tempList[i]['duration_ms']) / 1000), 2)
+         # Absolute difference between the current char_rate and the goal char_rate
+         tempList[i]['char_rate_diff'] = abs(round(tempList[i]['char_rate'] - charRateGoal, 2))
+     return tempList
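
Taken together, these helpers suggest a fixed-point loop: combine_single_pass merges at most one pair per call (it breaks out after each combine) and reports whether anything changed, so a caller presumably reapplies it, recomputing char rates in between, until noMorePossibleCombines stays True. A minimal sketch of such a driver, assuming combine_single_pass takes the entry list plus the tuning values referenced above (its real signature is defined earlier in this file and may differ):

    # Sketch only: the combine_single_pass argument list below is an assumption.
    entries = calc_list_speaking_rates(entries, charRateGoal)
    noMorePossibleCombines = False
    while not noMorePossibleCombines:
        entries, noMorePossibleCombines = combine_single_pass(entries, charRateGoal, gapThreshold, maxCharacters)
        # Merging changes text lengths and durations, so refresh the rates each pass
        entries = calc_list_speaking_rates(entries, charRateGoal)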
app/scripts/utils.py ADDED
@@ -0,0 +1,56 @@
+ import csv
+
+ # Interprets a string as a boolean. Returns True or False.
+ def parseBool(string, silent=False):
+     if isinstance(string, str):
+         if string.lower() == 'true':
+             return True
+         elif string.lower() == 'false':
+             return False
+         elif silent:
+             # In silent mode, return unrecognized strings unchanged instead of raising
+             return string
+         else:
+             raise ValueError(f'Invalid value "{string}". Must be "True" or "False"')
+     elif isinstance(string, bool):
+         return string
+     else:
+         raise ValueError('Not a valid boolean string')
+
+ def parseConfigSetting(setting):
+     # Remove any quotes the user may have added in the config file
+     setting = setting.strip("\"").strip("\'")
+
+     # Check if it is a boolean
+     parsed = parseBool(setting, silent=True)
+     if isinstance(parsed, bool):
+         return parsed
+
+     # Check if it is an integer
+     try:
+         return int(setting)
+     except ValueError:
+         pass
+
+     # Otherwise return the string in lower case
+     return setting.lower()
+
+ # Returns a list of dictionaries from a CSV file, where each key is a column name
+ # and each value is that row's value for the column. Column names are taken from
+ # the first row of the file.
+ def csv_to_dict(csvFilePath):
+     with open(csvFilePath, "r", encoding='utf-8-sig') as data:
+         return list(csv.DictReader(data))
+
+ # Returns a list of strings from a txt file. Ignores empty lines and lines that start with '#'.
+ def txt_to_list(txtFilePath):
+     with open(txtFilePath, "r", encoding='utf-8-sig') as data:
+         return [line.strip() for line in data if line.strip() and not line.strip().startswith('#')]
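
For illustration, a few hypothetical calls against the helpers above (the literal values and file names are made up, not taken from the app):

    parseConfigSetting('"True"')   # -> True: quotes stripped, then parsed as a boolean
    parseConfigSetting('42')       # -> 42: parsed as an integer
    parseConfigSetting('Azure')    # -> 'azure': falls through to the lowercased string

    rows = csv_to_dict('entries.csv')     # list of {column_name: value} dicts, one per row
    langs = txt_to_list('languages.txt')  # stripped lines, minus blanks and '#' comments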
requirements.txt ADDED
@@ -0,0 +1,43 @@
+ annotated-types==0.5.0
+ anyio==3.7.1
+ azure-cognitiveservices-speech==1.30.0
+ boto3==1.28.18
+ botocore==1.31.18
+ Brotli==1.0.9
+ certifi==2023.7.22
+ cffi==1.15.1
+ charset-normalizer==3.2.0
+ click==8.0.3
+ essentials==1.1.4
+ fastapi==0.100.1
+ h11==0.14.0
+ idna==3.4
+ jmespath==1.0.1
+ langcodes==3.3.0
+ language-data==1.1
+ marisa-trie==0.7.8
+ mutagen==1.46.0
+ numpy==1.25.2
+ pycparser==2.21
+ pycryptodomex==3.18.0
+ pydantic==2.1.1
+ pydantic_core==2.4.0
+ pydub==0.25.1
+ pyrubberband==0.3.0
+ PySoundFile==0.9.0.post1
+ python-dateutil==2.8.2
+ python-dotenv==0.19.2
+ python-multipart==0.0.6
+ regex==2023.6.3
+ requests==2.31.0
+ s3transfer==0.6.1
+ six==1.16.0
+ sniffio==1.3.0
+ soundfile==0.12.1
+ starlette==0.27.0
+ typing_extensions==4.7.1
+ urllib3==1.26.16
+ uvicorn==0.23.2
+ websockets==11.0.3
+ yt-dlp==2023.7.6
+ youtube_transcript_api