badal committed

Commit 2f2406a · 0 Parent(s)

feat: initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
+ .idea
+ .ipynb_checkpoints
+ .mypy_cache
+ .vscode
+ __pycache__
+ .pytest_cache
+ htmlcov
+ dist
+ site
+ .coverage
+ coverage.xml
+ .netlify
+ test.db
+ log.txt
+ Pipfile.lock
+ env3.*
+ env
+ docs_build
+ site_build
+ venv
+ docs.zip
+ archive.zip
+ openssl-1.1.1u
+ logs
+ run.sh
+ # vim temporary files
+ *~
+ .*.sw?
+ .cache
Dockerfile ADDED
@@ -0,0 +1,41 @@
+ FROM python:latest
+
+ RUN mkdir -p /code
+ RUN chmod 777 /code
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN apt-get update && apt-get upgrade -y
+ RUN apt-get install ffmpeg -y
+ RUN apt-get install git -y
+
+ RUN apt-get install -y \
+     build-essential \
+     libssl-dev \
+     ca-certificates \
+     libasound2 \
+     wget
+
+ # Download OpenSSL source, compile, and install it
+ RUN wget -O - https://www.openssl.org/source/openssl-1.1.1u.tar.gz | tar zxf -
+ WORKDIR openssl-1.1.1u
+ RUN ./config --prefix=/usr/local
+ RUN make -j $(nproc)
+ RUN make install_sw install_ssldirs
+ RUN ldconfig -v
+
+ # Set environment variables
+ ENV SSL_CERT_DIR=/etc/ssl/certs
+ ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+ WORKDIR /code
+
+ RUN pip install --upgrade pip
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY ./app /code/app
+
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Vidverse
+ emoji: 🚀
+ colorFrom: green
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/SSML_Customization/Examples.xlsx ADDED
Binary file (11.6 kB)
app/SSML_Customization/Manual_Translations.csv ADDED
@@ -0,0 +1 @@
+ Original Text,Translated Text,Language Code
app/SSML_Customization/Phoneme_Pronunciation.csv ADDED
@@ -0,0 +1 @@
+ Text,Phonetic Pronunciation,Case Sensitive (True/False),Phonetic Alphabet
app/SSML_Customization/READ THIS.txt ADDED
@@ -0,0 +1,46 @@
+ This folder contains the following pronunciation customization files by default.
+
+ • dont_translate_phrases.txt
+     - You can add a list of phrases or words you do not want to be translated.
+     - This will work for both Google Translate and DeepL.
+
+ • interpret-as.csv (Azure Only)
+     - You can use SSML parameters to customize how specific words or phrases are pronounced.
+     - See this article for documentation: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#say-as-element
+     - Note: The script will match the phrases in the TRANSLATED text. You may therefore wish to also add these phrases to 'dont_translate_phrases.txt'.
+     - The first row contains the titles of each column - Do not change anything in the first row!
+     - Descriptions of each column:
+         • Text: The word or phrase that will be pronounced how you specify, if it is found in the text to be spoken
+         • interpret-as Type: The way in which the word/phrase will be pronounced. See documentation link above. (Some examples include: characters, cardinal, ordinal)
+         • Case Sensitive (True/False): Whether to only modify the pronunciation if the word/phrase matches exactly, including upper/lower case
+         • Format (Optional): Only applicable to some types, such as 'date', 'time', and others. Otherwise leave blank. See documentation link above for details
+     - See 'Example - interpret-as.csv' for an example of how to use this file
+     - This will only apply if using Azure TTS, not Google
+
+ • aliases.csv (Azure Only)
+     - Lets you effectively change what should be spoken instead of a certain word or phrase
+     - Example: If the text to be spoken contains "BTW", you can have it say "by the way"
+     - Note: It does NOT actually replace the text; it only changes how the voice will pronounce it
+     - The first row contains the titles of each column - Do not change anything in the first row!
+     - Description of each column:
+         - Original Text: The original word or phrase to match
+         - Alias: The word or phrase to speak instead of the original text
+         - Case Sensitive (True/False): Whether it must be an exact match, including upper/lower case. If nothing is entered, defaults to False
+     - This will only apply if using Azure TTS, not Google
+
+ • Manual_Translations.csv
+     - If you know you are going to use a word that gets incorrectly interpreted or translated, you can enter manual translations for any words in any language
+     - In Manual_Translations.csv, put the original text in the first column, your translation in the second, and the 2-letter language code for that entry in the third column
+
+
+ • url_list.txt
+     - If you have any URLs in the original text, you can put them as a list in this file
+     - This ensures the URL will not be translated, and also improves its pronunciation in the TTS stage
+     - It will only work on basic URLs, such as "example.com/test"; if a URL contains anything other than slashes, periods, and colons, it won't work
+     - See the notes at the top of the url_list.txt file for more details
+
+ • Phoneme_Pronunciation.csv
+     - Allows you to specify the exact phonetic pronunciation of words or phrases
+     - Note: This is different from 'aliases'. Using this requires using special phonetic alphabets (see links below)
+     - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-ssml-phonetic-sets
+     - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#phoneme-element
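
To make the interpret-as.csv format concrete, here is a minimal sketch of how such a file could be loaded and applied; it mirrors the approach used in app/scripts/TTS.py later in this commit, and the "NDA" row is a hypothetical example, not part of the repository.

import csv
import re

def load_interpret_as(path="interpret-as.csv"):
    # Each row: Text, interpret-as Type, Case Sensitive (True/False), Format (Optional)
    with open(path, newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))

def apply_interpret_as(text, entries):
    for entry in entries:
        attrs = f'interpret-as="{entry["interpret-as Type"]}"'
        if entry["Format (Optional)"]:
            attrs += f' format="{entry["Format (Optional)"]}"'
        flags = 0 if entry["Case Sensitive (True/False)"].lower() == "true" else re.IGNORECASE
        # Wrap whole-word matches of the phrase in a say-as tag.
        pattern = rf'(\b{re.escape(entry["Text"])}\b)'
        text = re.sub(pattern, rf'<say-as {attrs}>\1</say-as>', text, flags=flags)
    return text

# Hypothetical row: spell out "NDA" letter by letter.
entries = [{"Text": "NDA", "interpret-as Type": "characters",
            "Case Sensitive (True/False)": "True", "Format (Optional)": ""}]
print(apply_interpret_as("Please sign the NDA today.", entries))
# Please sign the <say-as interpret-as="characters">NDA</say-as> today.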
app/SSML_Customization/aliases.csv ADDED
@@ -0,0 +1 @@
+ Original Text,Alias,Case Sensitive (True/False)
app/SSML_Customization/dont_translate_phrases.txt ADDED
@@ -0,0 +1,3 @@
+ # Add one word or phrase per line that you do not want to be translated. The original word will be left as-is in the translated srt files.
+ # Don't include punctuation. This list will NOT be case sensitive
+ # Lines beginning with a # will be ignored
app/SSML_Customization/interpret-as.csv ADDED
@@ -0,0 +1 @@
+ Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
app/SSML_Customization/url_list.txt ADDED
@@ -0,0 +1,4 @@
+ # List any URLs that may appear in the original text, such as "google.com/example"
+ # This ensures they will not be translated, and will be spoken as words in the TTS stage
+ # Example: "google.com/example" becomes "google dot com slash example", which spoken in Spanish would be "google punto c o m diagonal example"
+ # The actual text in the subtitles will remain as "google.com/example", and only the spoken audio will change
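
For illustration, the snippet below runs the same say-as tagging regex that app/scripts/TTS.py (later in this commit) applies to entries from this file, so you can see what the TTS engine actually receives for a basic URL.

import re

# Wrap the TLD, dots, slashes, and colons in say-as "characters" tags so the
# voice spells them out instead of reading the URL as one word.
punctuation_regex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
url = "example.com/test"
tagged = re.sub(punctuation_regex, r'<say-as interpret-as="characters">\1</say-as>', url)
print(tagged)
# example<say-as interpret-as="characters">.com/</say-as>test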
app/__init__.py ADDED
@@ -0,0 +1 @@
+ from . import scripts
app/captioning/__init__.py ADDED
@@ -0,0 +1 @@
+ from .captioning import generate_sub
app/captioning/caption_helper.py ADDED
@@ -0,0 +1,156 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ from datetime import date, datetime, time, timedelta
+ from typing import List, Optional, Tuple
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import helper
+
+ class Caption(object) :
+     def __init__(self, language : Optional[str], sequence : int, begin : time, end : time, text : str) :
+         self.language = language
+         self.sequence = sequence
+         self.begin = begin
+         self.end = end
+         self.text = text
+
+ def get_captions(language : Optional[str], max_width : int, max_height : int, results : List[dict]) -> List[Caption] :
+     caption_helper = CaptionHelper(language, max_width, max_height, results)
+     return caption_helper.get_captions()
+
+ class CaptionHelper(object) :
+     def __init__(self, language : Optional[str], max_width : int, max_height : int, results : List[speechsdk.RecognitionResult]) :
+         self._language = language
+         self._max_width = max_width
+         self._max_height = max_height
+         self._results = results
+
+         self._first_pass_terminators = ["?", "!", ",", ";"]
+         self._second_pass_terminators = [" ", "."]
+
+         self._captions : List[Caption] = []
+
+         # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
+         if self._language is not None :
+             iso639 = self._language.split('-')[0]
+             if "zh" == iso639.lower() :
+                 self._first_pass_terminators = ["，", "、", "；", "？", "！", "?", "!", ",", ";"]
+                 self._second_pass_terminators = ["。", " "]
+                 if (helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width) :
+                     self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS
+
+     def get_captions(self) -> List[Caption] :
+         self.ensure_captions()
+         return self._captions
+
+     def ensure_captions(self) -> None :
+         if not self._captions :
+             self.add_captions_for_all_results()
+
+     def add_captions_for_all_results(self) -> None :
+         for result in self._results :
+             if result.offset <= 0 or not self.is_final_result(result) :
+                 continue
+             text = self.get_text_or_translation(result)
+             if not text :
+                 continue
+             self.add_captions_for_final_result(result, text)
+
+     def get_text_or_translation(self, result : speechsdk.RecognitionResult) -> Optional[str] :
+         return result.text
+
+         # 20220921 We do not use this for now because this sample
+         # does not handle TranslationRecognitionResults.
+         #if not self._language :
+         #    return result.text
+         #if type(result) is speechsdk.TranslationRecognitionResult and self._language in result.Translations :
+         #    return result.Translations[self._language]
+         #else :
+         #    return None
+
+     def add_captions_for_final_result(self, result : speechsdk.RecognitionResult, text : str) -> None :
+         caption_starts_at = 0
+         caption_lines : List[str] = []
+         index = 0
+         while (index < len(text)) :
+             index = self.skip_skippable(text, index)
+
+             line_length = self.get_best_width(text, index)
+             caption_lines.append(text[index:index + line_length].strip())
+             index += line_length
+
+             is_last_caption = index >= len(text)
+             max_caption_lines = len(caption_lines) >= self._max_height
+
+             add_caption = is_last_caption or max_caption_lines
+
+             if add_caption :
+                 caption_text = '\n'.join(caption_lines)
+                 caption_lines.clear()
+
+                 caption_sequence = len(self._captions) + 1
+                 is_first_caption = 0 == caption_starts_at
+
+                 caption_begin_and_end : Tuple[time, time]
+                 if is_first_caption and is_last_caption :
+                     caption_begin_and_end = self.get_full_caption_result_timing(result)
+                 else :
+                     caption_begin_and_end = self.get_partial_result_caption_timing(result, text, caption_text, caption_starts_at, index - caption_starts_at)
+
+                 self._captions.append(Caption(self._language, caption_sequence, caption_begin_and_end[0], caption_begin_and_end[1], caption_text))
+
+                 caption_starts_at = index
+
+     def get_best_width(self, text : str, start_index : int) -> int :
+         remaining = len(text) - start_index
+         best_width = remaining if remaining < self._max_width else self.find_best_width(self._first_pass_terminators, text, start_index)
+         if (best_width < 0) :
+             best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
+         if best_width < 0 :
+             best_width = self._max_width
+         return best_width
+
+     def find_best_width(self, terminators : List[str], text : str, start_at : int) -> int :
+         remaining = len(text) - start_at
+         check_chars = min(remaining, self._max_width)
+         best_width = -1
+         for terminator in terminators :
+             index = text.rfind(terminator, start_at, start_at + check_chars)
+             width = index - start_at
+             if width > best_width :
+                 best_width = width + len(terminator)
+         return best_width
+
+     def skip_skippable(self, text : str, start_index : int) -> int :
+         index = start_index
+         while len(text) > index and ' ' == text[index] :
+             index += 1
+         return index
+
+     def get_full_caption_result_timing(self, result : speechsdk.RecognitionResult) -> Tuple[time, time] :
+         begin = helper.time_from_ticks(result.offset)
+         end = helper.time_from_ticks(result.offset + result.duration)
+         return (begin, end)
+
+     def get_partial_result_caption_timing(self, result : speechsdk.RecognitionResult, text : str, caption_text : str, caption_starts_at : int, caption_length : int) -> Tuple[time, time] :
+         (result_begin, result_end) = self.get_full_caption_result_timing(result)
+         result_duration = helper.subtract_times(result_end, result_begin)
+         text_length = len(text)
+         partial_begin = helper.add_time_and_timedelta(result_begin, result_duration * caption_starts_at / text_length)
+         partial_end = helper.add_time_and_timedelta(result_begin, result_duration * (caption_starts_at + caption_length) / text_length)
+         return (partial_begin, partial_end)
+
+     def is_final_result(self, result : speechsdk.RecognitionResult) -> bool :
+         return speechsdk.ResultReason.RecognizedSpeech == result.reason or speechsdk.ResultReason.RecognizedIntent == result.reason or speechsdk.ResultReason.TranslatedSpeech == result.reason
+
+     def lines_from_text(self, text : str) -> List[str] :
+         retval : List[str] = []
+         index = 0
+         while (index < len(text)) :
+             index = self.skip_skippable(text, index)
+             line_length = self.get_best_width(text, index)
+             retval.append(text[index:index + line_length].strip())
+             index += line_length
+         return retval
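
The line-breaking logic can be exercised in isolation: lines_from_text only needs the language, width, height, and an empty results list, so no recognition session is required. A minimal sketch, assuming the app package and the azure-cognitiveservices-speech dependency are importable:

from app.captioning.caption_helper import CaptionHelper

# No recognition results are needed just to split text into caption lines.
helper = CaptionHelper(language="en-US", max_width=37, max_height=2, results=[])
text = "This is a long sentence that will not fit on a single caption line, so it gets split."
for line in helper.lines_from_text(text):
    print(line)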
app/captioning/captioning.py ADDED
@@ -0,0 +1,370 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ # Notes:
+ # - Install the Speech SDK. Run:
+ #   pip install azure-cognitiveservices-speech
+ # - The Python Speech SDK on Windows requires the Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017, 2019, or 2022 on the system. See:
+ #   https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstarts/setup-platform
+ # - Install gstreamer:
+ #   https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-use-codec-compressed-audio-input-streams
+
+ from datetime import datetime, time, timezone, timedelta
+ from itertools import groupby, pairwise
+ from os import linesep, remove, environ
+ from os.path import exists
+ from pathlib import Path
+ from sys import argv
+ from time import sleep
+ from typing import Any, List, Optional
+ import wave
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import caption_helper
+ from . import helper
+ from . import user_config_helper
+
+ USAGE = """Usage: python captioning.py [...]
+
+   HELP
+     --help                      Show this help and stop.
+
+   CONNECTION
+     --key KEY                   Your Azure Speech service resource key.
+                                 Overrides the SPEECH_KEY environment variable. You must set the environment variable (recommended) or use the `--key` option.
+     --region REGION             Your Azure Speech service region.
+                                 Overrides the SPEECH_REGION environment variable. You must set the environment variable (recommended) or use the `--region` option.
+                                 Examples: westus, eastus
+
+   LANGUAGE
+     --language LANG1            Specify language. This is used when breaking captions into lines.
+                                 Default value is en-US.
+                                 Examples: en-US, ja-JP
+
+   INPUT
+     --input FILE                Input audio from file (default input is the microphone.)
+     --format FORMAT             Use compressed audio format.
+                                 If this is not present, uncompressed format (wav) is assumed.
+                                 Valid only with --input.
+                                 Valid values: alaw, any, flac, mp3, mulaw, ogg_opus
+
+   MODE
+     --offline                   Output offline results.
+                                 Overrides --realTime.
+     --realTime                  Output real-time results.
+                                 Default output mode is offline.
+
+   ACCURACY
+     --phrases "PHRASE1;PHRASE2" Example: "Contoso;Jessie;Rehaan"
+
+   OUTPUT
+     --output FILE               Output captions to FILE.
+     --srt                       Output captions in SubRip Text format (default format is WebVTT.)
+     --maxLineLength LENGTH      Set the maximum number of characters per line for a caption to LENGTH.
+                                 Minimum is 20. Default is 37 (30 for Chinese).
+     --lines LINES               Set the number of lines for a caption to LINES.
+                                 Minimum is 1. Default is 2.
+     --delay MILLISECONDS        How many MILLISECONDS to delay the appearance of each caption.
+                                 Minimum is 0. Default is 1000.
+     --remainTime MILLISECONDS   How many MILLISECONDS a caption should remain on screen if it is not replaced by another.
+                                 Minimum is 0. Default is 1000.
+     --quiet                     Suppress console output, except errors.
+     --profanity OPTION          Valid values: raw, remove, mask
+                                 Default is mask.
+     --threshold NUMBER          Set stable partial result threshold.
+                                 Default is 3.
+ """
+
+ class Captioning(object) :
+     def __init__(self, language, input_audio, output) :
+         # self._user_config = user_config_helper.user_config_from_args(USAGE)
+         self._user_config = {
+             "language": language,
+             "captioning_mode": user_config_helper.CaptioningMode.OFFLINE, # or REALTIME if you prefer real-time mode
+             "input_file": input_audio,
+             "output_file": output,
+             "use_sub_rip_text_caption_format": True,
+             "use_compressed_audio": False,
+             "compressed_audio_format": speechsdk.AudioStreamContainerFormat.ANY,
+             "subscription_key" : environ.get("SPEECH_KEY"),
+             "region" : environ.get("SPEECH_REGION"),
+             "profanity_option" : speechsdk.ProfanityOption.Masked,
+             "phrases" : ["Contoso", "Jessie", "Rehaan"], # a list, so each entry is added to the phrase list grammar as a whole phrase
+             "suppress_console_output" : True,
+             "remain_time" : timedelta(milliseconds=1000),
+             "delay" : timedelta(milliseconds=1000),
+             "max_line_length" : helper.DEFAULT_MAX_LINE_LENGTH_SBCS,
+             "lines" : 2,
+             "stable_partial_result_threshold" : "3",
+         }
+         self._srt_sequence_number = 1
+         self._previous_caption : Optional[caption_helper.Caption] = None
+         self._previous_end_time : Optional[time] = None
+         self._previous_result_is_recognized = False
+         self._recognized_lines : List[str] = []
+         self._offline_results : List[speechsdk.SpeechRecognitionResult] = []
+
+     def get_timestamp(self, start : time, end : time) -> str :
+         time_format = ""
+         if self._user_config["use_sub_rip_text_caption_format"] :
+             # SRT format requires ',' as decimal separator rather than '.'.
+             time_format = "%H:%M:%S,%f"
+         else :
+             time_format = "%H:%M:%S.%f"
+         # Truncate microseconds to milliseconds.
+         return "{} --> {}".format(start.strftime(time_format)[:-3], end.strftime(time_format)[:-3])
+
+     def string_from_caption(self, caption : caption_helper.Caption) -> str :
+         retval = ""
+         if self._user_config["use_sub_rip_text_caption_format"] :
+             retval += str(caption.sequence) + linesep
+         retval += self.get_timestamp(caption.begin, caption.end) + linesep
+         retval += caption.text + linesep + linesep
+         return retval
+
+     def adjust_real_time_caption_text(self, text : str, is_recognized_result : bool) -> str :
+         # Split the caption text into multiple lines based on max_line_length and lines.
+         temp_caption_helper = caption_helper.CaptionHelper(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], [])
+         lines = temp_caption_helper.lines_from_text(text)
+
+         # Recognizing results can change with each new result, so we do not save previous Recognizing results.
+         # Recognized results are final, so we save them in a member value.
+         recognizing_lines : List[str] = []
+         if is_recognized_result :
+             self._recognized_lines = self._recognized_lines + lines
+         else :
+             recognizing_lines = lines
+
+         caption_lines = self._recognized_lines + recognizing_lines
+         return '\n'.join(caption_lines[-self._user_config["lines"]:])
+
+     def caption_from_real_time_result(self, result : speechsdk.SpeechRecognitionResult, is_recognized_result : bool) -> Optional[str] :
+         retval : Optional[str] = None
+
+         start_time = helper.time_from_ticks(result.offset)
+         end_time = helper.time_from_ticks(result.offset + result.duration)
+
+         # If the end timestamp for the previous result is later
+         # than the end timestamp for this result, drop the result.
+         # This sometimes happens when we receive a lot of Recognizing results close together.
+         if self._previous_end_time is not None and self._previous_end_time > end_time :
+             pass
+         else :
+             # Record the end timestamp for this result.
+             self._previous_end_time = end_time
+
+             # Convert the SpeechRecognitionResult to a caption.
+             # We are not ready to set the text for this caption.
+             # First we need to determine whether to clear _recognizedLines.
+             caption = caption_helper.Caption(self._user_config["language"], self._srt_sequence_number, helper.add_time_and_timedelta(start_time, self._user_config["delay"]), helper.add_time_and_timedelta(end_time, self._user_config["delay"]), "")
+             # Increment the sequence number.
+             self._srt_sequence_number += 1
+
+             # If we have a previous caption...
+             if self._previous_caption is not None :
+                 # If the previous result was type Recognized...
+                 if self._previous_result_is_recognized :
+                     # Set the end timestamp for the previous caption to the earliest of:
+                     # - The end timestamp for the previous caption plus the remain time.
+                     # - The start timestamp for the current caption.
+                     previous_end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
+                     self._previous_caption.end = previous_end if previous_end < caption.begin else caption.begin
+                     # If the gap between the original end timestamp for the previous caption
+                     # and the start timestamp for the current caption is larger than remainTime,
+                     # clear the cached recognized lines.
+                     # Note this needs to be done before we call AdjustRealTimeCaptionText
+                     # for the current caption, because it uses _recognizedLines.
+                     if previous_end < caption.begin :
+                         self._recognized_lines.clear()
+                 # If the previous result was type Recognizing, simply set the start timestamp
+                 # for the current caption to the end timestamp for the previous caption.
+                 # Note this presumes there will not be a large gap between Recognizing results,
+                 # because such a gap would cause the previous Recognizing result to be succeeded
+                 # by a Recognized result.
+                 else :
+                     caption.begin = self._previous_caption.end
+
+                 retval = self.string_from_caption(self._previous_caption)
+
+             # Break the caption text into lines if needed.
+             caption.text = self.adjust_real_time_caption_text(result.text, is_recognized_result)
+             # Save the current caption as the previous caption.
+             self._previous_caption = caption
+             # Save the result type as the previous result type.
+             self._previous_result_is_recognized = is_recognized_result
+
+         return retval
+
+     def captions_from_offline_results(self) -> List[caption_helper.Caption] :
+         captions = caption_helper.get_captions(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], list(self._offline_results))
+         # Save the last caption.
+         last_caption = captions[-1]
+         last_caption.end = helper.add_time_and_timedelta(last_caption.end, self._user_config["remain_time"])
+         # In offline mode, all captions come from RecognitionResults of type Recognized.
+         # Set the end timestamp for each caption to the earliest of:
+         # - The end timestamp for this caption plus the remain time.
+         # - The start timestamp for the next caption.
+         captions_2 : List[caption_helper.Caption] = []
+         for (caption_1, caption_2) in pairwise(captions) :
+             end = helper.add_time_and_timedelta(caption_1.end, self._user_config["remain_time"])
+             caption_1.end = end if end < caption_2.begin else caption_2.begin
+             captions_2.append(caption_1)
+         # Re-add the last caption.
+         captions_2.append(last_caption)
+         return captions_2
+
+     def finish(self) -> None :
+         if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
+             for caption in self.captions_from_offline_results() :
+                 helper.write_to_console_or_file(text=self.string_from_caption(caption), user_config=self._user_config)
+         elif user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
+             # Show the last "previous" caption, which is actually the last caption.
+             if self._previous_caption is not None :
+                 self._previous_caption.end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
+                 helper.write_to_console_or_file(text=self.string_from_caption(self._previous_caption), user_config=self._user_config)
+
+     def initialize(self) :
+         if self._user_config["output_file"] is not None and exists(self._user_config["output_file"]) :
+             remove(self._user_config["output_file"])
+         if not self._user_config["use_sub_rip_text_caption_format"] :
+             helper.write_to_console_or_file(text="WEBVTT{}{}".format(linesep, linesep), user_config=self._user_config)
+         return
+
+     def audio_config_from_user_config(self) -> helper.Read_Only_Dict :
+         if self._user_config["input_file"] is None :
+             return helper.Read_Only_Dict({
+                 "audio_config" : speechsdk.AudioConfig(use_default_microphone=True),
+                 "audio_stream_format" : None,
+                 "pull_input_audio_stream_callback" : None,
+                 "pull_input_audio_stream" : None
+             })
+         else :
+             audio_stream_format = None
+             if not self._user_config["use_compressed_audio"] :
+                 reader = wave.open(self._user_config["input_file"], mode=None)
+                 audio_stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=reader.getframerate(), bits_per_sample=reader.getsampwidth() * 8, channels=reader.getnchannels())
+                 reader.close()
+             else :
+                 audio_stream_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=self._user_config["compressed_audio_format"])
+             callback = helper.BinaryFileReaderCallback(filename=self._user_config["input_file"])
+             stream = speechsdk.audio.PullAudioInputStream(pull_stream_callback=callback, stream_format=audio_stream_format)
+             # We return the BinaryFileReaderCallback, AudioStreamFormat, and PullAudioInputStream
+             # because we need to keep them in scope until they are actually used.
+             return helper.Read_Only_Dict({
+                 "audio_config" : speechsdk.audio.AudioConfig(stream=stream),
+                 "audio_stream_format" : audio_stream_format,
+                 "pull_input_audio_stream_callback" : callback,
+                 "pull_input_audio_stream" : stream,
+             })
+
+     def speech_config_from_user_config(self) -> speechsdk.SpeechConfig :
+         speech_config = speechsdk.SpeechConfig(subscription=self._user_config["subscription_key"], region=self._user_config["region"])
+
+         speech_config.set_profanity(self._user_config["profanity_option"])
+
+         if self._user_config["stable_partial_result_threshold"] is not None :
+             speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_StablePartialResultThreshold, value=self._user_config["stable_partial_result_threshold"])
+
+         speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_PostProcessingOption, value="TrueText")
+         speech_config.speech_recognition_language = self._user_config["language"]
+
+         return speech_config
+
+     def speech_recognizer_from_user_config(self) -> helper.Read_Only_Dict :
+         audio_config_data = self.audio_config_from_user_config()
+         speech_config = self.speech_config_from_user_config()
+         speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config_data["audio_config"])
+
+         if len(self._user_config["phrases"]) > 0 :
+             grammar = speechsdk.PhraseListGrammar.from_recognizer(recognizer=speech_recognizer)
+             for phrase in self._user_config["phrases"] :
+                 grammar.addPhrase(phrase)
+
+         return helper.Read_Only_Dict({
+             "speech_recognizer" : speech_recognizer,
+             "audio_stream_format" : audio_config_data["audio_stream_format"],
+             "pull_input_audio_stream_callback" : audio_config_data["pull_input_audio_stream_callback"],
+             "pull_input_audio_stream" : audio_config_data["pull_input_audio_stream"],
+         })
+
+     def recognize_continuous(self, speech_recognizer : speechsdk.SpeechRecognizer, format : speechsdk.audio.AudioStreamFormat, callback : helper.BinaryFileReaderCallback, stream : speechsdk.audio.PullAudioInputStream) :
+         done = False
+
+         def recognizing_handler(e : speechsdk.SpeechRecognitionEventArgs) :
+             if speechsdk.ResultReason.RecognizingSpeech == e.result.reason and len(e.result.text) > 0 :
+                 # This seems to be the only way we can get information about
+                 # exceptions raised inside an event handler.
+                 try :
+                     caption = self.caption_from_real_time_result(e.result, False)
+                     if caption is not None :
+                         helper.write_to_console_or_file(text=caption, user_config=self._user_config)
+                 except Exception as ex :
+                     print('Exception in recognizing_handler: {}'.format(ex))
+             elif speechsdk.ResultReason.NoMatch == e.result.reason :
+                 helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)
+
+         def recognized_handler(e : speechsdk.SpeechRecognitionEventArgs) :
+             if speechsdk.ResultReason.RecognizedSpeech == e.result.reason and len(e.result.text) > 0 :
+                 try :
+                     if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
+                         self._offline_results.append(e.result)
+                     else :
+                         caption = self.caption_from_real_time_result(e.result, True)
+                         if caption is not None :
+                             helper.write_to_console_or_file(text=caption, user_config=self._user_config)
+                 except Exception as ex :
+                     print('Exception in recognized_handler: {}'.format(ex))
+             elif speechsdk.ResultReason.NoMatch == e.result.reason :
+                 helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)
+
+         def canceled_handler(e : speechsdk.SpeechRecognitionCanceledEventArgs) :
+             nonlocal done
+             # Notes:
+             # SpeechRecognitionCanceledEventArgs inherits the result property from SpeechRecognitionEventArgs. See:
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioncanceledeventargs
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioneventargs
+             # result is type SpeechRecognitionResult, which inherits the reason property from RecognitionResult. See:
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitionresult
+             # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult
+             # e.result.reason is ResultReason.Canceled. To get the cancellation reason, see e.cancellation_details.reason.
+             if speechsdk.CancellationReason.EndOfStream == e.cancellation_details.reason :
+                 helper.write_to_console(text="End of stream reached.{}".format(linesep), user_config=self._user_config)
+                 done = True
+             elif speechsdk.CancellationReason.CancelledByUser == e.cancellation_details.reason :
+                 helper.write_to_console(text="User canceled request.{}".format(linesep), user_config=self._user_config)
+                 done = True
+             elif speechsdk.CancellationReason.Error == e.cancellation_details.reason :
+                 # Error output should not be suppressed, even if suppress output flag is set.
+                 print("Encountered error. Cancellation details: {}{}".format(e.cancellation_details, linesep))
+                 done = True
+             else :
+                 print("Request was cancelled for an unrecognized reason. Cancellation details: {}{}".format(e.cancellation_details, linesep))
+                 done = True
+
+         def stopped_handler(e : speechsdk.SessionEventArgs) :
+             nonlocal done
+             helper.write_to_console(text="Session stopped.{}".format(linesep), user_config=self._user_config)
+             done = True
+
+         # We only use Recognizing results in real-time mode.
+         if user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
+             speech_recognizer.recognizing.connect(recognizing_handler)
+         speech_recognizer.recognized.connect(recognized_handler)
+         speech_recognizer.session_stopped.connect(stopped_handler)
+         speech_recognizer.canceled.connect(canceled_handler)
+
+         speech_recognizer.start_continuous_recognition()
+
+         while not done :
+             sleep(5)
+         speech_recognizer.stop_continuous_recognition()
+
+         return
+
+ def generate_sub(language, input_file, output_file) :
+     captioning = Captioning(language=language, input_audio=input_file, output=output_file)
+     captioning.initialize()
+     speech_recognizer_data = captioning.speech_recognizer_from_user_config()
+     captioning.recognize_continuous(speech_recognizer=speech_recognizer_data["speech_recognizer"], format=speech_recognizer_data["audio_stream_format"], callback=speech_recognizer_data["pull_input_audio_stream_callback"], stream=speech_recognizer_data["pull_input_audio_stream"])
+     captioning.finish()
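
End to end, the module is driven through generate_sub. A minimal usage sketch, assuming the SPEECH_KEY and SPEECH_REGION environment variables are set and that audio.wav is an existing WAV file (both names are placeholders):

from app.captioning import generate_sub

# Transcribes audio.wav with Azure Speech and writes SRT captions to captions.srt.
# Console output is suppressed by the hardcoded config; only the file is written.
generate_sub("en-US", "audio.wav", "captions.srt")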
app/captioning/helper.py ADDED
@@ -0,0 +1,83 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ # Note: abc = abstract base classes
+ from collections.abc import Mapping
+ from datetime import date, datetime, time, timedelta
+ from sys import argv
+ from typing import Optional
+ from pathlib import Path
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+
+ DEFAULT_MAX_LINE_LENGTH_SBCS = 37
+ DEFAULT_MAX_LINE_LENGTH_MBCS = 30
+
+ # See speech_recognize_once_compressed_input() in:
+ # https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/python/console/speech_sample.py
+ class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
+     def __init__(self, filename: str):
+         super().__init__()
+         self._file_h = open(filename, "rb")
+
+     def read(self, buffer: memoryview) -> int:
+         try:
+             size = buffer.nbytes
+             frames = self._file_h.read(size)
+             buffer[:len(frames)] = frames
+             return len(frames)
+         except Exception as ex:
+             print('Exception in `read`: {}'.format(ex))
+             raise
+
+     def close(self) -> None:
+         print('closing file')
+         try:
+             self._file_h.close()
+         except Exception as ex:
+             print('Exception in `close`: {}'.format(ex))
+             raise
+
+ class Read_Only_Dict(Mapping):
+     def __init__(self, data):
+         self._data = data
+     def __getitem__(self, key):
+         return self._data[key]
+     def __len__(self):
+         return len(self._data)
+     def __iter__(self):
+         return iter(self._data)
+
+ # See:
+ # https://stackoverflow.com/a/12448721
+ # https://stackoverflow.com/a/39651061
+ def add_time_and_timedelta(t1 : time, t2 : timedelta) -> time :
+     return (datetime.combine(date.min, t1) + t2).time()
+
+ def subtract_times(t1 : time, t2 : time) -> timedelta :
+     return datetime.combine(date.min, t1) - datetime.combine(date.min, t2)
+
+ # We cannot simply create time with ticks.
+ def time_from_ticks(ticks) -> time :
+     microseconds_1 = ticks / 10
+     microseconds_2 = microseconds_1 % 1000000
+     seconds_1 = microseconds_1 / 1000000
+     seconds_2 = seconds_1 % 60
+     minutes_1 = seconds_1 / 60
+     minutes_2 = minutes_1 % 60
+     hours = minutes_1 / 60
+     return time(int(hours), int(minutes_2), int(seconds_2), int(microseconds_2))
+
+ def write_to_console(text : str, user_config : Read_Only_Dict) :
+     if not user_config["suppress_console_output"] :
+         print(text, end = "", flush = True)
+     return
+
+ def write_to_console_or_file(text : str, user_config : Read_Only_Dict) :
+     write_to_console(text = text, user_config = user_config)
+     if user_config["output_file"] is not None :
+         file_path = Path(user_config["output_file"])
+         with open(file_path, mode = "a", newline = "", encoding='utf-8') as f :
+             f.write(text)
+     return
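
The ticks in time_from_ticks are the Speech SDK's 100-nanosecond offset units (the code divides by 10 to get microseconds), so 10,000,000 ticks is one second. A quick check, assuming the package imports cleanly:

from datetime import timedelta
from app.captioning.helper import time_from_ticks, add_time_and_timedelta

t = time_from_ticks(10_000_000)   # 10,000,000 ticks * 100 ns = 1 s
print(t)                          # 00:00:01
print(add_time_and_timedelta(t, timedelta(milliseconds=500)))  # 00:00:01.500000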
app/captioning/user_config_helper.py ADDED
@@ -0,0 +1,133 @@
+ #
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #
+
+ from datetime import timedelta
+ from enum import Enum
+ from os import linesep, environ
+ from sys import argv
+ from typing import List, Optional
+ import azure.cognitiveservices.speech as speechsdk # type: ignore
+ from . import helper
+
+ class CaptioningMode(Enum):
+     OFFLINE = 1
+     REALTIME = 2
+
+ def get_cmd_option(option : str) -> Optional[str] :
+     argc = len(argv)
+     # Match the option case-insensitively, and look up its index the same way,
+     # so index() cannot raise when the user typed a different case.
+     lowered_args = list(map(lambda arg : arg.lower(), argv))
+     if option.lower() in lowered_args :
+         index = lowered_args.index(option.lower())
+         if index < argc - 1 :
+             # We found the option (for example, "--output"), so advance from that to the value (for example, "filename").
+             return argv[index + 1]
+         else :
+             return None
+     else :
+         return None
+
+ def cmd_option_exists(option : str) -> bool :
+     return option.lower() in list(map(lambda arg : arg.lower(), argv))
+
+ def get_language() -> str :
+     retval = "en-US"
+     language = get_cmd_option("--language")
+     if language is not None :
+         retval = language
+     return retval
+
+ def get_phrases() -> List[str] :
+     retval : List[str] = []
+     phrases = get_cmd_option("--phrases")
+     if phrases is not None :
+         retval = list(map(lambda phrase : phrase.strip(), phrases.split(';')))
+     return retval
+
+ def get_compressed_audio_format() -> speechsdk.AudioStreamContainerFormat :
+     value = get_cmd_option("--format")
+     if value is None :
+         return speechsdk.AudioStreamContainerFormat.ANY
+     else :
+         value = value.lower()
+         if "alaw" == value : return speechsdk.AudioStreamContainerFormat.ALAW
+         elif "flac" == value : return speechsdk.AudioStreamContainerFormat.FLAC
+         elif "mp3" == value : return speechsdk.AudioStreamContainerFormat.MP3
+         elif "mulaw" == value : return speechsdk.AudioStreamContainerFormat.MULAW
+         elif "ogg_opus" == value : return speechsdk.AudioStreamContainerFormat.OGG_OPUS
+         else : return speechsdk.AudioStreamContainerFormat.ANY
+
+ def get_profanity_option() -> speechsdk.ProfanityOption :
+     value = get_cmd_option("--profanity")
+     if value is None :
+         return speechsdk.ProfanityOption.Masked
+     else :
+         value = value.lower()
+         if "raw" == value : return speechsdk.ProfanityOption.Raw
+         elif "remove" == value : return speechsdk.ProfanityOption.Removed
+         else : return speechsdk.ProfanityOption.Masked
+
+ def user_config_from_args(usage : str) -> helper.Read_Only_Dict :
+     keyEnv = environ["SPEECH_KEY"] if "SPEECH_KEY" in environ else None
+     keyOption = get_cmd_option("--key")
+     key = keyOption if keyOption is not None else keyEnv
+     if key is None :
+         raise RuntimeError("Please set the SPEECH_KEY environment variable or provide a Speech resource key with the --key option.{}{}".format(linesep, usage))
+
+     regionEnv = environ["SPEECH_REGION"] if "SPEECH_REGION" in environ else None
+     regionOption = get_cmd_option("--region")
+     region = regionOption if regionOption is not None else regionEnv
+     if region is None :
+         raise RuntimeError("Please set the SPEECH_REGION environment variable or provide a Speech resource region with the --region option.{}{}".format(linesep, usage))
+
+     captioning_mode = CaptioningMode.REALTIME if cmd_option_exists("--realtime") and not cmd_option_exists("--offline") else CaptioningMode.OFFLINE
+
+     td_remain_time = timedelta(milliseconds=1000)
+     s_remain_time = get_cmd_option("--remainTime")
+     if s_remain_time is not None :
+         int_remain_time = float(s_remain_time)
+         if int_remain_time < 0 :
+             int_remain_time = 1000
+         td_remain_time = timedelta(milliseconds=int_remain_time)
+
+     td_delay = timedelta(milliseconds=1000)
+     s_delay = get_cmd_option("--delay")
+     if s_delay is not None :
+         int_delay = float(s_delay)
+         if int_delay < 0 :
+             int_delay = 1000
+         td_delay = timedelta(milliseconds=int_delay)
+
+     int_max_line_length = helper.DEFAULT_MAX_LINE_LENGTH_SBCS
+     s_max_line_length = get_cmd_option("--maxLineLength")
+     if s_max_line_length is not None :
+         int_max_line_length = int(s_max_line_length)
+         if int_max_line_length < 20 :
+             int_max_line_length = 20
+
+     int_lines = 2
+     s_lines = get_cmd_option("--lines")
+     if s_lines is not None :
+         int_lines = int(s_lines)
+         if int_lines < 1 :
+             int_lines = 2
+
+     return helper.Read_Only_Dict({
+         "use_compressed_audio" : cmd_option_exists("--format"),
+         "compressed_audio_format" : get_compressed_audio_format(),
+         "profanity_option" : get_profanity_option(),
+         "language" : get_language(),
+         "input_file" : get_cmd_option("--input"),
+         "output_file" : get_cmd_option("--output"),
+         "phrases" : get_phrases(),
+         "suppress_console_output" : cmd_option_exists("--quiet"),
+         "captioning_mode" : captioning_mode,
+         "remain_time" : td_remain_time,
+         "delay" : td_delay,
+         "use_sub_rip_text_caption_format" : cmd_option_exists("--srt"),
+         "max_line_length" : int_max_line_length,
+         "lines" : int_lines,
+         "stable_partial_result_threshold" : get_cmd_option("--threshold"),
+         "subscription_key" : key,
+         "region" : region,
+     })
app/constants.py ADDED
@@ -0,0 +1,29 @@
+ MALE_LANGUAGES = {
+     "hi": ["hi-IN", "hi-IN-MadhurNeural"],    # hindi
+     "bn": ["bn-IN", "bn-IN-BashkarNeural"],   # bengali
+     "en": ["en-IN", "en-IN-PrabhatNeural"],   # english
+     "gu": ["gu-IN", "gu-IN-NiranjanNeural"],  # gujarati
+     "kn": ["kn-IN", "kn-IN-GaganNeural"],     # kannada
+     "ml": ["ml-IN", "ml-IN-MidhunNeural"],    # malayalam
+     "mr": ["mr-IN", "mr-IN-ManoharNeural"],   # marathi
+     "ta": ["ta-IN", "ta-IN-ValluvarNeural"],  # tamil
+     "te": ["te-IN", "te-IN-MohanNeural"],     # telugu
+     "ur": ["ur-IN", "ur-IN-SalmanNeural"],    # urdu
+     "de": ["de-DE", "de-DE-ConradNeural"],    # german
+     "ja": ["ja-JP", "ja-JP-KeitaNeural"],     # japanese
+ }
+
+ FEMALE_LANGUAGES = {
+     "hi": ["hi-IN", "hi-IN-SwaraNeural"],     # hindi
+     "bn": ["bn-IN", "bn-IN-TanishaaNeural"],  # bengali
+     "en": ["en-IN", "en-IN-NeerjaNeural"],    # english
+     "gu": ["gu-IN", "gu-IN-DhwaniNeural"],    # gujarati
+     "kn": ["kn-IN", "kn-IN-SapnaNeural"],     # kannada
+     "ml": ["ml-IN", "ml-IN-SobhanaNeural"],   # malayalam
+     "mr": ["mr-IN", "mr-IN-AarohiNeural"],    # marathi
+     "ta": ["ta-IN", "ta-IN-PallaviNeural"],   # tamil
+     "te": ["te-IN", "te-IN-ShrutiNeural"],    # telugu
+     "ur": ["ur-IN", "ur-IN-GulNeural"],       # urdu
+     "de": ["de-DE", "de-DE-AmalaNeural"],     # german
+     "ja": ["ja-JP", "ja-JP-NanamiNeural"],    # japanese
+ }
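
Each entry is a [locale, Azure neural voice name] pair, and this is exactly how the URL handler below resolves a voice. A quick lookup sketch:

from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES

def voice_for(lang_code, gender):
    # Pick the table by gender, then unpack the [locale, voice name] pair.
    table = MALE_LANGUAGES if gender.lower() == "male" else FEMALE_LANGUAGES
    locale, voice_name = table[lang_code]
    return locale, voice_name

print(voice_for("ta", "female"))  # ('ta-IN', 'ta-IN-PallaviNeural')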
app/functions/__init__.py ADDED
File without changes
app/functions/helper.py ADDED
@@ -0,0 +1,25 @@
+ from subprocess import run, DEVNULL
+ from app.captioning import generate_sub
+
+ def download_video(link, output):
+     command = ["yt-dlp", "-f", "bv*[ext=mp4]", "-o", output, link]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+
+ def download_audio(link, output):
+     command = ["yt-dlp", "-f", "ba*[ext=m4a]", "-o", output, link]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+
+ def m4a_to_wav(input_video, output):
+     command = ["ffmpeg", "-i", input_video, output]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+     print(f"m4a to wav converted, Input: {input_video}, Output: {output}")
+
+
+ def audio_to_srt(language, audio_file, output):
+     generate_sub(language, audio_file, output)
+     print("audio to srt converted")
+
+ def merge_video_audio(video_file, audio_file, output):
+     command = ["ffmpeg", "-i", video_file, "-i", audio_file, "-c:v", "copy", "-c:a", "copy", output]
+     run(command, stdout=DEVNULL, stderr=DEVNULL)
+     print(f"video and audio merged, Input: {video_file}, {audio_file}, Output: {output}")
app/functions/model.py ADDED
@@ -0,0 +1,18 @@
+ from pydantic import BaseModel
+ from fastapi import UploadFile
+
+ class VideoURL(BaseModel):
+     url: str
+     from_lang: str = "en"
+     to_lang: str = "hi"
+     gender: str = "MALE"
+
+
+ class VideoFile(BaseModel):
+     video: UploadFile
+     from_lang: str = "en"
+     to_lang: str = "hi"
+     gender: str = "MALE"
+
+ class YoutubeURL(BaseModel):
+     url: str
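
Because of the pydantic defaults, a request body only needs url; the other fields fall back to English source, Hindi target, and a male voice:

from app.functions.model import VideoURL

req = VideoURL(url="https://example.com/video.mp4")  # placeholder URL
print(req.from_lang, req.to_lang, req.gender)        # en hi MALE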
app/functions/s3_handler.py ADDED
@@ -0,0 +1,25 @@
+ import logging
+ import boto3
+ from botocore.exceptions import ClientError
+ import os
+
+ ACCESS_KEY_ID = os.environ.get("ACCESS_KEY_ID")
+ SECRET_ACCESS_KEY = os.environ.get("SECRET_ACCESS_KEY")
+
+ session = boto3.Session(ACCESS_KEY_ID, SECRET_ACCESS_KEY)
+
+ def upload_file(file_name, bucket, folder, object_name=None):
+     # If S3 object_name was not specified, use file_name
+     if object_name is None:
+         object_name = os.path.basename(file_name)
+
+     # Upload the file
+     s3_client = session.client('s3')
+     try:
+         response = s3_client.upload_file(file_name, bucket, f"{folder}/" + object_name)
+     except ClientError as e:
+         logging.error(e)
+         return False
+
+     url = f'{os.environ.get("RESULT_URL")}{folder}/{object_name}'
+     return url
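
Usage mirrors the call sites in video_url_handler.py below; the credentials come from the ACCESS_KEY_ID, SECRET_ACCESS_KEY, and RESULT_URL environment variables, and the folder name here is a placeholder:

from app.functions.s3_handler import upload_file

# Uploads subtitle.srt under the given bucket/folder and returns its public URL
# built from RESULT_URL, or False if the upload fails.
url = upload_file("subtitle.srt", "expressapi", "2024-01-01/abc123", "subtitle.srt")
print(url)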
app/functions/video_url_handler.py ADDED
@@ -0,0 +1,60 @@
+ from datetime import datetime
+ from uuid import uuid4
+ from tempfile import TemporaryDirectory
+ from .s3_handler import upload_file
+ from app.scripts import synthesise_audio
+ from .helper import download_audio, download_video, m4a_to_wav, audio_to_srt, merge_video_audio
+ from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES
+
+ def handler_video_url(url, from_lang, to_lang, gender):
+     with TemporaryDirectory(dir=".") as tempdir:
+         srt_file = f"{tempdir}/audio.srt"
+         video_file = f"{tempdir}/video.mp4"
+         audio_file = f"{tempdir}/audio.m4a"
+         audio_wav_file = f"{tempdir}/audio.wav"
+         translated_video = f"{tempdir}/translated_video.mp4"
+         download_audio(url, audio_file)
+         download_video(url, video_file)
+         m4a_to_wav(audio_file, audio_wav_file)
+         language_code = MALE_LANGUAGES[from_lang][0]
+         audio_to_srt(language_code, audio_wav_file, srt_file)
+
+         if gender.lower() == "male":
+             language_code = MALE_LANGUAGES[to_lang][0]
+             voice_name = MALE_LANGUAGES[to_lang][1]
+         else:
+             language_code = FEMALE_LANGUAGES[to_lang][0]
+             voice_name = FEMALE_LANGUAGES[to_lang][1]
+
+         result = synthesise_audio(
+             srt_file=srt_file,
+             video_file=video_file,
+             output_folder=tempdir,
+             language_code=language_code,
+             voice_name=voice_name,
+             from_lang=from_lang,
+             to_lang=to_lang,
+             gender=gender,
+         )
+         translated_srt = result["translated_subtitle"]
+         translated_audio = result["translated_audio"]
+         merge_video_audio(video_file, translated_audio, translated_video)
+
+         now = datetime.now()
+         today = now.strftime("%Y-%m-%d")
+         id = f"{today}/{str(uuid4()).replace('-', '')[:15]}"
+         srt_url = upload_file(srt_file, "expressapi", id, "subtitle.srt")
+         translated_srt_url = upload_file(
+             translated_srt, "expressapi", id, "translated_subtitle.srt"
+         )
+         translated_audio_url = upload_file(
+             translated_audio, "expressapi", id, "translated_audio.mp3"
+         )
+         translated_video_url = upload_file(translated_video, "expressapi", id, "translated_video.mp4")
+         return {
+             "srt_url": srt_url,
+             "video_url": translated_video_url,
+             "translated_srt_url": translated_srt_url,
+             "translated_audio_url": translated_audio_url,
+         }
+
app/functions/youtube_summarizer.py ADDED
@@ -0,0 +1,27 @@
+ import re, os
+ import requests
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+ API_URL = os.environ.get("SUMMARIZE_API_URL")
+ API_TOKEN = os.environ.get("SUMMARIZE_API_TOKEN")
+ headers = {"Authorization": f"Bearer {API_TOKEN}"}
+
+
+ def extract_video_id(youtube_url):
+     video_id_pattern = r"(?:/shorts/|v=)([a-zA-Z0-9_-]+)(?:&|\?|$)"
+     match = re.search(video_id_pattern, youtube_url)
+     if match:
+         video_id = match.group(1)
+         return video_id
+     else:
+         return None
+
+
+ def youtube_summarizer_handler(link):
+     video_id = extract_video_id(link)
+     subs = YouTubeTranscriptApi.get_transcript(video_id)
+     texts = " ".join([sub["text"] for sub in subs])
+     payload = {"inputs": texts}
+     response = requests.post(API_URL, headers=headers, json=payload)
+     summary = response.json()[0]
+     return summary
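
The regex accepts both watch and shorts URLs. A quick check with placeholder video IDs, assuming the module's dependencies are installed:

from app.functions.youtube_summarizer import extract_video_id

print(extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(extract_video_id("https://www.youtube.com/shorts/abc123XYZ"))     # abc123XYZ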
app/main.py ADDED
@@ -0,0 +1,24 @@
+ from fastapi import FastAPI
+ from .functions.video_url_handler import handler_video_url
+ from .functions.youtube_summarizer import youtube_summarizer_handler
+ from .functions.model import VideoURL, VideoFile, YoutubeURL
+
+
+ app = FastAPI()
+
+
+ @app.get("/")
+ async def home():
+     return {"health_check": "OK"}
+
+
+ @app.post("/synthesise_video_url")
+ async def synthesise_video_url(req: VideoURL):
+     response = handler_video_url(req.url, req.from_lang, req.to_lang, req.gender)
+     return response
+
+
+ @app.post("/youtube_summarizer")
+ async def youtube_summarizer(req: YoutubeURL):
+     response = youtube_summarizer_handler(req.url)
+     return response
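
A client-side sketch of calling the service once it is running (the Dockerfile serves it on port 7860; the host and source URL here are placeholders):

import requests

payload = {
    "url": "https://example.com/video.mp4",  # placeholder source video
    "from_lang": "en",
    "to_lang": "hi",
    "gender": "MALE",
}
resp = requests.post("http://localhost:7860/synthesise_video_url", json=payload)
print(resp.json())  # URLs for the subtitles, translated audio, and translated video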
app/scripts/TTS.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import json
+ import os
+ import time
+ import azure.cognitiveservices.speech as speechsdk
+ import datetime
+ import zipfile
+ import io
+ import copy
+ import re
+ from urllib.request import urlopen
+ from pathlib import Path
+
+ from . import azure_batch
+ from . import utils
+ from .utils import parseBool
+ # Get variables from config
+
+ # Get Azure variables if applicable
+ AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
+ AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')
+
+ azure_sentence_pause = 80
+ azure_comma_pause = 50
+ debug_mode = False
+ tts_service = 'azure'
+
+
+ # ======================================== Pronunciation Correction Functions ================================================
+ BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'
+
+ interpretAsOverrideFile = os.path.join(BASE_DIR, 'interpret-as.csv')
+ interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)
+
+ aliasOverrideFile = os.path.join(BASE_DIR, 'aliases.csv')
+ aliasEntries = utils.csv_to_dict(aliasOverrideFile)
+
+ urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
+ urlList = utils.txt_to_list(urlListFile)
+
+ phonemeFile = os.path.join(BASE_DIR, 'Phoneme_Pronunciation.csv')
+ phonemeEntries = utils.csv_to_dict(phonemeFile)
+
+ def add_all_pronunciation_overrides(text):
+     text = add_interpretas_tags(text)
+     text = add_alias_tags(text)
+     text = add_phoneme_tags(text)
+     return text
+
+ def add_interpretas_tags(text):
+     # Add interpret-as tags from interpret-as.csv
+     for entryDict in interpretAsEntries:
+         # Get entry info
+         entryText = entryDict['Text']
+         entryInterpretAsType = entryDict['interpret-as Type']
+         isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+         entryFormat = entryDict['Format (Optional)']
+
+         # Create the say-as tag
+         if entryFormat == "":
+             sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
+         else:
+             sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'
+
+         # Find and replace the word
+         findWordRegex = rf'(\b["\']?{entryText}[.,!?]?["\']?\b)' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text) # Uses a group reference, so the regex must be in parentheses
+         else:
+             text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=re.IGNORECASE)
+
+     # Add interpret-as tags from url_list.txt
+     for url in urlList:
+         # This regex matches the top-level domain extension and the punctuation before/after it, plus any periods, slashes, or colons
+         # It then wraps all matches in a say-as characters tag
+         punctuationRegex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
+         taggedURL = re.sub(punctuationRegex, r'<say-as interpret-as="characters">\1</say-as>', url)
+         # Replace any instances of the URL with the tagged version
+         text = text.replace(url, taggedURL)
+
+     return text
+
+ def add_alias_tags(text):
+     for entryDict in aliasEntries:
+         # Get entry info
+         entryText = entryDict['Original Text']
+         entryAlias = entryDict['Alias']
+         if entryDict['Case Sensitive (True/False)'] == "":
+             isCaseSensitive = False
+         else:
+             isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+
+         # Find and replace the word
+         findWordRegex = rf'\b["\'()]?{entryText}[.,!?()]?["\']?\b' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             text = re.sub(findWordRegex, rf'{entryAlias}', text)
+         else:
+             text = re.sub(findWordRegex, rf'{entryAlias}', text, flags=re.IGNORECASE)
+     return text
+
+
+ # Uses the phoneme pronunciation file to add phoneme tags to the text
+ def add_phoneme_tags(text):
+     for entryDict in phonemeEntries:
+         # Get entry info
+         entryText = entryDict['Text']
+         entryPhoneme = entryDict['Phonetic Pronunciation']
+         entryAlphabet = entryDict['Phonetic Alphabet']
+
+         if entryDict['Case Sensitive (True/False)'] == "":
+             isCaseSensitive = False
+         else:
+             isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
+
+         # Find and replace the word
+         findWordRegex = rf'(\b["\'()]?{entryText}[.,!?()]?["\']?\b)' # Find the word, with optional punctuation after, and optional quotes before or after
+         if isCaseSensitive:
+             # Note: this branch hardcodes the "ipa" alphabet, while the case-insensitive branch below uses the entry's own alphabet
+             text = re.sub(findWordRegex, rf'<phoneme alphabet="ipa" ph="{entryPhoneme}">\1</phoneme>', text)
+         else:
+             text = re.sub(findWordRegex, rf'<phoneme alphabet="{entryAlphabet}" ph="{entryPhoneme}">\1</phoneme>', text, flags=re.IGNORECASE)
+     return text
+
+ # ================================================== Azure Functions =========================================================
+
+ def synthesize_text_azure(text, duration, voiceName, languageCode):
+
+     # Create a tag for the desired duration of the clip
+     durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'
+
+     # Create the string for sentence pauses, if not default
+     if not azure_sentence_pause == 'default':
+         sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
+     else:
+         sentencePauseTag = ''
+
+     # Create the string for comma pauses, if not default
+     if not azure_comma_pause == 'default':
+         commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
+     else:
+         commaPauseTag = ''
+
+     # Set the strings for the tags that zero out leading and trailing silence
+     leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
+     tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'
+
+     # Process the text using the pronunciation customizations set by the user
+     text = add_all_pronunciation_overrides(text)
+
+     # Create the SSML syntax for Azure TTS
+     ssml = f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' " \
+         "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
+         f"<voice name='{voiceName}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
+         f"{text}</voice></speak>"
+
+     speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
+     # For Azure voices, see: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts
+     speech_config.speech_synthesis_voice_name = voiceName
+     # For audio outputs, see: https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechsynthesisoutputformat?view=azure-python
+     speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
+     synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
+
+     #result = synthesizer.speak_text_async(text).get()
+     result = synthesizer.speak_ssml_async(ssml).get()
+
+     stream = speechsdk.AudioDataStream(result)
+     return stream
+
+ def format_percentage_change(speedFactor):
+     # Determine the speedFactor value for Azure TTS. It should be either 'default' or a relative change.
+     if speedFactor == 1.0:
+         rate = 'default'
+     else:
+         # Determine whether to add a plus sign for the relative change; a negative sign is added automatically
+         if speedFactor >= 1.0:
+             percentSign = '+'
+         else:
+             percentSign = ''
+         # Convert the speedFactor float value to a relative percentage
+         rate = percentSign + str(round((speedFactor - 1.0) * 100, 5)) + '%'
+     return rate
+
+ def synthesize_text_azure_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
+
+     def create_request_payload(remainingEntriesDict):
+         # Create SSML for all subtitles
+         ssmlJson = []
+         payloadSizeInBytes = 0
+         payload = None # Initialized so the early return below cannot hit an unbound name if the very first entry is already too large
+         tempDict = dict(remainingEntriesDict) # Need to do this to avoid changing the original dict, which would mess with the loop
+
+         for key, value in tempDict.items():
+             text = tempDict[key]['translated_text']
+             duration = tempDict[key]['duration_ms_buffered']
+             language = langDict['languageCode']
+             voice = langDict['voiceName']
+
+             # Create a tag for the desired duration of the clip
+             durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'
+
+             # Create the string for sentence pauses, if not default
+             if not azure_sentence_pause == 'default':
+                 sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
+             else:
+                 sentencePauseTag = ''
+
+             # Create the string for comma pauses, if not default
+             if not azure_comma_pause == 'default':
+                 commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
+             else:
+                 commaPauseTag = ''
+
+             # Set the strings for the tags that zero out leading and trailing silence
+             leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
+             tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'
+
+             # Process the text using the pronunciation customizations set by the user
+             text = add_all_pronunciation_overrides(text)
+
+             # Create the SSML for each subtitle
+             ssml = f"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' " \
+                 "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
+                 f"<voice name='{voice}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
+                 f"{text}</voice></speak>"
+             ssmlJson.append({"text": ssml})
+
+             # Construct the request payload with the SSML
+             # Reconstruct the payload on every loop with the new SSML so that the payload size stays accurate
+             now = datetime.datetime.now()
+             pendingPayload = {
+                 'displayName': langDict['languageCode'] + '-' + now.strftime("%Y-%m-%d %H:%M:%S"),
+                 'description': 'Batch synthesis of ' + langDict['languageCode'] + ' subtitles',
+                 "textType": "SSML",
+                 # To use a custom voice, see the original example code script linked from azure_batch.py
+                 "inputs": ssmlJson,
+                 "properties": {
+                     "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
+                     "wordBoundaryEnabled": False,
+                     "sentenceBoundaryEnabled": False,
+                     "concatenateResult": False,
+                     "decompressOutputFiles": False
+                 },
+             }
+             # Azure TTS batch requests must be under 500 kilobytes, so check that the payload is under 500,000 bytes. Not sure if they actually mean kibibytes, so assume the worst case.
+             # The payload will be sent as json, so account for that with json.dumps(), otherwise the calculated size will be inaccurate
+             payloadSizeInBytes = len(str(json.dumps(pendingPayload)).encode('utf-8'))
+
+             if payloadSizeInBytes > 495000 or len(ssmlJson) > 995: # Leave some room for anything unexpected. Also, the number of inputs must be below 1000
+                 # If the payload would be too large, ignore the last entry and break out of the loop
+                 return payload, remainingEntriesDict
+             else:
+                 payload = copy.deepcopy(pendingPayload) # Must make a deepcopy, otherwise ssmlJson would be updated in both instead of just pendingPayload
+                 # Remove the entry from remainingEntriesDict once it has been added to the payload
+                 remainingEntriesDict.pop(key)
+
+
+         # If all the remaining entries fit, return the payload
+         return payload, remainingEntriesDict
+     # ------------------------- End create_request_payload() -----------------------------------
+
+
+     # Create payloads, split into multiple if necessary
+     payloadList = []
+     remainingPayloadEntriesDict = dict(subsDict) # Entries are removed as they are added to payloads
+     while len(remainingPayloadEntriesDict) > 0:
+         payloadToAppend, remainingPayloadEntriesDict = create_request_payload(remainingPayloadEntriesDict)
+         payloadList.append(payloadToAppend)
+
+     # Tell the user if the request will be broken up into multiple payloads
+     if len(payloadList) > 1:
+         print(f'Payload will be broken up into {len(payloadList)} requests (due to Azure size limitations).')
+
+     # Used to keep track of the filenames downloaded via separate zip files. Entries are removed as they are downloaded
+     remainingDownloadedEntriesList = list(subsDict.keys())
+
+     # Clear out workingFolder
+     for filename in os.listdir('workingFolder'):
+         if not debug_mode:
+             os.remove(os.path.join('workingFolder', filename))
+
+     # Loop through the payloads and submit them to Azure
+     for payload in payloadList:
+         # Reset job_id from previous loops
+         job_id = None
+
+         # Send the request to Azure
+         job_id = azure_batch.submit_synthesis(payload)
+
+         # Wait for the job to finish
+         if job_id is not None:
+             status = "Running"
+             resultDownloadLink = None
+
+             while True: # Must use break to exit the loop
+                 # Get the status
+                 response = azure_batch.get_synthesis(job_id)
+                 status = response.json()['status']
+                 if status == 'Succeeded':
+                     print('Batch synthesis job succeeded')
+                     resultDownloadLink = azure_batch.get_synthesis(job_id).json()['outputs']['result']
+                     break
+                 elif status == 'Failed':
+                     print('ERROR: Batch synthesis job failed!')
+                     print("Reason: " + response.reason)
+                     break
+                 else:
+                     print(f'Waiting for Azure batch synthesis job to finish. Status: [{status}]')
+                     time.sleep(5)
+
+             # Download the resulting zip file
+             if resultDownloadLink is not None:
+                 # Download the zip file
+                 urlResponse = urlopen(resultDownloadLink)
+
+                 # If debug mode, save the zip file to disk
+                 if debug_mode:
+                     if secondPass == False:
+                         zipName = 'azureBatch.zip'
+                     else:
+                         zipName = 'azureBatchPass2.zip'
+
+                     zipPath = os.path.join('workingFolder', zipName)
+                     with open(zipPath, 'wb') as f:
+                         f.write(urlResponse.read())
+                     # Reset urlResponse so it can be read again
+                     urlResponse = urlopen(resultDownloadLink)
+
+                 # Process the zip file
+                 virtualResultZip = io.BytesIO(urlResponse.read())
+                 zipdata = zipfile.ZipFile(virtualResultZip)
+                 zipinfos = zipdata.infolist()
+
+                 # Reorder zipinfos so the file names are in alphanumeric order
+                 zipinfos.sort(key=lambda x: x.filename)
+
+                 # Only extract the necessary files, and rename them while doing so
+                 for file in zipinfos:
+                     if file.filename == "summary.json":
+                         #zipdata.extract(file, 'workingFolder') # For debugging
+                         pass
+                     elif "json" not in file.filename:
+                         # Rename the file to match the first entry in remainingDownloadedEntriesList, then extract it
+                         currentFileNum = remainingDownloadedEntriesList[0]
+                         file.filename = str(currentFileNum) + '.mp3'
+                         #file.filename = file.filename.lstrip('0')
+
+                         # Add the file path to subsDict, then remove the entry from remainingDownloadedEntriesList
+                         subsDict[currentFileNum]['TTS_FilePath'] = os.path.join('workingFolder', str(currentFileNum)) + '.mp3'
+                         # Extract the file
+                         zipdata.extract(file, 'workingFolder')
+                         # Remove the entry from remainingDownloadedEntriesList
+                         remainingDownloadedEntriesList.pop(0)
+
+
+     return subsDict
+
+
+ def synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
+     if not skipSynthesize:
+         subsDict = synthesize_text_azure_batch(subsDict, langDict, skipSynthesize, secondPass)
+     return subsDict
+
+ def synthesize_dictionary(subsDict, langDict, outputFolder, skipSynthesize=False, secondPass=False):
+     for key, value in subsDict.items():
+         # TTS each subtitle text, write it to file, and write the filename into the dictionary
+         workingFolder = os.path.join(outputFolder, 'workingFolder')
+         filePath = os.path.join(workingFolder, f'{str(key)}.mp3')
+         filePathStem = os.path.join(workingFolder, f'{str(key)}')
+         if not skipSynthesize:
+
+             duration = value['duration_ms_buffered']
+
+             if secondPass:
+                 # Get the speed factor from subsDict
+                 speedFactor = subsDict[key]['speed_factor']
+             else:
+                 speedFactor = float(1.0)
+
+             # Prepare the output location. If the folder doesn't exist, create it
+             if not os.path.exists(os.path.dirname(filePath)):
+                 try:
+                     os.makedirs(os.path.dirname(filePath))
+                 except OSError:
+                     print("Error creating directory")
+
+
+             # If Azure TTS, use the Azure API
+             if tts_service == "azure":
+                 # The audio variable is an AudioDataStream object
+                 audio = synthesize_text_azure(value['translated_text'], duration, langDict['voiceName'], langDict['languageCode'])
+                 # Save to file using the save_to_wav_file method of the stream (despite the name, it writes the stream's bytes as-is, which are MP3-encoded here)
+                 audio.save_to_wav_file(filePath)
+
+                 # If debug mode, also write per-pass files after synthesis
+                 if debug_mode and secondPass == False:
+                     audio.save_to_wav_file(filePathStem + "_p1.mp3")
+                 elif debug_mode and secondPass == True:
+                     audio.save_to_wav_file(filePathStem + "_p2.mp3")
+
+         subsDict[key]['TTS_FilePath'] = filePath
+
+         # Get the key index
+         keyIndex = list(subsDict.keys()).index(key)
+         # Print progress, overwriting the line next time
+         if not secondPass:
+             print(f" Synthesizing TTS Line: {keyIndex+1} of {len(subsDict)}", end="\r")
+         else:
+             print(f" Synthesizing TTS Line (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+     print(" ") # Clear the line
+     return subsDict
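To make the SSML assembly in synthesize_text_azure concrete, here is roughly the document it builds for illustrative inputs (text="Hello there.", duration=2000, voiceName="hi-IN-MadhurNeural", languageCode="hi-IN", with the module defaults azure_sentence_pause=80 and azure_comma_pause=50):

    # <speak version='1.0' xml:lang='hi-IN' xmlns='http://www.w3.org/2001/10/synthesis'
    #        xmlns:mstts='http://www.w3.org/2001/mstts'>
    #   <voice name='hi-IN-MadhurNeural'>
    #     <mstts:silence type="Sentenceboundary-exact" value="80ms"/>
    #     <mstts:silence type="Comma-exact" value="50ms"/>
    #     <mstts:audioduration value="2000ms"/>
    #     <mstts:silence type="Leading-exact" value="0ms"/>
    #     <mstts:silence type="Tailing-exact" value="0ms"/>
    #     Hello there.
    #   </voice>
    # </speak>

Similarly, format_percentage_change maps speed factors to the relative rate strings Azure expects, e.g. format_percentage_change(1.25) -> '+25.0%' and format_percentage_change(0.9) -> '-10.0%'.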
app/scripts/__init__.py ADDED
@@ -0,0 +1,55 @@
+ from .audio import process_language
+ from .srt import parse_srt_file, get_duration
+ import langcodes
+ import pathlib
+ import os
+
+ def synthesise_audio(
+     srt_file,
+     video_file,
+     output_folder,
+     language_code="hi-IN",
+     voice_name="hi-IN-MadhurNeural",
+     from_lang="en",
+     to_lang="hi",
+     gender="MALE",
+ ):
+     langData = {
+         "synth_language_code": language_code,
+         "synth_voice_name": voice_name,
+         "translation_source_language": from_lang,
+         "translation_target_language": to_lang,
+         "synth_voice_gender": gender,
+         "translate_service": "azure",
+         "formality": None,
+     }
+
+     with open(srt_file, "r", encoding="utf-8-sig") as f:
+         originalSubLines = f.readlines()
+
+     originalLanguageSubsDict = parse_srt_file(originalSubLines)
+
+     totalAudioLength = get_duration(video_file)
+
+     # Use the video file name in the name of the translated srt file, along with the language's display name
+     lang = langcodes.get(to_lang).display_name()
+     translatedSrtFileName = pathlib.Path(video_file).stem + f" - {lang} - {to_lang}.srt"
+     # Set the path to save the translated srt file
+     translatedSrtFileName = f"{output_folder}/{translatedSrtFileName}"
+
+     # Resolve the display name of the synth language via its alpha-3 code
+     lang = langcodes.get(langData['synth_language_code'])
+     langName = langcodes.get(langData['synth_language_code']).get(lang.to_alpha3()).display_name()
+
+     outputFileName = pathlib.Path(video_file).stem + f" - {langName} - {langData['synth_language_code']}."
+     # Set the output path
+     outputFileName = os.path.join(output_folder, outputFileName)
+
+     process_language(
+         langData,
+         originalLanguageSubsDict,
+         totalAudioLength,
+         translatedSrtFileName,
+         outputFileName,
+         output_folder
+     )
+     return {"translated_subtitle": translatedSrtFileName, "translated_audio": outputFileName + "mp3"}
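A hedged usage sketch for this entry point (the paths are hypothetical placeholders):

    from app.scripts import synthesise_audio

    result = synthesise_audio(
        srt_file="input/lecture.srt",      # hypothetical path
        video_file="input/lecture.mp4",    # hypothetical path
        output_folder="output",
        from_lang="en",
        to_lang="hi",
    )
    # Expected shape: {"translated_subtitle": "output/....srt", "translated_audio": "output/....mp3"}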
app/scripts/audio.py ADDED
@@ -0,0 +1,62 @@
+ import copy
+ from .TTS import synthesize_dictionary_batch, synthesize_dictionary
+ from .translate import translate_dictionary
+ from .audio_builder import build_audio
+
+
+ original_language = "en"
+ batch_tts_synthesize = False
+ skip_translation = False
+ stop_after_translation = False
+ skip_synthesize = False
+
+ two_pass_voice_synth = False # Azure doesn't need two-pass voice synth, so disable it
+
+
+ def manually_prepare_dictionary(dictionaryToPrep):
+     ### Do additional processing to match the format produced by the translation function
+     # Create a new key 'translated_text' and set it to the value of 'text'
+     for key, value in dictionaryToPrep.items():
+         dictionaryToPrep[key]['translated_text'] = value['text']
+
+     # Convert the keys to integers and return the dictionary
+     return {int(k): v for k, v in dictionaryToPrep.items()}
+
+
+ # Process a language: Translate, Synthesize, and Build Audio
+ def process_language(langData, originalLanguageSubsDict, totalAudioLength, translatedSrtFileName, outputFileName, outputFolder):
+     langDict = {
+         'targetLanguage': langData['translation_target_language'],
+         'sourceLanguage': langData['translation_source_language'],
+         'voiceName': langData['synth_voice_name'],
+         'languageCode': langData['synth_language_code'],
+         'voiceGender': langData['synth_voice_gender'],
+         'translateService': langData['translate_service'],
+         'formality': langData['formality']
+     }
+
+     individualLanguageSubsDict = copy.deepcopy(originalLanguageSubsDict)
+
+     # Check for the special case where the original language is the same as the target language
+     if langDict['languageCode'].lower() == original_language.lower():
+         print("Original language is the same as the target language. Skipping translation.")
+         individualLanguageSubsDict = manually_prepare_dictionary(individualLanguageSubsDict)
+
+     elif skip_translation == False:
+         # Translate
+         individualLanguageSubsDict = translate_dictionary(individualLanguageSubsDict, langDict, translatedSrtFileName, skipTranslation=skip_translation)
+         if stop_after_translation:
+             print("Stopping at translation is enabled. Skipping TTS and building audio.")
+             return
+
+     # Synthesize
+     if batch_tts_synthesize == True:
+         individualLanguageSubsDict = synthesize_dictionary_batch(individualLanguageSubsDict, langDict, skipSynthesize=skip_synthesize)
+     else:
+         individualLanguageSubsDict = synthesize_dictionary(individualLanguageSubsDict, langDict, outputFolder, skipSynthesize=skip_synthesize)
+     print(individualLanguageSubsDict)  # Debug output of the subtitle dictionary after synthesis
+
+     # Build audio
+     individualLanguageSubsDict = build_audio(individualLanguageSubsDict, langDict, totalAudioLength, outputFileName, two_pass_voice_synth)
+
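A quick sketch of what manually_prepare_dictionary does when translation is skipped (values illustrative):

    before = {'1': {'text': 'Hello'}, '2': {'text': 'World'}}
    # manually_prepare_dictionary(before) ->
    # {1: {'text': 'Hello', 'translated_text': 'Hello'},
    #  2: {'text': 'World', 'translated_text': 'World'}}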
app/scripts/audio_builder.py ADDED
@@ -0,0 +1,181 @@
+ import soundfile
+ import pyrubberband
+ import pathlib
+ import os
+ import io
+
+
+ from . import TTS
+
+ from pydub import AudioSegment
+ from pydub.silence import detect_leading_silence
+ import langcodes
+
+
+ # Set working folder
+ workingFolder = "workingFolder"
+
+ synth_sample_rate = 24000
+ debug_mode = False
+ tts_service = "azure"
+ batch_tts_synthesize = False
+ skip_translation = False
+ stop_after_translation = False
+ skip_synthesize = False
+ force_stretch_with_twopass = False
+ output_format = "mp3"
+
+
+ def trim_clip(inputSound):
+     trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x):]
+     trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
+     strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))
+     strippedSound = strip_silence(inputSound)
+     return strippedSound
+
+ # Function to insert audio into the canvas at a specific point
+ def insert_audio(canvas, audioToOverlay, startTimeMs):
+     # Create a copy of the canvas
+     canvasCopy = canvas
+     # Overlay the audio onto the copy
+     canvasCopy = canvasCopy.overlay(audioToOverlay, position=int(startTimeMs))
+     # Return the copy
+     return canvasCopy
+
+ # Function to create a canvas of a specific duration in milliseconds
+ def create_canvas(canvasDuration, frame_rate=int(synth_sample_rate)):
+     canvas = AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)
+     return canvas
+
+ def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
+     virtualTempFile = AudioSegment.from_file(trimmedAudio, format="wav")
+     rawDuration = virtualTempFile.duration_seconds
+     trimmedAudio.seek(0) # This MUST be done to reset the file pointer to the start of the file, otherwise errors occur the next time the virtual file is accessed
+     # Calculate the speed factor and put it into the dictionary
+     desiredDuration = float(desiredDuration)
+     speedFactor = (rawDuration * 1000) / desiredDuration
+     subsDict[num]['speed_factor'] = speedFactor
+     return subsDict
+
+ def stretch_audio(audioFileToStretch, speedFactor, num):
+     virtualTempAudioFile = io.BytesIO()
+     # Read the raw audio data to stretch
+     y, sampleRate = soundfile.read(audioFileToStretch)
+
+     stretched_audio = pyrubberband.time_stretch(y, sampleRate, speedFactor, rbargs={'--fine': '--fine'}) # rbargs must be passed in this odd way because it demands a dictionary of two values
+     #soundfile.write(f'{workingFolder}\\temp_stretched.wav', stretched_audio, sampleRate)
+     soundfile.write(virtualTempAudioFile, stretched_audio, sampleRate, format='wav')
+     if debug_mode:
+         soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), stretched_audio, sampleRate) # For debugging, saves the stretched audio files
+     #return AudioSegment.from_file(f'{workingFolder}\\temp_stretched.wav', format="wav")
+     return AudioSegment.from_file(virtualTempAudioFile, format="wav")
+
+
+ def build_audio(subsDict, langDict, totalAudioLength, outputFileName, twoPassVoiceSynth=False):
+     if tts_service == 'azure':
+         twoPassVoiceSynth = False # Azure doesn't need two-pass voice synth, so disable it
+
+     virtualTrimmedFileDict = {}
+     # First trim silence off the audio files
+     for key, value in subsDict.items():
+         filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
+         subsDict[key]['TTS_FilePath_Trimmed'] = filePathTrimmed
+
+         # Trim the clip and re-write the file
+         rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
+         trimmedClip = trim_clip(rawClip)
+         if debug_mode:
+             trimmedClip.export(filePathTrimmed, format="wav")
+
+         # Create a virtual file in the dictionary with audio to be read later
+         tempTrimmedFile = io.BytesIO()
+         trimmedClip.export(tempTrimmedFile, format="wav")
+         virtualTrimmedFileDict[key] = tempTrimmedFile
+         keyIndex = list(subsDict.keys()).index(key)
+         print(f" Trimmed Audio: {keyIndex+1} of {len(subsDict)}", end="\r")
+     print("\n")
+
+     # Calculate speed factors if necessary. Azure doesn't need this, so skip it
+     if not tts_service == 'azure':
+         # Calculate speed factors for each clip, aka how much to stretch the audio
+         for key, value in subsDict.items():
+             #subsDict = get_speed_factor(subsDict, value['TTS_FilePath_Trimmed'], value['duration_ms'], num=key)
+             subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
+             keyIndex = list(subsDict.keys()).index(key)
+             print(f" Calculated Speed Factor: {keyIndex+1} of {len(subsDict)}", end="\r")
+         print("\n")
+
+     # If two-pass voice synth is enabled, have the API re-synthesize the clips at the new speed
+     # Azure allows direct specification of audio duration, so no need to re-synthesize
+     if twoPassVoiceSynth == True and not tts_service == 'azure':
+         if batch_tts_synthesize == True and tts_service == 'azure':
+             subsDict = TTS.synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)
+         else:
+             subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True) # Note: TTS.synthesize_dictionary also takes an outputFolder argument that is not passed here; this non-Azure path appears unused in this app
+
+         for key, value in subsDict.items():
+             # Trim the clip and re-write the file
+             rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
+             trimmedClip = trim_clip(rawClip)
+             if debug_mode:
+                 # Remove '.wav' from the end of the file path
+                 secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
+                 trimmedClip.export(secondPassTrimmedFile, format="wav")
+             trimmedClip.export(virtualTrimmedFileDict[key], format="wav")
+             keyIndex = list(subsDict.keys()).index(key)
+             print(f" Trimmed Audio (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+         print("\n")
+
+         if force_stretch_with_twopass == True:
+             for key, value in subsDict.items():
+                 subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
+                 keyIndex = list(subsDict.keys()).index(key)
+                 print(f" Calculated Speed Factor (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
+             print("\n")
+
+     # Create the canvas to overlay audio onto
+     canvas = create_canvas(totalAudioLength)
+
+     # Stretch audio and insert it into the canvas
+     for key, value in subsDict.items():
+         if (not twoPassVoiceSynth or force_stretch_with_twopass == True) and not tts_service == 'azure': # Don't stretch if Azure is used
+             #stretchedClip = stretch_audio(value['TTS_FilePath_Trimmed'], speedFactor=subsDict[key]['speed_factor'], num=key)
+             stretchedClip = stretch_audio(virtualTrimmedFileDict[key], speedFactor=subsDict[key]['speed_factor'], num=key)
+         else:
+             #stretchedClip = AudioSegment.from_file(value['TTS_FilePath_Trimmed'], format="wav")
+             stretchedClip = AudioSegment.from_file(virtualTrimmedFileDict[key], format="wav")
+             virtualTrimmedFileDict[key].seek(0) # Not 100% sure if this is necessary, but it was in the other place it is used
+
+         canvas = insert_audio(canvas, stretchedClip, value['start_ms'])
+         keyIndex = list(subsDict.keys()).index(key)
+         print(f" Final Audio Processed: {keyIndex+1} of {len(subsDict)}", end="\r")
+     print("\n")
+
+
+     # Determine the string to use for the output format and file extension based on the config setting
+     outputFormat = output_format.lower()
+     if outputFormat == "mp3":
+         outputFileName += "mp3"
+         formatString = "mp3"
+     elif outputFormat == "wav":
+         outputFileName += "wav"
+         formatString = "wav"
+     elif outputFormat == "aac":
+         #outputFileName += "m4a"
+         #formatString = "mp4"
+         outputFileName += "aac"
+         formatString = "adts" # Pydub doesn't accept "aac" as a format, so use "adts" with the file extension "aac" (alternatively "mp4" with the extension "m4a")
+
+     canvas = canvas.set_channels(2) # Change from mono to stereo
+     try:
+         print("\nExporting audio file...")
+         canvas.export(outputFileName, format=formatString, bitrate="192k")
+     except Exception:
+         outputFileName = outputFileName + ".bak"
+         canvas.export(outputFileName, format=formatString, bitrate="192k")
+         print("\nThere was an issue exporting the audio; it might be a permission error. The file was saved as a backup with the extension .bak")
+         print("Try removing the .bak extension, then listen to the file to see if it worked.\n")
+         input("Press Enter to exit...")
+
+     return subsDict
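A worked example of the speed-factor arithmetic in get_speed_factor (numbers illustrative): a trimmed clip measuring 2.5 s that must fit a 2000 ms subtitle slot needs to play 1.25x faster.

    rawDuration = 2.5          # seconds, measured from the trimmed clip
    desiredDuration = 2000.0   # milliseconds, the subtitle slot
    speedFactor = (rawDuration * 1000) / desiredDuration  # -> 1.25
    # pyrubberband.time_stretch then compresses the clip by that factor (non-Azure services only)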
app/scripts/azure_batch.py ADDED
@@ -0,0 +1,78 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # Based on Microsoft Azure sample code found here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch-synthesis/python/synthesis.py
+ # Original License Info Below:
+ # Copyright (c) Microsoft. All rights reserved.
+ # Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+ #--------------------------------------------------------------------------------------------------------
+ import os
+ import json
+ import logging
+ import sys
+
+ import requests
+
+
+ logging.basicConfig(stream=sys.stdout, level=logging.ERROR,
+                     format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")
+ logger = logging.getLogger(__name__)
+
+ # Your Speech resource key and region
+ # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
+
+ AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
+ AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')
+
+ NAME = "Simple synthesis"
+ DESCRIPTION = "Simple synthesis description"
+
+ # The service host suffix.
+ # For azure.cn the host suffix is "customvoice.api.speech.azure.cn"
+ SERVICE_HOST = "customvoice.api.speech.microsoft.com"
+
+
+ def submit_synthesis(payload):
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY,
+         'Content-Type': 'application/json'
+     }
+
+     response = requests.post(url, json.dumps(payload), headers=header)
+     if response.status_code < 400:
+         logger.info('Batch synthesis job submitted successfully')
+         logger.info(f'Job ID: {response.json()["id"]}')
+         return response.json()["id"]
+     else:
+         logger.error(f'Failed to submit batch synthesis job: {response.text}')
+
+
+ def get_synthesis(job_id):
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/{job_id}'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
+     }
+     response = requests.get(url, headers=header)
+     if response.status_code < 400:
+         logger.info('Got batch synthesis job successfully')
+         logger.info(response.json())
+         #return response.json()['status']
+         return response
+     else:
+         logger.error(f'Failed to get batch synthesis job: {response.text}')
+
+
+ def list_synthesis_jobs(skip: int = 0, top: int = 100):
+     """List all batch synthesis jobs in the subscription"""
+     url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis?skip={skip}&top={top}'
+     header = {
+         'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
+     }
+     response = requests.get(url, headers=header)
+     if response.status_code < 400:
+         logger.info(f'Listed batch synthesis jobs successfully, got {len(response.json()["values"])} jobs')
+         logger.info(response.json())
+     else:
+         logger.error(f'Failed to list batch synthesis jobs: {response.text}')
+
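A minimal sketch of how these helpers are driven (this mirrors the polling loop in TTS.py; the payload is the batch request dict built there):

    import time

    job_id = submit_synthesis(payload)  # payload: dict with 'inputs', 'textType', 'properties', ...
    while job_id:
        status = get_synthesis(job_id).json()['status']
        if status in ('Succeeded', 'Failed'):
            break
        time.sleep(5)  # poll until the batch job settles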
app/scripts/azure_translate.py ADDED
@@ -0,0 +1,28 @@
+ import requests, uuid, json, os
+
+
+ def azure_translate_text(text_list, from_lang="en", to_lang="hi"):
+     TRANSLATE_API_ENDPOINT = os.environ.get("TRANSLATE_API_ENDPOINT")
+     url = f"{TRANSLATE_API_ENDPOINT}/translate"
+
+     params = {
+         'api-version': '3.0',
+         'from': from_lang,
+         'to': [to_lang]
+     }
+
+     TRANSLATE_KEY = os.environ.get("TRANSLATE_KEY")
+     LOCATION = os.environ.get("SPEECH_REGION")
+
+     headers = {
+         'Ocp-Apim-Subscription-Key': TRANSLATE_KEY,
+         'Ocp-Apim-Subscription-Region': LOCATION,
+         'Content-type': 'application/json',
+         'X-ClientTraceId': str(uuid.uuid4())
+     }
+     body = [{"text": text} for text in text_list]
+
+     response = requests.post(url, params=params, headers=headers, json=body)
+     result = response.json()
+     # Keep only the first translation of each item
+     result = [{"text": item["translations"][0]["text"]} for item in result]
+     return result
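For orientation, the Translator v3 wire shapes this function assumes (translation content illustrative):

    # Request body sent:  [{"text": "Good morning"}]
    # Raw JSON response:  [{"translations": [{"text": "सुप्रभात", "to": "hi"}]}]
    # Value returned:     [{"text": "सुप्रभात"}]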
app/scripts/srt.py ADDED
@@ -0,0 +1,86 @@
+ import re
+
+ def parse_srt_file(srtFileLines, preTranslated=False):
+     # Matches timestamp lines like the following example: 00:00:20,130 --> 00:00:23,419
+     subtitleTimeLineRegex = re.compile(r'\d\d:\d\d:\d\d,\d\d\d --> \d\d:\d\d:\d\d,\d\d\d')
+
+     # Create a dictionary
+     subsDict = {}
+
+     # Will add this many milliseconds of extra silence before and after each audio clip / spoken subtitle line
+     addBufferMilliseconds = 0
+
+     # Enumerate the lines; when a line contains only an integer, use that number as the key and a dictionary as the value
+     # The dictionary contains the start, end, and duration of the subtitle as well as its text
+     # The next line uses the syntax HH:MM:SS,MMM --> HH:MM:SS,MMM; the difference between the two times goes into the dictionary
+     # The line after that holds the subtitle text
+     for lineNum, line in enumerate(srtFileLines):
+         line = line.strip()
+         if line.isdigit() and subtitleTimeLineRegex.match(srtFileLines[lineNum + 1]):
+             lineWithTimestamps = srtFileLines[lineNum + 1].strip()
+             lineWithSubtitleText = srtFileLines[lineNum + 2].strip()
+
+             # If there are more lines after the subtitle text, add them to the text
+             count = 3
+             while True:
+                 # Check if the next line is blank or not
+                 if (lineNum + count) < len(srtFileLines) and srtFileLines[lineNum + count].strip():
+                     lineWithSubtitleText += ' ' + srtFileLines[lineNum + count].strip()
+                     count += 1
+                 else:
+                     break
+
+             # Create an empty dictionary with keys for the start and end times and the subtitle text
+             subsDict[line] = {'start_ms': '', 'end_ms': '', 'duration_ms': '', 'text': '', 'break_until_next': '', 'srt_timestamps_line': lineWithTimestamps}
+
+             time = lineWithTimestamps.split(' --> ')
+             time1 = time[0].split(':')
+             time2 = time[1].split(':')
+
+             # Convert the times to milliseconds
+             processedTime1 = int(time1[0]) * 3600000 + int(time1[1]) * 60000 + int(time1[2].split(',')[0]) * 1000 + int(time1[2].split(',')[1]) #/ 1000 #Uncomment to turn into seconds
+             processedTime2 = int(time2[0]) * 3600000 + int(time2[1]) * 60000 + int(time2[2].split(',')[0]) * 1000 + int(time2[2].split(',')[1]) #/ 1000 #Uncomment to turn into seconds
+             timeDifferenceMs = str(processedTime2 - processedTime1)
+
+             # Adjust the times with the buffer
+             if addBufferMilliseconds > 0 and not preTranslated:
+                 subsDict[line]['start_ms_buffered'] = str(processedTime1 + addBufferMilliseconds)
+                 subsDict[line]['end_ms_buffered'] = str(processedTime2 - addBufferMilliseconds)
+                 subsDict[line]['duration_ms_buffered'] = str((processedTime2 - addBufferMilliseconds) - (processedTime1 + addBufferMilliseconds))
+             else:
+                 subsDict[line]['start_ms_buffered'] = str(processedTime1)
+                 subsDict[line]['end_ms_buffered'] = str(processedTime2)
+                 subsDict[line]['duration_ms_buffered'] = str(processedTime2 - processedTime1)
+
+             # Set the keys in the dictionary to the values
+             subsDict[line]['start_ms'] = str(processedTime1)
+             subsDict[line]['end_ms'] = str(processedTime2)
+             subsDict[line]['duration_ms'] = timeDifferenceMs
+             subsDict[line]['text'] = lineWithSubtitleText
+             if lineNum > 0:
+                 # Go back to the previous subtitle's dictionary and write the gap before the current line
+                 subsDict[str(int(line) - 1)]['break_until_next'] = processedTime1 - int(subsDict[str(int(line) - 1)]['end_ms'])
+             else:
+                 subsDict[line]['break_until_next'] = 0
+
+
+     # Apply the buffer to the start and end times by copying the buffered values over the main values
+     if addBufferMilliseconds > 0 and not preTranslated:
+         for key, value in subsDict.items():
+             subsDict[key]['start_ms'] = value['start_ms_buffered']
+             subsDict[key]['end_ms'] = value['end_ms_buffered']
+             subsDict[key]['duration_ms'] = value['duration_ms_buffered']
+
+     return subsDict
+
+
+ def get_duration(filename):
+     import subprocess, json
+     # Ask ffprobe for the container duration as JSON
+     result = subprocess.check_output(f'ffprobe -i {filename} -show_entries format=duration -v quiet -of json', shell=True).decode()
+
+     try:
+         duration = json.loads(result)['format']["duration"]
+     except KeyError:
+         print("Error: Could not get duration of video file. Please check the file path and try again.")
+         raise
+     durationMS = round(float(duration) * 1000) # Convert to milliseconds
+     return durationMS
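A worked example of the timestamp conversion in parse_srt_file: "00:01:05,250" becomes 0*3600000 + 1*60000 + 5*1000 + 250 = 65250 ms.

    h, m, s_ms = "00:01:05,250".split(':')
    s, ms = s_ms.split(',')
    total_ms = int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)  # -> 65250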
app/scripts/translate.py ADDED
@@ -0,0 +1,402 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: UTF-8 -*-
3
+
4
+ # Imports
5
+ import re, regex
6
+ from . import utils
7
+ from .azure_translate import azure_translate_text
8
+
9
+
10
+ from operator import itemgetter
11
+ import sys
12
+ import copy
13
+ import os
14
+ import html
15
+ from pathlib import Path
16
+
17
+
18
+ combine_subtitles_max_chars = 200
19
+ translate_service = 'azure'
20
+ # -------------------------------- No Translate and Manual Translation Functions -----------------------------------
21
+ BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'
22
+
23
+ # Import files and put into dictionaries
24
+ noTranslateOverrideFile = os.path.join(BASE_DIR, 'dont_translate_phrases.txt')
25
+ dontTranslateList = utils.txt_to_list(noTranslateOverrideFile)
26
+ manualTranslationOverrideFile = os.path.join(BASE_DIR, 'Manual_Translations.csv')
27
+ manualTranslationsDict = utils.csv_to_dict(manualTranslationOverrideFile)
28
+ urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
29
+ urlList = utils.txt_to_list(urlListFile)
30
+
31
+ # Add span tags around certain words to exclude them from being translated
32
+ def add_notranslate_tags_from_notranslate_file(text, phraseList):
33
+ for word in phraseList:
34
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{word}[.,!?()]?["\']?)(\p{{Z}}|$)' #\p ensures it works with unicode characters
35
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
36
+ # Find the word, with optional punctuation after, and optional quotes before or after
37
+ text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
38
+ return text
39
+
40
+ def remove_notranslate_tags(text):
41
+ text = text.replace('<span class="notranslate">', '').replace('</span>', '')
42
+ return text
43
+
44
+ def add_notranslate_tags_for_manual_translations(text, langcode):
45
+ for manualTranslatedText in manualTranslationsDict:
46
+ # Only replace text if the language matches the entry in the manual translations file
47
+ if manualTranslatedText['Language Code'] == langcode:
48
+ originalText = manualTranslatedText['Original Text']
49
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
50
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
51
+ text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
52
+ return text
53
+
54
+ # Replace certain words or phrases with their manual translation
55
+ def replace_manual_translations(text, langcode):
56
+ for manualTranslatedText in manualTranslationsDict:
57
+ # Only replace text if the language matches the entry in the manual translations file
58
+ if manualTranslatedText['Language Code'] == langcode:
59
+ originalText = manualTranslatedText['Original Text']
60
+ translatedText = manualTranslatedText['Translated Text']
61
+ findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
62
+ findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
63
+ # Substitute the matched word with the translated text
64
+ text = findWordRegexCompiled.sub(rf'\1{translatedText}\3', text)
65
+ return text
66
+
67
+
68
+
69
+ #======================================== Translate Text ================================================
70
+ # Note: This function was almost entirely written by GPT-3 after feeding it my original code and asking it to change it so it
71
+ # would break up the text into chunks if it was too long. It appears to work
72
+
73
+ def process_response_text(text, targetLanguage):
74
+ text = html.unescape(text)
75
+ text = remove_notranslate_tags(text)
76
+ text = replace_manual_translations(text, targetLanguage)
77
+ return text
78
+
79
+ def split_transcript_chunks(text, max_length=5000):
80
+ # Calculate the total number of utf-8 codepoints
81
+ #totalCodepoints = len(text.encode("utf-8"))
82
+
83
+ # Split the transcript into sentences
84
+ sentences = re.split(r'(?<=[.!?])\s+', text)
85
+
86
+ # Initialize a list to store the chunks of text
87
+ chunks = []
88
+
89
+ # Initialize a string to store a chunk of text
90
+ chunk = ""
91
+
92
+ # For each sentence in the list of sentences
93
+ for sentence in sentences:
94
+ # If adding the sentence to the chunk would keep it within the maximum length
95
+ if len(chunk.encode("utf-8")) + len(sentence.encode("utf-8")) + 1 <= max_length: # Adding 1 to account for space
96
+ # Add the sentence to the chunk
97
+ chunk += sentence + " "
98
+ else:
99
+ # If adding the sentence would exceed the maximum length and chunk is not empty
100
+ if chunk:
101
+ # Add the chunk to the list of chunks
102
+ chunks.append(chunk.strip())
103
+ # Start a new chunk with the current sentence
104
+ chunk = sentence + " "
105
+
106
+ # Add the last chunk to the list of chunks (if it's not empty)
107
+ if chunk:
108
+ chunks.append(chunk.strip())
109
+
110
+ # Return the list of chunks
111
+ return chunks
112
+
113
+ def convertChunkListToCompatibleDict(chunkList):
114
+ # Create dictionary with numbers as keys and chunks as values
115
+ chunkDict = {}
116
+ for i, chunk in enumerate(chunkList, 1):
117
+ chunkDict[i] = {'text': chunk}
118
+ return chunkDict
119
+
120
+
121
+ # Translate the text entries of the dictionary
122
+ def translate_dictionary(inputSubsDict, langDict, translatedSrtFileName, skipTranslation=False, ):
123
+ targetLanguage = langDict['targetLanguage']
124
+ sourceLanguage = langDict['sourceLanguage']
125
+ translateService = langDict['translateService']
126
+
127
+ # Create a container for all the text to be translated
128
+ textToTranslate = []
129
+
130
+ for key in inputSubsDict:
131
+ originalText = inputSubsDict[key]['text']
132
+ # Add any 'notranslate' tags to the text
133
+ processedText = add_notranslate_tags_from_notranslate_file(originalText, dontTranslateList)
134
+ processedText = add_notranslate_tags_from_notranslate_file(processedText, urlList)
135
+ processedText = add_notranslate_tags_for_manual_translations(processedText, targetLanguage)
136
+
137
+ # Add the text to the list of text to be translated
138
+ textToTranslate.append(processedText)
139
+
140
+ # Calculate the total number of utf-8 codepoints
141
+ codepoints = 0
142
+ for text in textToTranslate:
143
+ codepoints += len(text.encode("utf-8"))
144
+
145
+ # If the codepoints are greater than 28000, split the request into multiple
146
+ # Google's API limit is 30000 Utf-8 codepoints per request, while DeepL's is 130000, but we leave some room just in case
147
+ if skipTranslation == False:
148
+ if translateService == 'azure':
149
+ print("Translating text using Azure...")
150
+ result = azure_translate_text(textToTranslate, sourceLanguage, targetLanguage)
151
+
152
+ # Add the translated texts to the dictionary
153
+ for i, key in enumerate(inputSubsDict):
154
+ inputSubsDict[key]['translated_text'] = process_response_text(result[i]["text"], targetLanguage)
155
+ # Print progress, overwrite the same line
156
+ print(f' Translated: {key} of {len(inputSubsDict)}', end='\r')
157
+ else:
158
+ print("Error: Invalid translate_service setting. Only 'Azure' is supported.")
159
+ sys.exit()
160
+ else:
161
+ for key in inputSubsDict:
162
+ inputSubsDict[key]['translated_text'] = process_response_text(inputSubsDict[key]['text'], targetLanguage) # Skips translating, such as for testing
163
+ print(" ")
164
+
165
+
166
+ combinedProcessedDict = combine_subtitles_advanced(inputSubsDict, int(combine_subtitles_max_chars))
167
+
168
+ if skipTranslation == False:
169
+ # Write new srt file with translated text
170
+ with open(translatedSrtFileName, 'w', encoding='utf-8-sig') as f:
171
+ for key in combinedProcessedDict:
172
+ f.write(str(key) + '\n')
173
+ f.write(combinedProcessedDict[key]['srt_timestamps_line'] + '\n')
174
+ f.write(combinedProcessedDict[key]['translated_text'] + '\n')
175
+ f.write('\n')
176
+
177
+ return combinedProcessedDict
178
+
179
+
180
+ ##### Add additional info to the dictionary for each language #####
181
+ def set_translation_info(languageBatchDict):
182
+ newBatchSettingsDict = copy.deepcopy(languageBatchDict)
183
+
184
+ # If using Azure, set all languages to use Azure in dictionary
185
+ if translate_service == 'azure':
186
+ for langNum, langInfo in languageBatchDict.items():
187
+ newBatchSettingsDict[langNum]['translate_service'] = 'azure'
188
+ newBatchSettingsDict[langNum]['formality'] = None
189
+
190
+ else:
191
+ print("Error: No valid translation service selected. Please choose a valid service or enable 'skip_translation' in config.")
192
+ sys.exit()
193
+
194
+ return newBatchSettingsDict
195
+
196
+
197
+ #======================================== Combine Subtitle Lines ================================================
198
+ def combine_subtitles_advanced(inputDict, maxCharacters=200):
199
+ charRateGoal = 20 #20
200
+ gapThreshold = 100 # The maximum gap between subtitles to combine
201
+ noMorePossibleCombines = False
202
+ # Convert dictionary to list of dictionaries of the values
203
+ entryList = []
204
+
205
+ for key, value in inputDict.items():
206
+ value['originalIndex'] = int(key)-1
207
+ entryList.append(value)
208
+
209
+ while not noMorePossibleCombines:
210
+ entryList, noMorePossibleCombines = combine_single_pass(entryList, charRateGoal, gapThreshold, maxCharacters)
211
+
212
+ # Convert the list back to a dictionary then return it
213
+ return dict(enumerate(entryList, start=1))
214
+
215
+ def combine_single_pass(entryListLocal, charRateGoal, gapThreshold, maxCharacters):
216
+ # Want to restart the loop if a change is made, so use this variable, otherwise break only if the end is reached
217
+ reachedEndOfList = False
218
+ noMorePossibleCombines = True # Will be set to False if a combination is made
219
+
220
+ # Use while loop because the list is being modified
221
+ while not reachedEndOfList:
222
+
223
+ # Need to update original index in here
224
+ for entry in entryListLocal:
225
+ entry['originalIndex'] = entryListLocal.index(entry)
226
+
227
+ # Will use later to check if an entry is the last one in the list, because the last entry will have originalIndex equal to the length of the list - 1
228
+ originalNumberOfEntries = len(entryListLocal)
229
+
230
+ # Need to calculate the char_rate for each entry, any time something changes, so put it at the top of this loop
231
+ entryListLocal = calc_list_speaking_rates(entryListLocal, charRateGoal)
232
+
233
+ # Sort the list by the difference in speaking speed from charRateGoal
234
+ priorityOrderedList = sorted(entryListLocal, key=itemgetter('char_rate_diff'), reverse=True)
235
+
236
+ # Iterates through the list in order of priority, and uses that index to operate on entryListLocal
237
+ # For loop is broken after a combination is made, so that the list can be re-sorted and re-iterated
238
+ for progress, data in enumerate(priorityOrderedList):
239
+ i = data['originalIndex']
240
+ # Check if last entry, and therefore will end loop when done with this iteration
241
+ if progress == len(priorityOrderedList) - 1:
242
+ reachedEndOfList = True
243
+
244
+ # Check if the current entry is outside the upper and lower bounds
245
+ if (data['char_rate'] > charRateGoal or data['char_rate'] < charRateGoal):
246
+
247
+ # Check if the entry is the first in entryListLocal, if so do not consider the previous entry
248
+ if data['originalIndex'] == 0:
249
+ considerPrev = False
250
+ else:
251
+ considerPrev = True
252
+
253
+ # Check if the entry is the last in entryListLocal, if so do not consider the next entry
254
+ if data['originalIndex'] == originalNumberOfEntries - 1:
255
+ considerNext = False
256
+ else:
257
+ considerNext = True
258
+
259
+ # Check if current entry is still in the list - if it has been combined with another entry, it will not be
260
+
261
+
262
+ # Get the char_rate of the next and previous entries, if they exist, and calculate the difference
263
+ # If the diff is positive, then it is lower than the current char_rate
264
+ try:
265
+ nextCharRate = entryListLocal[i+1]['char_rate']
266
+ nextDiff = data['char_rate'] - nextCharRate
267
+ except IndexError:
268
+ considerNext = False
269
+ nextCharRate = None
270
+ nextDiff = None
271
+ try:
272
+ prevCharRate = entryListLocal[i-1]['char_rate']
273
+ prevDiff = data['char_rate'] - prevCharRate
274
+ except IndexError:
275
+ considerPrev = False
276
+ prevCharRate = None
277
+ prevDiff = None
278
+
279
+ else:
280
+ continue
281
+
282
+ # Define functions for combining with previous or next entries - Generated with copilot, it's possible this isn't perfect
283
+ def combine_with_next():
284
+ entryListLocal[i]['text'] = entryListLocal[i]['text'] + ' ' + entryListLocal[i+1]['text']
285
+ entryListLocal[i]['translated_text'] = entryListLocal[i]['translated_text'] + ' ' + entryListLocal[i+1]['translated_text']
286
+ entryListLocal[i]['end_ms'] = entryListLocal[i+1]['end_ms']
287
+ entryListLocal[i]['end_ms_buffered'] = entryListLocal[i+1]['end_ms_buffered']
288
+ entryListLocal[i]['duration_ms'] = int(entryListLocal[i+1]['end_ms']) - int(entryListLocal[i]['start_ms'])
289
+ entryListLocal[i]['duration_ms_buffered'] = int(entryListLocal[i+1]['end_ms_buffered']) - int(entryListLocal[i]['start_ms_buffered'])
290
+ entryListLocal[i]['srt_timestamps_line'] = entryListLocal[i]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i+1]['srt_timestamps_line'].split(' --> ')[1]
291
+ del entryListLocal[i+1]
292
+
293
+ def combine_with_prev():
294
+ entryListLocal[i-1]['text'] = entryListLocal[i-1]['text'] + ' ' + entryListLocal[i]['text']
295
+ entryListLocal[i-1]['translated_text'] = entryListLocal[i-1]['translated_text'] + ' ' + entryListLocal[i]['translated_text']
296
+ entryListLocal[i-1]['end_ms'] = entryListLocal[i]['end_ms']
297
+ entryListLocal[i-1]['end_ms_buffered'] = entryListLocal[i]['end_ms_buffered']
298
+ entryListLocal[i-1]['duration_ms'] = int(entryListLocal[i]['end_ms']) - int(entryListLocal[i-1]['start_ms'])
299
+ entryListLocal[i-1]['duration_ms_buffered'] = int(entryListLocal[i]['end_ms_buffered']) - int(entryListLocal[i-1]['start_ms_buffered'])
300
+ entryListLocal[i-1]['srt_timestamps_line'] = entryListLocal[i-1]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i]['srt_timestamps_line'].split(' --> ')[1]
301
+ del entryListLocal[i]
302
+
+ # Choose whether to consider the next and previous entries; if neither qualifies, continue to the next loop iteration
+ if data['char_rate'] > charRateGoal:
+     # Check that the next/previous rates are lower than the current rate, that the combined entry is not too long, and that the gap between entries is not too large.
+     # The considerNext/considerPrev checks must come first, because running the other checks when there is no next/prev value to check would throw an error.
+     if not considerNext or not nextDiff or nextDiff < 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
+         considerNext = False
+     try:
+         if not considerPrev or not prevDiff or prevDiff < 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
+             considerPrev = False
+     except TypeError:
+         considerPrev = False
+
+ elif data['char_rate'] < charRateGoal:
+     # Check that the next/previous rates are higher than the current rate
+     if not considerNext or not nextDiff or nextDiff > 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
+         considerNext = False
+     try:
+         if not considerPrev or not prevDiff or prevDiff > 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
+             considerPrev = False
+     except TypeError:
+         considerPrev = False
+ else:
+     continue
+
+ # Continue to the next loop iteration if neither neighbor is considered
+ if not considerNext and not considerPrev:
+     continue
+
+ # Should only reach this point if two entries are to be combined
+ if data['char_rate'] > charRateGoal:
+     # If both are to be considered, choose the one with the lower char_rate
+     if considerNext and considerPrev:
+         if nextDiff < prevDiff:
+             combine_with_next()
+         else:
+             combine_with_prev()
+     # If only one is to be considered, combine with that one
+     elif considerNext:
+         combine_with_next()
+     elif considerPrev:
+         combine_with_prev()
+     else:
+         print(f"Error U: Should not reach this point! Current entry = {i}")
+         print(f"Current Entry Text = {data['text']}")
+         continue
+     noMorePossibleCombines = False
+     break
+
+ elif data['char_rate'] < charRateGoal:
+     # If both are to be considered, choose the one with the higher char_rate
+     if considerNext and considerPrev:
+         if nextDiff > prevDiff:
+             combine_with_next()
+         else:
+             combine_with_prev()
+     # If only one is to be considered, combine with that one
+     elif considerNext:
+         combine_with_next()
+     elif considerPrev:
+         combine_with_prev()
+     else:
+         print(f"Error L: Should not reach this point! Index = {i}")
+         print(f"Current Entry Text = {data['text']}")
+         continue
+     noMorePossibleCombines = False
+     break
+
+ return entryListLocal, noMorePossibleCombines
+
+ #-- End of combine_single_pass --
+
+ #----------------------------------------------------------------------
+
+ # Calculate the characters-per-second speaking rate for each subtitle entry
+ def calc_dict_speaking_rates(inputDict, dictKey='translated_text'):
+     tempDict = copy.deepcopy(inputDict)
+     for key, value in tempDict.items():
+         tempDict[key]['char_rate'] = round(len(value[dictKey]) / (int(value['duration_ms']) / 1000), 2)
+     return tempDict
+
+ def calc_list_speaking_rates(inputList, charRateGoal, dictKey='translated_text'):
+     tempList = copy.deepcopy(inputList)
+     for i in range(len(tempList)):
+         # Characters per second, based on the duration of the entry
+         tempList[i]['char_rate'] = round(len(tempList[i][dictKey]) / (int(tempList[i]['duration_ms']) / 1000), 2)
+         # Absolute difference between the current char_rate and the goal char_rate
+         tempList[i]['char_rate_diff'] = abs(round(tempList[i]['char_rate'] - charRateGoal, 2))
+     return tempList
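
Taken together, these helpers suggest a fixed-point loop: combine_single_pass merges at most one pair per call (it breaks out after each combine) and reports whether anything changed, so a caller presumably reapplies it, recomputing char rates in between, until noMorePossibleCombines stays True. A minimal sketch of such a driver, assuming combine_single_pass takes the entry list plus the tuning values referenced above (its real signature is defined earlier in this file and may differ):

    # Sketch only: the combine_single_pass argument list below is an assumption.
    entries = calc_list_speaking_rates(entries, charRateGoal)
    noMorePossibleCombines = False
    while not noMorePossibleCombines:
        entries, noMorePossibleCombines = combine_single_pass(entries, charRateGoal, gapThreshold, maxCharacters)
        # Merging changes text lengths and durations, so refresh the rates each pass
        entries = calc_list_speaking_rates(entries, charRateGoal)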
app/scripts/utils.py ADDED
@@ -0,0 +1,56 @@
+ import csv
+
+ # Interprets a string as a boolean. Returns True or False.
+ def parseBool(string, silent=False):
+     if isinstance(string, str):
+         if string.lower() == 'true':
+             return True
+         elif string.lower() == 'false':
+             return False
+         elif silent:
+             # In silent mode, return unrecognized strings unchanged instead of raising
+             return string
+         else:
+             raise ValueError(f'Invalid value "{string}". Must be "True" or "False"')
+     elif isinstance(string, bool):
+         return string
+     else:
+         raise ValueError('Not a valid boolean string')
+
+ def parseConfigSetting(setting):
+     # Remove any quotes the user may have added in the config file
+     setting = setting.strip("\"").strip("\'")
+
+     # Check if it is a boolean
+     parsed = parseBool(setting, silent=True)
+     if isinstance(parsed, bool):
+         return parsed
+
+     # Check if it is an integer
+     try:
+         return int(setting)
+     except ValueError:
+         pass
+
+     # Otherwise return the string in lower case
+     return setting.lower()
+
+ # Returns a list of dictionaries from a CSV file, where each key is a column name
+ # and each value is that row's value for the column. Column names are taken from
+ # the first row of the file.
+ def csv_to_dict(csvFilePath):
+     with open(csvFilePath, "r", encoding='utf-8-sig') as data:
+         return list(csv.DictReader(data))
+
+ # Returns a list of strings from a txt file. Ignores empty lines and lines that start with '#'.
+ def txt_to_list(txtFilePath):
+     with open(txtFilePath, "r", encoding='utf-8-sig') as data:
+         return [line.strip() for line in data if line.strip() and not line.strip().startswith('#')]
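
For illustration, a few hypothetical calls against the helpers above (the literal values and file names are made up, not taken from the app):

    parseConfigSetting('"True"')   # -> True: quotes stripped, then parsed as a boolean
    parseConfigSetting('42')       # -> 42: parsed as an integer
    parseConfigSetting('Azure')    # -> 'azure': falls through to the lowercased string

    rows = csv_to_dict('entries.csv')     # list of {column_name: value} dicts, one per row
    langs = txt_to_list('languages.txt')  # stripped lines, minus blanks and '#' comments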
requirements.txt ADDED
@@ -0,0 +1,43 @@
+ annotated-types==0.5.0
+ anyio==3.7.1
+ azure-cognitiveservices-speech==1.30.0
+ boto3==1.28.18
+ botocore==1.31.18
+ Brotli==1.0.9
+ certifi==2023.7.22
+ cffi==1.15.1
+ charset-normalizer==3.2.0
+ click==8.0.3
+ essentials==1.1.4
+ fastapi==0.100.1
+ h11==0.14.0
+ idna==3.4
+ jmespath==1.0.1
+ langcodes==3.3.0
+ language-data==1.1
+ marisa-trie==0.7.8
+ mutagen==1.46.0
+ numpy==1.25.2
+ pycparser==2.21
+ pycryptodomex==3.18.0
+ pydantic==2.1.1
+ pydantic_core==2.4.0
+ pydub==0.25.1
+ pyrubberband==0.3.0
+ PySoundFile==0.9.0.post1
+ python-dateutil==2.8.2
+ python-dotenv==0.19.2
+ python-multipart==0.0.6
+ regex==2023.6.3
+ requests==2.31.0
+ s3transfer==0.6.1
+ six==1.16.0
+ sniffio==1.3.0
+ soundfile==0.12.1
+ starlette==0.27.0
+ typing_extensions==4.7.1
+ urllib3==1.26.16
+ uvicorn==0.23.2
+ websockets==11.0.3
+ yt-dlp==2023.7.6
+ youtube_transcript_api