Spaces (Sleeping)
badal committed · Commit 2f2406a · 0 Parent(s)
feat: initial commit
- .gitattributes +35 -0
- .gitignore +29 -0
- Dockerfile +41 -0
- README.md +10 -0
- app/SSML_Customization/Examples.xlsx +0 -0
- app/SSML_Customization/Manual_Translations.csv +1 -0
- app/SSML_Customization/Phoneme_Pronunciation.csv +1 -0
- app/SSML_Customization/READ THIS.txt +46 -0
- app/SSML_Customization/aliases.csv +1 -0
- app/SSML_Customization/dont_translate_phrases.txt +3 -0
- app/SSML_Customization/interpret-as.csv +1 -0
- app/SSML_Customization/url_list.txt +4 -0
- app/__init__.py +1 -0
- app/captioning/__init__.py +1 -0
- app/captioning/caption_helper.py +156 -0
- app/captioning/captioning.py +370 -0
- app/captioning/helper.py +83 -0
- app/captioning/user_config_helper.py +133 -0
- app/constants.py +29 -0
- app/functions/__init__.py +0 -0
- app/functions/helper.py +25 -0
- app/functions/model.py +18 -0
- app/functions/s3_handler.py +25 -0
- app/functions/video_url_handler.py +60 -0
- app/functions/youtube_summarizer.py +27 -0
- app/main.py +24 -0
- app/scripts/TTS.py +408 -0
- app/scripts/__init__.py +55 -0
- app/scripts/audio.py +62 -0
- app/scripts/audio_builder.py +181 -0
- app/scripts/azure_batch.py +78 -0
- app/scripts/azure_translate.py +28 -0
- app/scripts/srt.py +86 -0
- app/scripts/translate.py +402 -0
- app/scripts/utils.py +56 -0
- requirements.txt +43 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,29 @@
.idea
.ipynb_checkpoints
.mypy_cache
.vscode
__pycache__
.pytest_cache
htmlcov
dist
site
.coverage
coverage.xml
.netlify
test.db
log.txt
Pipfile.lock
env3.*
env
docs_build
site_build
venv
docs.zip
archive.zip
openssl-1.1.1u
logs
run.sh
# vim temporary files
*~
.*.sw?
.cache
Dockerfile
ADDED
@@ -0,0 +1,41 @@
FROM python:latest

RUN mkdir -p /code
RUN chmod 777 /code

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN apt-get update && apt-get upgrade -y
RUN apt-get install ffmpeg -y
RUN apt-get install git -y

RUN apt-get install -y \
    build-essential \
    libssl-dev \
    ca-certificates \
    libasound2 \
    wget

# Download OpenSSL source, compile, and install it
RUN wget -O - https://www.openssl.org/source/openssl-1.1.1u.tar.gz | tar zxf -
WORKDIR openssl-1.1.1u
RUN ./config --prefix=/usr/local
RUN make -j $(nproc)
RUN make install_sw install_ssldirs
RUN ldconfig -v

# Set environment variables
ENV SSL_CERT_DIR=/etc/ssl/certs
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH

WORKDIR /code

RUN pip install --upgrade pip

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY ./app /code/app

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
@@ -0,0 +1,10 @@
---
title: Vidverse
emoji: 🚀
colorFrom: green
colorTo: yellow
sdk: docker
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/SSML_Customization/Examples.xlsx
ADDED
Binary file (11.6 kB)
app/SSML_Customization/Manual_Translations.csv
ADDED
@@ -0,0 +1 @@
Original Text,Translated Text,Language Code
app/SSML_Customization/Phoneme_Pronunciation.csv
ADDED
@@ -0,0 +1 @@
Text,Phonetic Pronunciation,Case Sensitive (True/False),Phonetic Alphabet
app/SSML_Customization/READ THIS.txt
ADDED
@@ -0,0 +1,46 @@
This folder contains the following pronunciation customization files by default.

• dont_translate_phrases.txt
    - You can add a list of phrases or words you do not want to be translated.
    - This will work for both Google Translate and DeepL

• interpret-as.csv (Azure Only)
    - You can use SSML parameters to customize how specific words or phrases are pronounced
    - See this article for documentation: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#say-as-element
    - Note: The script will match the phrases in the TRANSLATED text. You may therefore wish to also add these phrases to 'dont_translate_phrases.txt'.
    - The first row contains the titles of each column - Do not change anything in the first row!
    - Descriptions of each column:
        • Text: The word or phrase that will be pronounced how you specify, if it is found in the text to be spoken
        • interpret-as Type: The way in which the word/phrase will be pronounced. See documentation link above. (Some examples include: characters, cardinal, ordinal)
        • Case Sensitive (True/False): Whether to only modify the pronunciation if the word/phrase matches exactly, being case sensitive
        • Format (Optional): Only applicable to some types, such as 'date', 'time', and others. Otherwise leave blank. See documentation link above for details
    - See 'Example - interpret-as.csv' for an example of how to use this file
    - This will only apply if using Azure TTS, not Google

• aliases.csv (Azure Only)
    - Lets you effectively change what should be spoken instead of a certain word or phrase
    - Example: If the text to be spoken contains "BTW" you can have it say "by the way"
    - Note: It does NOT actually replace the text, but only changes how the voice will pronounce it
    - The first row contains the titles of each column - Do not change anything in the first row!
    - Description of each column:
        - Original Text: The original word or phrase to match
        - Alias: The word or phrase to speak instead of the original text
        - Case Sensitive (True/False): Whether it must be an exact match including capital/lower case. If nothing is entered, will default to False
    - This will only apply if using Azure TTS, not Google

• Manual_Translations.csv
    - If you know you are going to use a word that gets incorrectly interpreted or translated, you can enter manual translations for any words for any languages
    - In Manual_Translations.csv, put the original text in the first column, your translation in the second, and the 2-letter language code for that entry into the 3rd column


• url_list.txt
    - If you have any URLs in the original text, you can put them as a list in this file
    - This makes it so the URL will not be translated, and also improves the pronunciation in the TTS stage
    - It will really only work on basic URLs, such as "example.com/test". If it has anything other than slashes, periods, and colons, it won't work
    - See the notes at the top of the url_list.txt file for more details

• Phoneme_Pronunciation.csv
    - Allows you to specify exact phonetic pronunciation of words or phrases
    - Note: This is different from 'aliases'. Using this requires using special phonetic alphabets (see links below)
    - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-ssml-phonetic-sets
    - See: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup-pronunciation#phoneme-element
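
For illustration only — a minimal sketch, not part of the committed files — the CSV formats described above could be filled in like this, assuming the scripts run from the repository root. The data rows are invented examples; only the column order matches the committed header rows.

import csv

# Hypothetical example row for interpret-as.csv: speak "2024" as digits, not as a year.
with open("app/SSML_Customization/interpret-as.csv", "a", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["2024", "characters", "False", ""])

# Hypothetical example row for aliases.csv: pronounce "BTW" as "by the way"
# without changing the subtitle text itself.
with open("app/SSML_Customization/aliases.csv", "a", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["BTW", "by the way", "False"])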
app/SSML_Customization/aliases.csv
ADDED
@@ -0,0 +1 @@
Original Text,Alias,Case Sensitive (True/False)
app/SSML_Customization/dont_translate_phrases.txt
ADDED
@@ -0,0 +1,3 @@
# Add one word or phrase per line that you do not want to be translated. The original word will be left as-is in the translated srt files.
# Don't include punctuation. This list will NOT be case sensitive
# Lines beginning with a # will be ignored
app/SSML_Customization/interpret-as.csv
ADDED
@@ -0,0 +1 @@
Text,interpret-as Type,Case Sensitive (True/False),Format (Optional)
app/SSML_Customization/url_list.txt
ADDED
@@ -0,0 +1,4 @@
# List any URLs that may appear in the original text, such as "google.com/example"
# This ensures they will not be translated, and will be spoken as words in the TTS stage
# Example: "google.com/example" becomes "google dot com slash example", which spoken in Spanish would be "google punto c o m diagonal example"
# The actual text in the subtitles will remain as "google.com/example", and only the spoken audio will change
app/__init__.py
ADDED
@@ -0,0 +1 @@
from . import scripts
app/captioning/__init__.py
ADDED
@@ -0,0 +1 @@
from .captioning import generate_sub
app/captioning/caption_helper.py
ADDED
@@ -0,0 +1,156 @@
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#

from datetime import date, datetime, time, timedelta
from typing import List, Optional, Tuple
import azure.cognitiveservices.speech as speechsdk # type: ignore
from . import helper

class Caption(object) :
    def __init__(self, language : Optional[str], sequence : int, begin : time, end : time, text : str) :
        self.language = language
        self.sequence = sequence
        self.begin = begin
        self.end = end
        self.text = text

def get_captions(language : Optional[str], max_width : int, max_height : int, results : List[dict]) -> List[Caption] :
    caption_helper = CaptionHelper(language, max_width, max_height, results)
    return caption_helper.get_captions()

class CaptionHelper(object) :
    def __init__(self, language : Optional[str], max_width : int, max_height : int, results : List[speechsdk.RecognitionResult]) :
        self._language = language
        self._max_width = max_width
        self._max_height = max_height
        self._results = results

        self._first_pass_terminators = ["?", "!", ",", ";"]
        self._second_pass_terminators = [" ", "."]

        self._captions : List[Caption] = []

        # consider adapting to use http://unicode.org/reports/tr29/#Sentence_Boundaries
        if self._language is not None :
            iso639 = self._language.split('-')[0]
            if "zh" == iso639.lower() :
                self._first_pass_terminators = [",", "、", ";", "?", "!", "?", "!", ",", ";"]
                self._second_pass_terminators = ["。", " "]
                if (helper.DEFAULT_MAX_LINE_LENGTH_SBCS == self._max_width) :
                    self._max_width = helper.DEFAULT_MAX_LINE_LENGTH_MBCS

    def get_captions(self) -> List[Caption] :
        self.ensure_captions()
        return self._captions

    def ensure_captions(self) -> None :
        if not self._captions :
            self.add_captions_for_all_results()

    def add_captions_for_all_results(self) -> None :
        for result in self._results :
            if result.offset <= 0 or not self.is_final_result(result) :
                continue
            text = self.get_text_or_translation(result)
            if not text :
                continue
            self.add_captions_for_final_result(result, text)

    def get_text_or_translation(self, result : speechsdk.RecognitionResult) -> Optional[str] :
        return result.text

        # 20220921 We do not use this for now because this sample
        # does not handle TranslationRecognitionResults.
        #if not self._language :
        #    return result.text
        #if type(result) is speechsdk.TranslationRecognitionResult and self._language in result.Translations :
        #    return result.Translations[self._language]
        #else :
        #    return None

    def add_captions_for_final_result(self, result : speechsdk.RecognitionResult, text : str) -> None :
        caption_starts_at = 0
        caption_lines : List[str] = []
        index = 0
        while (index < len(text)) :
            index = self.skip_skippable(text, index)

            line_length = self.get_best_width(text, index)
            caption_lines.append(text[index:index + line_length].strip())
            index += line_length

            is_last_caption = index >= len(text)
            max_caption_lines = len(caption_lines) >= self._max_height

            add_caption = is_last_caption or max_caption_lines

            if add_caption :
                caption_text = '\n'.join(caption_lines)
                caption_lines.clear()

                caption_sequence = len(self._captions) + 1
                is_first_caption = 0 == caption_starts_at

                caption_begin_and_end : Tuple[time, time]
                if is_first_caption and is_last_caption :
                    caption_begin_and_end = self.get_full_caption_result_timing(result)
                else :
                    caption_begin_and_end = self.get_partial_result_caption_timing(result, text, caption_text, caption_starts_at, index - caption_starts_at)

                self._captions.append(Caption(self._language, caption_sequence, caption_begin_and_end[0], caption_begin_and_end[1], caption_text))

                caption_starts_at = index

    def get_best_width(self, text : str, start_index : int) -> int :
        remaining = len(text) - start_index
        best_width = remaining if remaining < self._max_width else self.find_best_width(self._first_pass_terminators, text, start_index)
        if (best_width < 0) :
            best_width = self.find_best_width(self._second_pass_terminators, text, start_index)
        if best_width < 0 :
            best_width = self._max_width
        return best_width

    def find_best_width(self, terminators : List[str], text : str, start_at : int) -> int :
        remaining = len(text) - start_at
        check_chars = min(remaining, self._max_width)
        best_width = -1
        for terminator in terminators :
            index = text.rfind(terminator, start_at, start_at + check_chars)
            width = index - start_at
            if width > best_width :
                best_width = width + len(terminator)
        return best_width

    def skip_skippable(self, text : str, start_index : int) -> int :
        index = start_index
        while len(text) > index and ' ' == text[index] :
            index += 1
        return index

    def get_full_caption_result_timing(self, result : speechsdk.RecognitionResult) -> Tuple[time, time] :
        begin = helper.time_from_ticks(result.offset)
        end = helper.time_from_ticks(result.offset + result.duration)
        return (begin, end)

    def get_partial_result_caption_timing(self, result : speechsdk.RecognitionResult, text : str, caption_text : str, caption_starts_at : int, caption_length : int) -> Tuple[time, time] :
        (result_begin, result_end) = self.get_full_caption_result_timing(result)
        result_duration = helper.subtract_times(result_end, result_begin)
        text_length = len(text)
        partial_begin = helper.add_time_and_timedelta(result_begin, result_duration * caption_starts_at / text_length)
        partial_end = helper.add_time_and_timedelta(result_begin, result_duration * (caption_starts_at + caption_length) / text_length)
        return (partial_begin, partial_end)

    def is_final_result(self, result : speechsdk.RecognitionResult) -> bool :
        return speechsdk.ResultReason.RecognizedSpeech == result.reason or speechsdk.ResultReason.RecognizedIntent == result.reason or speechsdk.ResultReason.TranslatedSpeech == result.reason

    def lines_from_text(self, text : str) -> List[str] :
        retval : List[str] = []
        index = 0
        while (index < len(text)) :
            index = self.skip_skippable(text, index)
            line_length = self.get_best_width(text, index)
            retval.append(text[index:index + line_length].strip())
            index += line_length
        return retval
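
A minimal usage sketch of the line-breaking logic above. It assumes the azure-cognitiveservices-speech package is installed (the module imports it) and that the repository root is on PYTHONPATH; no Azure credentials are needed because only the text splitter is exercised.

from app.captioning.caption_helper import CaptionHelper

# No recognition results are needed just to exercise the line splitter.
ch = CaptionHelper("en-US", 37, 2, [])

text = "This is a fairly long recognized sentence, and it should be split on punctuation first, then on spaces."
for line in ch.lines_from_text(text):
    # Each line is at most 37 characters, preferring ? ! , ; breaks, then spaces and periods.
    print(line)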
app/captioning/captioning.py
ADDED
@@ -0,0 +1,370 @@
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#

# Notes:
# - Install the Speech SDK. Run:
# pip install azure-cognitiveservices-speech
# - The Python Speech SDK on Windows requires the Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017, 2019, or 2022 on the system. See:
# https://docs.microsoft.com/azure/cognitive-services/speech-service/quickstarts/setup-platform
# - Install gstreamer:
# https://docs.microsoft.com/azure/cognitive-services/speech-service/how-to-use-codec-compressed-audio-input-streams

from datetime import datetime, time, timezone, timedelta
from itertools import groupby, pairwise
from os import linesep, remove, environ
from os.path import exists
from pathlib import Path
from sys import argv
from time import sleep
from typing import Any, List, Optional
import wave
import azure.cognitiveservices.speech as speechsdk # type: ignore
from . import caption_helper
from . import helper
from . import user_config_helper

USAGE = """Usage: python captioning.py [...]

  HELP
    --help                        Show this help and stop.

  CONNECTION
    --key KEY                     Your Azure Speech service resource key.
                                  Overrides the SPEECH_KEY environment variable. You must set the environment variable (recommended) or use the `--key` option.
    --region REGION               Your Azure Speech service region.
                                  Overrides the SPEECH_REGION environment variable. You must set the environment variable (recommended) or use the `--region` option.
                                  Examples: westus, eastus

  LANGUAGE
    --language LANG1              Specify language. This is used when breaking captions into lines.
                                  Default value is en-US.
                                  Examples: en-US, ja-JP

  INPUT
    --input FILE                  Input audio from file (default input is the microphone.)
    --format FORMAT               Use compressed audio format.
                                  If this is not present, uncompressed format (wav) is assumed.
                                  Valid only with --file.
                                  Valid values: alaw, any, flac, mp3, mulaw, ogg_opus

  MODE
    --offline                     Output offline results.
                                  Overrides --realTime.
    --realTime                    Output real-time results.
                                  Default output mode is offline.

  ACCURACY
    --phrases ""PHRASE1;PHRASE2""  Example: ""Constoso;Jessie;Rehaan""

  OUTPUT
    --output FILE                 Output captions to FILE.
    --srt                         Output captions in SubRip Text format (default format is WebVTT.)
    --maxLineLength LENGTH        Set the maximum number of characters per line for a caption to LENGTH.
                                  Minimum is 20. Default is 37 (30 for Chinese).
    --lines LINES                 Set the number of lines for a caption to LINES.
                                  Minimum is 1. Default is 2.
    --delay MILLISECONDS          How many MILLISECONDS to delay the appearance of each caption.
                                  Minimum is 0. Default is 1000.
    --remainTime MILLISECONDS     How many MILLISECONDS a caption should remain on screen if it is not replaced by another.
                                  Minimum is 0. Default is 1000.
    --quiet                       Suppress console output, except errors.
    --profanity OPTION            Valid values: raw, remove, mask
                                  Default is mask.
    --threshold NUMBER            Set stable partial result threshold.
                                  Default is 3.
"""

class Captioning(object) :
    def __init__(self, language, input_audio, output) :
        # self._user_config = user_config_helper.user_config_from_args(USAGE)
        self._user_config = {
            "language": language,
            "captioning_mode": user_config_helper.CaptioningMode.OFFLINE, # or OFFLINE if you prefer offline mode
            "input_file": input_audio,
            "output_file": output,
            "use_sub_rip_text_caption_format": True,
            "use_compressed_audio": False,
            "compressed_audio_format": speechsdk.AudioStreamContainerFormat.ANY,
            "subscription_key" : environ.get("SPEECH_KEY"),
            "region" : environ.get("SPEECH_REGION"),
            "profanity_option" : speechsdk.ProfanityOption.Masked,
            "phrases" : "Constoso;Jessie;Rehaan",
            "suppress_console_output" : True,
            "remain_time" : timedelta(milliseconds=1000),
            "delay" : timedelta(milliseconds=1000),
            "max_line_length" : helper.DEFAULT_MAX_LINE_LENGTH_SBCS,
            "lines" : 2,
            "stable_partial_result_threshold" : "3",
        }
        self._srt_sequence_number = 1
        self._previous_caption : Optional[caption_helper.Caption] = None
        self._previous_end_time : Optional[time] = None
        self._previous_result_is_recognized = False
        self._recognized_lines : List[str] = []
        self._offline_results : List[speechsdk.SpeechRecognitionResult] = []

    def get_timestamp(self, start : time, end : time) -> str :
        time_format = ""
        if self._user_config["use_sub_rip_text_caption_format"] :
            # SRT format requires ',' as decimal separator rather than '.'.
            time_format = "%H:%M:%S,%f"
        else :
            time_format = "%H:%M:%S.%f"
        # Truncate microseconds to milliseconds.
        return "{} --> {}".format(start.strftime(time_format)[:-3], end.strftime(time_format)[:-3])

    def string_from_caption(self, caption : caption_helper.Caption) -> str :
        retval = ""
        if self._user_config["use_sub_rip_text_caption_format"] :
            retval += str(caption.sequence) + linesep
        retval += self.get_timestamp(caption.begin, caption.end) + linesep
        retval += caption.text + linesep + linesep
        return retval

    def adjust_real_time_caption_text(self, text : str, is_recognized_result : bool) -> str :
        # Split the caption text into multiple lines based on max_line_length and lines.
        temp_caption_helper = caption_helper.CaptionHelper(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], [])
        lines = temp_caption_helper.lines_from_text(text)

        # Recognizing results can change with each new result, so we do not save previous Recognizing results.
        # Recognized results are final, so we save them in a member value.
        recognizing_lines : List[str] = []
        if is_recognized_result :
            self._recognized_lines = self._recognized_lines + lines
        else :
            recognizing_lines = lines

        caption_lines = self._recognized_lines + recognizing_lines
        return '\n'.join(caption_lines[-self._user_config["lines"]:])

    def caption_from_real_time_result(self, result : speechsdk.SpeechRecognitionResult, is_recognized_result : bool) -> Optional[str] :
        retval : Optional[str] = None

        start_time = helper.time_from_ticks(result.offset)
        end_time = helper.time_from_ticks(result.offset + result.duration)

        # If the end timestamp for the previous result is later
        # than the end timestamp for this result, drop the result.
        # This sometimes happens when we receive a lot of Recognizing results close together.
        if self._previous_end_time is not None and self._previous_end_time > end_time :
            pass
        else :
            # Record the end timestamp for this result.
            self._previous_end_time = end_time

            # Convert the SpeechRecognitionResult to a caption.
            # We are not ready to set the text for this caption.
            # First we need to determine whether to clear _recognizedLines.
            caption = caption_helper.Caption(self._user_config["language"], self._srt_sequence_number, helper.add_time_and_timedelta(start_time, self._user_config["delay"]), helper.add_time_and_timedelta(end_time, self._user_config["delay"]), "")
            # Increment the sequence number.
            self._srt_sequence_number += 1

            # If we have a previous caption...
            if self._previous_caption is not None :
                # If the previous result was type Recognized...
                if self._previous_result_is_recognized :
                    # Set the end timestamp for the previous caption to the earliest of:
                    # - The end timestamp for the previous caption plus the remain time.
                    # - The start timestamp for the current caption.
                    previous_end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
                    self._previous_caption.end = previous_end if previous_end < caption.begin else caption.begin
                    # If the gap between the original end timestamp for the previous caption
                    # and the start timestamp for the current caption is larger than remainTime,
                    # clear the cached recognized lines.
                    # Note this needs to be done before we call AdjustRealTimeCaptionText
                    # for the current caption, because it uses _recognizedLines.
                    if previous_end < caption.begin :
                        self._recognized_lines.clear()
                # If the previous result was type Recognizing, simply set the start timestamp
                # for the current caption to the end timestamp for the previous caption.
                # Note this presumes there will not be a large gap between Recognizing results,
                # because such a gap would cause the previous Recognizing result to be succeeded
                # by a Recognized result.
                else :
                    caption.begin = self._previous_caption.end

                retval = self.string_from_caption(self._previous_caption)

            # Break the caption text into lines if needed.
            caption.text = self.adjust_real_time_caption_text(result.text, is_recognized_result)
            # Save the current caption as the previous caption.
            self._previous_caption = caption
            # Save the result type as the previous result type.
            self._previous_result_is_recognized = is_recognized_result

        return retval

    def captions_from_offline_results(self) -> List[caption_helper.Caption] :
        captions = caption_helper.get_captions(self._user_config["language"], self._user_config["max_line_length"], self._user_config["lines"], list(self._offline_results))
        # Save the last caption.
        last_caption = captions[-1]
        last_caption.end = helper.add_time_and_timedelta(last_caption.end, self._user_config["remain_time"])
        # In offline mode, all captions come from RecognitionResults of type Recognized.
        # Set the end timestamp for each caption to the earliest of:
        # - The end timestamp for this caption plus the remain time.
        # - The start timestamp for the next caption.
        captions_2 : List[caption_helper.Caption] = []
        for (caption_1, caption_2) in pairwise(captions) :
            end = helper.add_time_and_timedelta(caption_1.end, self._user_config["remain_time"])
            caption_1.end = end if end < caption_2.begin else caption_2.begin
            captions_2.append(caption_1)
        # Re-add the last caption.
        captions_2.append(last_caption)
        return captions_2

    def finish(self) -> None :
        if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
            for caption in self.captions_from_offline_results() :
                helper.write_to_console_or_file(text=self.string_from_caption(caption), user_config=self._user_config)
        elif user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
            # Show the last "previous" caption, which is actually the last caption.
            if self._previous_caption is not None :
                self._previous_caption.end = helper.add_time_and_timedelta(self._previous_caption.end, self._user_config["remain_time"])
                helper.write_to_console_or_file(text=self.string_from_caption(self._previous_caption), user_config=self._user_config)

    def initialize(self) :
        if self._user_config["output_file"] is not None and exists(self._user_config["output_file"]) :
            remove(self._user_config["output_file"])
        if not self._user_config["use_sub_rip_text_caption_format"] :
            helper.write_to_console_or_file(text="WEBVTT{}{}".format(linesep, linesep), user_config=self._user_config)
        return

    def audio_config_from_user_config(self) -> helper.Read_Only_Dict :
        if self._user_config["input_file"] is None :
            return helper.Read_Only_Dict({
                "audio_config" : speechsdk.AudioConfig(use_default_microphone=True),
                "audio_stream_format" : None,
                "pull_input_audio_stream_callback" : None,
                "pull_input_audio_stream" : None
            });
        else :
            audio_stream_format = None
            if not self._user_config["use_compressed_audio"] :
                reader = wave.open(self._user_config["input_file"], mode=None)
                audio_stream_format = speechsdk.audio.AudioStreamFormat(samples_per_second=reader.getframerate(), bits_per_sample=reader.getsampwidth() * 8, channels=reader.getnchannels())
                reader.close()
            else :
                audio_stream_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=self._user_config["compressed_audio_format"])
            callback = helper.BinaryFileReaderCallback(filename=self._user_config["input_file"])
            stream = speechsdk.audio.PullAudioInputStream(pull_stream_callback=callback, stream_format=audio_stream_format)
            # We return the BinaryFileReaderCallback, AudioStreamFormat, and PullAudioInputStream
            # because we need to keep them in scope until they are actually used.
            return helper.Read_Only_Dict({
                "audio_config" : speechsdk.audio.AudioConfig(stream=stream),
                "audio_stream_format" : audio_stream_format,
                "pull_input_audio_stream_callback" : callback,
                "pull_input_audio_stream" : stream,
            })

    def speech_config_from_user_config(self) -> speechsdk.SpeechConfig :
        speech_config = None
        speech_config = speechsdk.SpeechConfig(subscription=self._user_config["subscription_key"], region=self._user_config["region"])

        speech_config.set_profanity(self._user_config["profanity_option"])

        if self._user_config["stable_partial_result_threshold"] is not None :
            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_StablePartialResultThreshold, value=self._user_config["stable_partial_result_threshold"])

        speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_PostProcessingOption, value="TrueText")
        speech_config.speech_recognition_language=self._user_config["language"]

        return speech_config

    def speech_recognizer_from_user_config(self) -> helper.Read_Only_Dict :
        audio_config_data = self.audio_config_from_user_config()
        speech_config = self.speech_config_from_user_config()
        speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config_data["audio_config"])

        if len(self._user_config["phrases"]) > 0 :
            grammar = speechsdk.PhraseListGrammar.from_recognizer(recognizer=speech_recognizer)
            for phrase in self._user_config["phrases"] :
                grammar.addPhrase(phrase)

        return helper.Read_Only_Dict({
            "speech_recognizer" : speech_recognizer,
            "audio_stream_format" : audio_config_data["audio_stream_format"],
            "pull_input_audio_stream_callback" : audio_config_data["pull_input_audio_stream_callback"],
            "pull_input_audio_stream" : audio_config_data["pull_input_audio_stream"],
        })

    def recognize_continuous(self, speech_recognizer : speechsdk.SpeechRecognizer, format : speechsdk.audio.AudioStreamFormat, callback : helper.BinaryFileReaderCallback, stream : speechsdk.audio.PullAudioInputStream) :
        done = False
        def recognizing_handler(e : speechsdk.SpeechRecognitionEventArgs) :
            if speechsdk.ResultReason.RecognizingSpeech == e.result.reason and len(e.result.text) > 0 :
                # This seems to be the only way we can get information about
                # exceptions raised inside an event handler.
                try :
                    caption = self.caption_from_real_time_result(e.result, False)
                    if caption is not None :
                        helper.write_to_console_or_file(text=caption, user_config=self._user_config)
                except Exception as ex :
                    print('Exception in recognizing_handler: {}'.format(ex))
            elif speechsdk.ResultReason.NoMatch == e.result.reason :
                helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)

        def recognized_handler(e : speechsdk.SpeechRecognitionEventArgs) :
            if speechsdk.ResultReason.RecognizedSpeech == e.result.reason and len(e.result.text) > 0 :
                try :
                    if user_config_helper.CaptioningMode.OFFLINE == self._user_config["captioning_mode"] :
                        self._offline_results.append(e.result)
                    else :
                        caption = self.caption_from_real_time_result(e.result, True)
                        if caption is not None :
                            helper.write_to_console_or_file(text=caption, user_config=self._user_config)
                except Exception as ex :
                    print('Exception in recognized_handler: {}'.format(ex))
            elif speechsdk.ResultReason.NoMatch == e.result.reason :
                helper.write_to_console(text="NOMATCH: Speech could not be recognized.{}".format(linesep), user_config=self._user_config)

        def canceled_handler(e : speechsdk.SpeechRecognitionCanceledEventArgs) :
            nonlocal done
            # Notes:
            # SpeechRecognitionCanceledEventArgs inherits the result property from SpeechRecognitionEventArgs. See:
            # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioncanceledeventargs
            # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitioneventargs
            # result is type SpeechRecognitionResult, which inherits the reason property from RecognitionResult. See:
            # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechrecognitionresult
            # https://docs.microsoft.com/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult
            # e.result.reason is ResultReason.Canceled. To get the cancellation reason, see e.cancellation_details.reason.
            if speechsdk.CancellationReason.EndOfStream == e.cancellation_details.reason :
                helper.write_to_console(text="End of stream reached.{}".format(linesep), user_config=self._user_config)
                done = True
            elif speechsdk.CancellationReason.CancelledByUser == e.cancellation_details.reason :
                helper.write_to_console(text="User canceled request.{}".format(linesep), user_config=self._user_config)
                done = True
            elif speechsdk.CancellationReason.Error == e.cancellation_details.reason :
                # Error output should not be suppressed, even if suppress output flag is set.
                print("Encountered error. Cancellation details: {}{}".format(e.cancellation_details, linesep))
                done = True
            else :
                print("Request was cancelled for an unrecognized reason. Cancellation details: {}{}".format(e.cancellation_details, linesep))
                done = True

        def stopped_handler(e : speechsdk.SessionEventArgs) :
            nonlocal done
            helper.write_to_console(text="Session stopped.{}".format(linesep), user_config=self._user_config)
            done = True

        # We only use Recognizing results in real-time mode.
        if user_config_helper.CaptioningMode.REALTIME == self._user_config["captioning_mode"] :
            speech_recognizer.recognizing.connect(recognizing_handler)
        speech_recognizer.recognized.connect(recognized_handler)
        speech_recognizer.session_stopped.connect(stopped_handler)
        speech_recognizer.canceled.connect(canceled_handler)

        speech_recognizer.start_continuous_recognition()

        while not done :
            sleep(5)
        speech_recognizer.stop_continuous_recognition()

        return

def generate_sub(language, input_file, output_file) :
    captioning = Captioning(language=language, input_audio=input_file, output=output_file)
    captioning.initialize()
    speech_recognizer_data = captioning.speech_recognizer_from_user_config()
    captioning.recognize_continuous(speech_recognizer=speech_recognizer_data["speech_recognizer"], format=speech_recognizer_data["audio_stream_format"], callback=speech_recognizer_data["pull_input_audio_stream_callback"], stream=speech_recognizer_data["pull_input_audio_stream"])
    captioning.finish()
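
A usage sketch for the exported generate_sub entry point, assuming SPEECH_KEY and SPEECH_REGION are set and the input is an uncompressed WAV file; with the hard-coded config above, the output is written as SubRip (SRT) in offline mode. The file names are placeholders.

import os

from app.captioning import generate_sub

# Azure Speech credentials are read from the environment by Captioning.__init__.
assert os.environ.get("SPEECH_KEY") and os.environ.get("SPEECH_REGION")

# "speech.wav" is a placeholder input path; captions are appended to speech.srt.
generate_sub(language="en-US", input_file="speech.wav", output_file="speech.srt")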
app/captioning/helper.py
ADDED
@@ -0,0 +1,83 @@
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#

# Note: abc = abstract base classes
from collections.abc import Mapping
from datetime import date, datetime, time, timedelta
from sys import argv
from typing import Optional
from pathlib import Path
import azure.cognitiveservices.speech as speechsdk # type: ignore

DEFAULT_MAX_LINE_LENGTH_SBCS = 37
DEFAULT_MAX_LINE_LENGTH_MBCS = 30

# See speech_recognize_once_compressed_input() in:
# https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/python/console/speech_sample.py
class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
    def __init__(self, filename: str):
        super().__init__()
        self._file_h = open(filename, "rb")

    def read(self, buffer: memoryview) -> int:
        try:
            size = buffer.nbytes
            frames = self._file_h.read(size)
            buffer[:len(frames)] = frames
            return len(frames)
        except Exception as ex:
            print('Exception in `read`: {}'.format(ex))
            raise

    def close(self) -> None:
        print('closing file')
        try:
            self._file_h.close()
        except Exception as ex:
            print('Exception in `close`: {}'.format(ex))
            raise

class Read_Only_Dict(Mapping):
    def __init__(self, data):
        self._data = data
    def __getitem__(self, key):
        return self._data[key]
    def __len__(self):
        return len(self._data)
    def __iter__(self):
        return iter(self._data)

# See:
# https://stackoverflow.com/a/12448721
# https://stackoverflow.com/a/39651061
def add_time_and_timedelta(t1 : time, t2 : timedelta) -> time :
    return (datetime.combine(date.min, t1) + t2).time()

def subtract_times(t1 : time, t2 : time) -> timedelta :
    return datetime.combine(date.min, t1) - datetime.combine(date.min, t2)

# We cannot simply create time with ticks.
def time_from_ticks(ticks) -> time :
    microseconds_1 = ticks / 10
    microseconds_2 = microseconds_1 % 1000000
    seconds_1 = microseconds_1 / 1000000
    seconds_2 = seconds_1 % 60
    minutes_1 = seconds_1 / 60
    minutes_2 = minutes_1 % 60
    hours = minutes_1 / 60
    return time(int(hours), int(minutes_2), int(seconds_2), int(microseconds_2))

def write_to_console(text : str, user_config : Read_Only_Dict) :
    if not user_config["suppress_console_output"] :
        print(text, end = "", flush = True)
    return

def write_to_console_or_file(text : str, user_config : Read_Only_Dict) :
    write_to_console(text = text, user_config = user_config)
    if user_config["output_file"] is not None :
        file_path = Path(user_config["output_file"])
        with open(file_path, mode = "a", newline = "", encoding='utf-8') as f :
            f.write(text)
    return
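
A worked example of time_from_ticks above, assuming the Speech SDK convention that offsets and durations are expressed in 100-nanosecond ticks (which is why the function divides by 10 to get microseconds). Importing the module requires azure-cognitiveservices-speech to be installed.

from app.captioning.helper import time_from_ticks

# 10,000,000 ticks * 100 ns = 1 second.
print(time_from_ticks(10_000_000))      # 00:00:01

# 90 minutes and 1.5 seconds: (90 * 60 + 1.5) * 10,000,000 ticks.
print(time_from_ticks(54_015_000_000))  # 01:30:01.500000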
app/captioning/user_config_helper.py
ADDED
@@ -0,0 +1,133 @@
#
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#

from datetime import timedelta
from enum import Enum
from os import linesep, environ
from sys import argv
from typing import List, Optional
import azure.cognitiveservices.speech as speechsdk # type: ignore
from . import helper

class CaptioningMode(Enum):
    OFFLINE = 1
    REALTIME = 2

def get_cmd_option(option : str) -> Optional[str] :
    argc = len(argv)
    if option.lower() in list(map(lambda arg: arg.lower(), argv)) :
        index = argv.index(option)
        if index < argc - 1 :
            # We found the option (for example, "--output"), so advance from that to the value (for example, "filename").
            return argv[index + 1]
        else :
            return None
    else :
        return None

def cmd_option_exists(option : str) -> bool :
    return option.lower() in list(map(lambda arg : arg.lower(), argv))

def get_language() -> str :
    retval = "en-US"
    language = get_cmd_option("--language")
    if language is not None :
        retval = language
    return retval

def get_phrases() -> List[str] :
    retval : List[str] = []
    phrases = get_cmd_option("--phrases")
    if phrases is not None :
        retval = list(map(lambda phrase : phrase.strip(), phrases.split(';')))
    return retval

def get_compressed_audio_format() -> speechsdk.AudioStreamContainerFormat :
    value = get_cmd_option("--format")
    if value is None :
        return speechsdk.AudioStreamContainerFormat.ANY
    else :
        value = value.lower()
        if "alaw" == value : return speechsdk.AudioStreamContainerFormat.ALAW
        elif "flac" == value : return speechsdk.AudioStreamContainerFormat.FLAC
        elif "mp3" == value : return speechsdk.AudioStreamContainerFormat.MP3
        elif "mulaw" == value : return speechsdk.AudioStreamContainerFormat.MULAW
        elif "ogg_opus" == value : return speechsdk.AudioStreamContainerFormat.OGG_OPUS
        else : return speechsdk.AudioStreamContainerFormat.ANY;

def get_profanity_option() -> speechsdk.ProfanityOption :
    value = get_cmd_option("--profanity")
    if value is None :
        return speechsdk.ProfanityOption.Masked
    else :
        value = value.lower()
        if "raw" == value: return speechsdk.ProfanityOption.Raw
        elif "remove" == value : return speechsdk.ProfanityOption.Removed
        else : return speechsdk.ProfanityOption.Masked

def user_config_from_args(usage : str) -> helper.Read_Only_Dict :
    keyEnv = environ["SPEECH_KEY"] if "SPEECH_KEY" in environ else None
    keyOption = get_cmd_option("--key")
    key = keyOption if keyOption is not None else keyEnv
    if key is None :
        raise RuntimeError("Please set the SPEECH_KEY environment variable or provide a Speech resource key with the --key option.{}{}".format(linesep, usage))

    regionEnv = environ["SPEECH_REGION"] if "SPEECH_REGION" in environ else None
    regionOption = get_cmd_option("--region")
    region = regionOption if regionOption is not None else regionEnv
    if region is None :
        raise RuntimeError("Please set the SPEECH_REGION environment variable or provide a Speech resource region with the --region option.{}{}".format(linesep, usage))

    captioning_mode = CaptioningMode.REALTIME if cmd_option_exists("--realtime") and not cmd_option_exists("--offline") else CaptioningMode.OFFLINE

    td_remain_time = timedelta(milliseconds=1000)
    s_remain_time = get_cmd_option("--remainTime")
    if s_remain_time is not None :
        int_remain_time = float(s_remain_time)
        if int_remain_time < 0 :
            int_remain_time = 1000
        td_remain_time = timedelta(milliseconds=int_remain_time)

    td_delay = timedelta(milliseconds=1000)
    s_delay = get_cmd_option("--delay")
    if s_delay is not None :
        int_delay = float(s_delay)
        if int_delay < 0 :
            int_delay = 1000
        td_delay = timedelta(milliseconds=int_delay)

    int_max_line_length = helper.DEFAULT_MAX_LINE_LENGTH_SBCS
    s_max_line_length = get_cmd_option("--maxLineLength")
    if s_max_line_length is not None :
        int_max_line_length = int(s_max_line_length)
        if int_max_line_length < 20 :
            int_max_line_length = 20

    int_lines = 2
    s_lines = get_cmd_option("--lines")
    if s_lines is not None :
        int_lines = int(s_lines)
        if int_lines < 1 :
            int_lines = 2

    return helper.Read_Only_Dict({
        "use_compressed_audio" : cmd_option_exists("--format"),
        "compressed_audio_format" : get_compressed_audio_format(),
        "profanity_option" : get_profanity_option(),
        "language" : get_language(),
        "input_file" : get_cmd_option("--input"),
        "output_file" : get_cmd_option("--output"),
        "phrases" : get_phrases(),
        "suppress_console_output" : cmd_option_exists("--quiet"),
        "captioning_mode" : captioning_mode,
        "remain_time" : td_remain_time,
        "delay" : td_delay,
        "use_sub_rip_text_caption_format" : cmd_option_exists("--srt"),
        "max_line_length" : int_max_line_length,
        "lines" : int_lines,
        "stable_partial_result_threshold" : get_cmd_option("--threshold"),
        "subscription_key" : key,
        "region" : region,
    })
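
A small demonstration of the argv helpers above, for illustration only. The module binds argv at import time, so sys.argv is mutated in place here; the Space itself never calls user_config_from_args, since Captioning builds its config dict directly.

import sys

from app.captioning import user_config_helper

# Simulate a command line in place; real invocations would come from `python captioning.py ...`.
sys.argv[:] = ["captioning.py", "--language", "hi-IN", "--srt", "--maxLineLength", "45"]

print(user_config_helper.get_language())                     # hi-IN
print(user_config_helper.cmd_option_exists("--srt"))         # True
print(user_config_helper.get_cmd_option("--maxLineLength"))  # 45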
app/constants.py
ADDED
@@ -0,0 +1,29 @@
MALE_LANGUAGES = {
    "hi": ["hi-IN", "hi-IN-MadhurNeural"], # hindi
    "bn": ["bn-IN", "bn-IN-BashkarNeural"], # bengali
    "en": ["en-IN", "en-IN-PrabhatNeural"], # english
    "gu": ["gu-IN", "gu-IN-NiranjanNeural"], # gujarati
    "kn": ["kn-IN", "kn-IN-GaganNeural"], # kannada
    "ml": ["ml-IN", "ml-IN-MidhunNeural"], # malayalam
    "mr": ["mr-IN", "mr-IN-ManoharNeural"], # marathi
    "ta": ["ta-IN", "ta-IN-ValluvarNeural"], # tamil
    "te": ["te-IN", "te-IN-MohanNeural"], # telugu
    "ur": ["ur-IN", "ur-IN-SalmanNeural"], # urdu
    "de": ["de-DE", "de-DE-ConradNeural"], # german
    "ja": ["ja-JP", "ja-JP-KeitaNeural"], # japanese
}

FEMALE_LANGUAGES = {
    "hi": ["hi-IN", "hi-IN-SwaraNeural"], # hindi
    "bn": ["bn-IN", "bn-IN-TanishaaNeural"], # bengali
    "en": ["en-IN", "en-IN-NeerjaNeural"], # english
    "gu": ["gu-IN", "gu-IN-DhwaniNeural"], # gujarati
    "kn": ["kn-IN", "kn-IN-SapnaNeural"], # kannada
    "ml": ["ml-IN", "ml-IN-SobhanaNeural"], # malayalam
    "mr": ["mr-IN", "mr-IN-AarohiNeural"], # marathi
    "ta": ["ta-IN", "ta-IN-PallaviNeural"], # tamil
    "te": ["te-IN", "te-IN-ShrutiNeural"], # telugu
    "ur": ["ur-IN", "ur-IN-GulNeural"], # urdu
    "de": ["de-DE", "de-DE-AmalaNeural"], # german
    "ja": ["ja-JP", "ja-JP-NanamiNeural"], # japanese
}
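
A quick lookup sketch showing how these tables are meant to be read: each 2-letter code maps to an Azure locale plus a neural voice name, the same pattern used later in handler_video_url. This only assumes the module is importable as app.constants.

from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES

# Each entry maps a 2-letter code to [Azure locale, Azure neural voice name].
locale, voice = FEMALE_LANGUAGES["hi"]
print(locale, voice)  # hi-IN hi-IN-SwaraNeural

locale, voice = MALE_LANGUAGES["ja"]
print(locale, voice)  # ja-JP ja-JP-KeitaNeural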
app/functions/__init__.py
ADDED
File without changes
app/functions/helper.py
ADDED
@@ -0,0 +1,25 @@
from subprocess import run, DEVNULL
from app.captioning import generate_sub

def download_video(link, output):
    command = ["yt-dlp", "-f", "bv*[ext=mp4]", "-o", output, link]
    run(command, stdout=DEVNULL, stderr=DEVNULL)

def download_audio(link, output):
    command = ["yt-dlp", "-f", "ba*[ext=m4a]", "-o", output, link]
    run(command, stdout=DEVNULL, stderr=DEVNULL)

def m4a_to_wav(input_video, output):
    command = ["ffmpeg", "-i", input_video, output]
    run(command, stdout=DEVNULL, stderr=DEVNULL)
    print(f"m4a to wav converted, Input: {input_video}, Output: {output}")


def audio_to_srt(language, audio_file, output):
    generate_sub(language, audio_file, output)
    print("audio to srt converted")

def merge_video_audio(video_file, audio_file, output):
    command = ["ffmpeg", "-i", video_file, "-i", audio_file, "-c:v", "copy", "-c:a", "copy", output]
    run(command, stdout=DEVNULL, stderr=DEVNULL)
    print(f"video and audio merged, Input: {video_file}, {audio_file}, Output: {output}")
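
A sketch of how these helpers chain for one video, assuming yt-dlp and ffmpeg are on PATH and the Azure Speech environment variables are set. The URL and file names are placeholders; this mirrors the start of handler_video_url further below.

from app.functions.helper import download_audio, download_video, m4a_to_wav, audio_to_srt

url = "https://youtu.be/EXAMPLE"  # placeholder video URL for illustration

download_video(url, "video.mp4")      # best mp4 video stream via yt-dlp
download_audio(url, "audio.m4a")      # best m4a audio stream via yt-dlp
m4a_to_wav("audio.m4a", "audio.wav")  # ffmpeg conversion to WAV for the Speech SDK
audio_to_srt("en-IN", "audio.wav", "audio.srt")  # Azure speech-to-text into an SRT file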
app/functions/model.py
ADDED
@@ -0,0 +1,18 @@
from pydantic import BaseModel
from fastapi import UploadFile

class VideoURL(BaseModel):
    url: str
    from_lang: str = "en"
    to_lang: str = "hi"
    gender: str = "MALE"


class VideoFile(BaseModel):
    video: UploadFile
    from_lang: str = "en"
    to_lang: str = "hi"
    gender: str = "MALE"

class YoutubeURL(BaseModel):
    url: str
app/functions/s3_handler.py
ADDED
@@ -0,0 +1,25 @@
import logging
import boto3
from botocore.exceptions import ClientError
import os

ACCESS_KEY_ID = os.environ.get("ACCESS_KEY_ID")
SECRET_ACCESS_KEY = os.environ.get("SECRET_ACCESS_KEY")

session = boto3.Session(ACCESS_KEY_ID, SECRET_ACCESS_KEY)

def upload_file(file_name, bucket, folder, object_name=None):
    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = session.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, f"{folder}/"+object_name)
    except ClientError as e:
        logging.error(e)
        return False

    url = f'{os.environ.get("RESULT_URL")}{folder}/{object_name}'
    return url
app/functions/video_url_handler.py
ADDED
@@ -0,0 +1,60 @@
from datetime import datetime
from uuid import uuid4
from tempfile import TemporaryDirectory
from .s3_handler import upload_file
from app.scripts import synthesise_audio
from .helper import download_audio, download_video, m4a_to_wav, audio_to_srt, merge_video_audio
from app.constants import MALE_LANGUAGES, FEMALE_LANGUAGES

def handler_video_url(url, from_lang, to_lang, gender):
    with TemporaryDirectory(dir=".") as tempdir:
        srt_file = f"{tempdir}/audio.srt"
        video_file = f"{tempdir}/video.mp4"
        audio_file = f"{tempdir}/audio.m4a"
        audio_wav_file = f"{tempdir}/audio.wav"
        translated_video = f"{tempdir}/translated_video.mp4"
        download_audio(url, audio_file)
        download_video(url, video_file)
        m4a_to_wav(audio_file, audio_wav_file)
        language_code = MALE_LANGUAGES[from_lang][0]
        audio_to_srt(language_code, audio_wav_file, srt_file)

        if gender.lower() == "male":
            language_code = MALE_LANGUAGES[to_lang][0]
            voice_name = MALE_LANGUAGES[to_lang][1]
        else:
            language_code = FEMALE_LANGUAGES[to_lang][0]
            voice_name = FEMALE_LANGUAGES[to_lang][1]

        result = synthesise_audio(
            srt_file=srt_file,
            video_file=video_file,
            output_folder=tempdir,
            language_code=language_code,
            voice_name=voice_name,
            from_lang=from_lang,
            to_lang=to_lang,
            gender=gender,
        )
        translated_srt = result["translated_subtitle"]
        translated_audio = result["translated_audio"]
        merge_video_audio(video_file, translated_audio, translated_video)

        now = datetime.now()
        today = now.strftime("%Y-%m-%d")
        id = f"{today}/{str(uuid4()).replace('-', '')[:15]}"
        srt_url = upload_file(srt_file, "expressapi", id, "subtitle.srt")
        translated_srt_url = upload_file(
            translated_srt, "expressapi", id, "translated_subtitle.srt"
        )
        translated_audio_url = upload_file(
            translated_audio, "expressapi", id, "translated_audio.mp3"
        )
        translated_video_url = upload_file(translated_video, "expressapi", id, "translated_video.mp4")
        return {
            "srt_url": srt_url,
            "video_url": translated_video_url,
            "translated_srt_url": translated_srt_url,
            "translated_audio_url": translated_audio_url,
        }

app/functions/youtube_summarizer.py
ADDED
@@ -0,0 +1,27 @@
import re, os
import requests
from youtube_transcript_api import YouTubeTranscriptApi

API_URL = os.environ.get("SUMMARIZE_API_URL")
API_TOKEN = os.environ.get("SUMMARIZE_API_TOKEN")
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def extract_video_id(youtube_url):
    video_id_pattern = r"(?:/shorts/|v=)([a-zA-Z0-9_-]+)(?:&|\?|$)"
    match = re.search(video_id_pattern, youtube_url)
    if match:
        video_id = match.group(1)
        return video_id
    else:
        return None


def youtube_summarizer_handler(link):
    video_id = extract_video_id(link)
    subs = YouTubeTranscriptApi.get_transcript(video_id)
    texts = " ".join([sub["text"] for sub in subs])
    payload = {"inputs": texts}
    response = requests.post(API_URL, headers=headers, json=payload)
    summary = response.json()[0]
    return summary

app/main.py
ADDED
@@ -0,0 +1,24 @@
from fastapi import FastAPI
from .functions.video_url_handler import handler_video_url
from .functions.youtube_summarizer import youtube_summarizer_handler
from .functions.model import VideoURL, VideoFile, YoutubeURL


app = FastAPI()


@app.get("/")
async def home():
    return {"health_check": "OK"}


@app.post("/synthesise_video_url")
async def synthesise_video_url(req: VideoURL):
    response = handler_video_url(req.url, req.from_lang, req.to_lang, req.gender)
    return response


@app.post("/youtube_summarizer")
async def youtube_summarizer(req: YoutubeURL):
    response = youtube_summarizer_handler(req.url)
    return response

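For reference, a minimal sketch of how these two endpoints could be exercised once the app is served. The base URL and the video link below are placeholders, not part of this commit; the JSON fields mirror the VideoURL and YoutubeURL models.

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port, adjust to the actual deployment

# Dub a video fetched from a URL (fields mirror the VideoURL model)
dub = requests.post(
    f"{BASE_URL}/synthesise_video_url",
    json={"url": "https://www.youtube.com/watch?v=VIDEO_ID", "from_lang": "en", "to_lang": "hi", "gender": "MALE"},
)
print(dub.json())  # expected keys: srt_url, video_url, translated_srt_url, translated_audio_url

# Summarize a YouTube video (field mirrors the YoutubeURL model)
summary = requests.post(f"{BASE_URL}/youtube_summarizer", json={"url": "https://www.youtube.com/watch?v=VIDEO_ID"})
print(summary.json())
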
app/scripts/TTS.py
ADDED
@@ -0,0 +1,408 @@
import json
import os
import time
import azure.cognitiveservices.speech as speechsdk
import datetime
import zipfile
import io
import copy
import re
from urllib.request import urlopen
from pathlib import Path

from . import azure_batch
from . import utils
from .utils import parseBool
# Get variables from config

# Get Azure variables if applicable
AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')

azure_sentence_pause = 80
azure_comma_pause = 50
debug_mode = False
tts_service = 'azure'


# ======================================== Pronunciation Correction Functions ================================================
BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'

interpretAsOverrideFile = os.path.join(BASE_DIR, 'interpret-as.csv')
interpretAsEntries = utils.csv_to_dict(interpretAsOverrideFile)

aliasOverrideFile = os.path.join(BASE_DIR, 'aliases.csv')
aliasEntries = utils.csv_to_dict(aliasOverrideFile)

urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
urlList = utils.txt_to_list(urlListFile)

phonemeFile = os.path.join(BASE_DIR, 'Phoneme_Pronunciation.csv')
phonemeEntries = utils.csv_to_dict(phonemeFile)

def add_all_pronunciation_overrides(text):
    text = add_interpretas_tags(text)
    text = add_alias_tags(text)
    text = add_phoneme_tags(text)
    return text

def add_interpretas_tags(text):
    # Add interpret-as tags from interpret-as.csv
    for entryDict in interpretAsEntries:
        # Get entry info
        entryText = entryDict['Text']
        entryInterpretAsType = entryDict['interpret-as Type']
        isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])
        entryFormat = entryDict['Format (Optional)']

        # Create say-as tag
        if entryFormat == "":
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}">'
        else:
            sayAsTagStart = rf'<say-as interpret-as="{entryInterpretAsType}" format="{entryFormat}">'

        # Find and replace the word
        findWordRegex = rf'(\b["\']?{entryText}[.,!?]?["\']?\b)'  # Find the word, with optional punctuation after, and optional quotes before or after
        if isCaseSensitive:
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text)  # Uses group reference, so remember regex must be in parentheses
        else:
            text = re.sub(findWordRegex, rf'{sayAsTagStart}\1</say-as>', text, flags=re.IGNORECASE)

    # Add interpret-as tags from url_list.txt
    for url in urlList:
        # This regex will match the top level domain extension, the punctuation before/after it, and any periods, slashes or colons
        # It will then put the say-as characters tag around all matches
        punctuationRegex = re.compile(r'((?:\.[a-z]{2,6}(?:\/|$|\s))|(?:[\.\/:]+))')
        taggedURL = re.sub(punctuationRegex, r'<say-as interpret-as="characters">\1</say-as>', url)
        # Replace any instances of the URL with the tagged version
        text = text.replace(url, taggedURL)

    return text

def add_alias_tags(text):
    for entryDict in aliasEntries:
        # Get entry info
        entryText = entryDict['Original Text']
        entryAlias = entryDict['Alias']
        if entryDict['Case Sensitive (True/False)'] == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])

        # Find and replace the word
        findWordRegex = rf'\b["\'()]?{entryText}[.,!?()]?["\']?\b'  # Find the word, with optional punctuation after, and optional quotes before or after
        if isCaseSensitive:
            text = re.sub(findWordRegex, rf'{entryAlias}', text)
        else:
            text = re.sub(findWordRegex, rf'{entryAlias}', text, flags=re.IGNORECASE)
    return text


# Uses the phoneme pronunciation file to add phoneme tags to the text
def add_phoneme_tags(text):
    for entryDict in phonemeEntries:
        # Get entry info
        entryText = entryDict['Text']
        entryPhoneme = entryDict['Phonetic Pronunciation']
        entryAlphabet = entryDict['Phonetic Alphabet']

        if entryDict['Case Sensitive (True/False)'] == "":
            isCaseSensitive = False
        else:
            isCaseSensitive = parseBool(entryDict['Case Sensitive (True/False)'])

        # Find and replace the word
        findWordRegex = rf'(\b["\'()]?{entryText}[.,!?()]?["\']?\b)'  # Find the word, with optional punctuation after, and optional quotes before or after
        if isCaseSensitive:
            text = re.sub(findWordRegex, rf'<phoneme alphabet="ipa" ph="{entryPhoneme}">\1</phoneme>', text)
        else:
            text = re.sub(findWordRegex, rf'<phoneme alphabet="{entryAlphabet}" ph="{entryPhoneme}">\1</phoneme>', text, flags=re.IGNORECASE)
    return text

# ================================================== Azure Functions =========================================================

def synthesize_text_azure(text, duration, voiceName, languageCode):

    # Create tag for desired duration of clip
    durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'

    # Create string for sentence pauses, if not default
    if not azure_sentence_pause == 'default':
        sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
    else:
        sentencePauseTag = ''

    # Create string for comma pauses, if not default
    if not azure_comma_pause == 'default':
        commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
    else:
        commaPauseTag = ''

    # Set string for tag to set leading and trailing silence times to zero
    leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
    tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'

    # Process text using pronunciation customization set by user
    text = add_all_pronunciation_overrides(text)

    # Create SSML syntax for Azure TTS
    ssml = f"<speak version='1.0' xml:lang='{languageCode}' xmlns='http://www.w3.org/2001/10/synthesis' " \
           "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
           f"<voice name='{voiceName}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
           f"{text}</voice></speak>"

    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    # For Azure voices, see: https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support?tabs=stt-tts
    speech_config.speech_synthesis_voice_name = voiceName
    # For audio outputs, see: https://learn.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.speechsynthesisoutputformat?view=azure-python
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)

    #result = synthesizer.speak_text_async(text).get()
    result = synthesizer.speak_ssml_async(ssml).get()

    stream = speechsdk.AudioDataStream(result)
    return stream

def format_percentage_change(speedFactor):
    # Determine speedFactor value for Azure TTS. It should be either 'default' or a relative change.
    if speedFactor == 1.0:
        rate = 'default'
    else:
        # Whether to add a plus sign to the relative change. A negative sign will be added automatically
        if speedFactor >= 1.0:
            percentSign = '+'
        else:
            percentSign = ''
        # Convert speedFactor float value to a relative percentage
        rate = percentSign + str(round((speedFactor - 1.0) * 100, 5)) + '%'
    return rate

def synthesize_text_azure_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):

    def create_request_payload(remainingEntriesDict):
        # Create SSML for all subtitles
        ssmlJson = []
        payloadSizeInBytes = 0
        tempDict = dict(remainingEntriesDict)  # Need to do this to avoid changing the original dict which would mess with the loop

        for key, value in tempDict.items():
            text = tempDict[key]['translated_text']
            duration = tempDict[key]['duration_ms_buffered']
            language = langDict['languageCode']
            voice = langDict['voiceName']

            # Create tag for desired duration of clip
            durationTag = f'<mstts:audioduration value="{str(duration)}ms"/>'

            # Create string for sentence pauses, if not default
            if not azure_sentence_pause == 'default':
                sentencePauseTag = f'<mstts:silence type="Sentenceboundary-exact" value="{str(azure_sentence_pause)}ms"/>'
            else:
                sentencePauseTag = ''

            # Create string for comma pauses, if not default
            if not azure_comma_pause == 'default':
                commaPauseTag = f'<mstts:silence type="Comma-exact" value="{str(azure_comma_pause)}ms"/>'
            else:
                commaPauseTag = ''

            # Set string for tag to set leading and trailing silence times to zero
            leadSilenceTag = '<mstts:silence type="Leading-exact" value="0ms"/>'
            tailSilenceTag = '<mstts:silence type="Tailing-exact" value="0ms"/>'

            # Process text using pronunciation customization set by user
            text = add_all_pronunciation_overrides(text)

            # Create the SSML for each subtitle
            ssml = f"<speak version='1.0' xml:lang='{language}' xmlns='http://www.w3.org/2001/10/synthesis' " \
                   "xmlns:mstts='http://www.w3.org/2001/mstts'>" \
                   f"<voice name='{voice}'>{sentencePauseTag}{commaPauseTag}{durationTag}{leadSilenceTag}{tailSilenceTag}" \
                   f"{text}</voice></speak>"
            ssmlJson.append({"text": ssml})

            # Construct request payload with SSML
            # Reconstruct payload with every loop with new SSML so that the payload size is accurate
            now = datetime.datetime.now()
            pendingPayload = {
                'displayName': langDict['languageCode'] + '-' + now.strftime("%Y-%m-%d %H:%M:%S"),
                'description': 'Batch synthesis of ' + langDict['languageCode'] + ' subtitles',
                "textType": "SSML",
                # To use custom voice, see original example code script linked from azure_batch.py
                "inputs": ssmlJson,
                "properties": {
                    "outputFormat": "audio-48khz-192kbitrate-mono-mp3",
                    "wordBoundaryEnabled": False,
                    "sentenceBoundaryEnabled": False,
                    "concatenateResult": False,
                    "decompressOutputFiles": False
                },
            }
            # Azure TTS Batch requests must be under 500 kilobytes, so check the payload is under 500,000 bytes. Not sure if they actually mean kibibytes, assume worst case.
            # Payload will be formatted as json so must account for that too by doing json.dumps(), otherwise calculated size will be inaccurate
            payloadSizeInBytes = len(str(json.dumps(pendingPayload)).encode('utf-8'))

            if payloadSizeInBytes > 495000 or len(ssmlJson) > 995:  # Leave some room for anything unexpected. Also number of inputs must be below 1000
                # If payload would be too large, ignore the last entry and break out of loop
                return payload, remainingEntriesDict
            else:
                payload = copy.deepcopy(pendingPayload)  # Must make deep copy, otherwise ssmlJson would be updated in both instead of just pendingPayload
                # Remove entry from remainingEntriesDict if it was added to payload
                remainingEntriesDict.pop(key)

        # If all the rest of the entries fit, return the payload
        return payload, remainingEntriesDict
    # ------------------------- End create_request_payload() -----------------------------------


    # Create payloads, split into multiple if necessary
    payloadList = []
    remainingPayloadEntriesDict = dict(subsDict)  # Will remove entries as they are added to payloads
    while len(remainingPayloadEntriesDict) > 0:
        payloadToAppend, remainingPayloadEntriesDict = create_request_payload(remainingPayloadEntriesDict)
        payloadList.append(payloadToAppend)

    # Tell user if request will be broken up into multiple payloads
    if len(payloadList) > 1:
        print(f'Payload will be broken up into {len(payloadList)} requests (due to Azure size limitations).')

    # Used to keep track of filenames downloaded via separate zip files. Will remove entries as they are downloaded
    remainingDownloadedEntriesList = list(subsDict.keys())

    # Clear out workingFolder
    for filename in os.listdir('workingFolder'):
        if not debug_mode:
            os.remove(os.path.join('workingFolder', filename))

    # Loop through payloads and submit to Azure
    for payload in payloadList:
        # Reset job_id from previous loops
        job_id = None

        # Send request to Azure
        job_id = azure_batch.submit_synthesis(payload)

        # Wait for job to finish
        if job_id is not None:
            status = "Running"
            resultDownloadLink = None

            while True:  # Must use break to exit loop
                # Get status
                response = azure_batch.get_synthesis(job_id)
                status = response.json()['status']
                if status == 'Succeeded':
                    print('Batch synthesis job succeeded')
                    resultDownloadLink = azure_batch.get_synthesis(job_id).json()['outputs']['result']
                    break
                elif status == 'Failed':
                    print('ERROR: Batch synthesis job failed!')
                    print("Reason: " + response.reason)
                    break
                else:
                    print(f'Waiting for Azure batch synthesis job to finish. Status: [{status}]')
                    time.sleep(5)

        # Download resulting zip file
        if resultDownloadLink is not None:
            # Download zip file
            urlResponse = urlopen(resultDownloadLink)

            # If debug mode, save zip file to disk
            if debug_mode:
                if secondPass == False:
                    zipName = 'azureBatch.zip'
                else:
                    zipName = 'azureBatchPass2.zip'

                zipPath = os.path.join('workingFolder', zipName)
                with open(zipPath, 'wb') as f:
                    f.write(urlResponse.read())
                # Reset urlResponse so it can be read again
                urlResponse = urlopen(resultDownloadLink)

            # Process zip file
            virtualResultZip = io.BytesIO(urlResponse.read())
            zipdata = zipfile.ZipFile(virtualResultZip)
            zipinfos = zipdata.infolist()

            # Reorder zipinfos so the file names are in alphanumeric order
            zipinfos.sort(key=lambda x: x.filename)

            # Only extract necessary files, and rename them while doing so
            for file in zipinfos:
                if file.filename == "summary.json":
                    #zipdata.extract(file, 'workingFolder') # For debugging
                    pass
                elif "json" not in file.filename:
                    # Rename file to match first entry in remainingDownloadedEntriesList, then extract
                    currentFileNum = remainingDownloadedEntriesList[0]
                    file.filename = str(currentFileNum) + '.mp3'
                    #file.filename = file.filename.lstrip('0')

                    # Add file path to subsDict then remove from remainingDownloadedEntriesList
                    subsDict[currentFileNum]['TTS_FilePath'] = os.path.join('workingFolder', str(currentFileNum)) + '.mp3'
                    # Extract file
                    zipdata.extract(file, 'workingFolder')
                    # Remove entry from remainingDownloadedEntriesList
                    remainingDownloadedEntriesList.pop(0)


    return subsDict


def synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=False, secondPass=False):
    if not skipSynthesize:
        subsDict = synthesize_text_azure_batch(subsDict, langDict, skipSynthesize, secondPass)
    return subsDict

def synthesize_dictionary(subsDict, langDict, outputFolder, skipSynthesize=False, secondPass=False):
    for key, value in subsDict.items():
        # TTS each subtitle text, write to file, write filename into dictionary
        workingFolder = os.path.join(outputFolder, 'workingFolder')
        filePath = os.path.join(workingFolder, f'{str(key)}.mp3')
        filePathStem = os.path.join(workingFolder, f'{str(key)}')
        if not skipSynthesize:

            duration = value['duration_ms_buffered']

            if secondPass:
                # Get speed factor from subsDict
                speedFactor = subsDict[key]['speed_factor']
            else:
                speedFactor = float(1.0)

            # Prepare output location. If folder doesn't exist, create it
            if not os.path.exists(os.path.dirname(filePath)):
                try:
                    os.makedirs(os.path.dirname(filePath))
                except OSError:
                    print("Error creating directory")


            # If Azure TTS, use Azure API
            if tts_service == "azure":
                # Audio variable is an AudioDataStream object
                audio = synthesize_text_azure(value['translated_text'], duration, langDict['voiceName'], langDict['languageCode'])
                # Save to file using save_to_wav_file method of audio object
                audio.save_to_wav_file(filePath)

                # If debug mode, write extra copies of the synthesized files
                if debug_mode and secondPass == False:
                    audio.save_to_wav_file(filePathStem + "_p1.mp3")
                elif debug_mode and secondPass == True:
                    audio.save_to_wav_file(filePathStem + "_p2.mp3")

        subsDict[key]['TTS_FilePath'] = filePath

        # Get key index
        keyIndex = list(subsDict.keys()).index(key)
        # Print progress and overwrite line next time
        if not secondPass:
            print(f" Synthesizing TTS Line: {keyIndex+1} of {len(subsDict)}", end="\r")
        else:
            print(f" Synthesizing TTS Line (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
    print(" ")  # Clear the line
    return subsDict

app/scripts/__init__.py
ADDED
@@ -0,0 +1,55 @@
from .audio import process_language
from .srt import parse_srt_file, get_duration
import langcodes
import pathlib
import os

def synthesise_audio(
    srt_file,
    video_file,
    output_folder,
    language_code="hi-IN",
    voice_name="hi-IN-MadhurNeural",
    from_lang="en",
    to_lang="hi",
    gender="MALE",
):
    langData = {
        "synth_language_code": language_code,
        "synth_voice_name": voice_name,
        "translation_source_language": from_lang,
        "translation_target_language": to_lang,
        "synth_voice_gender": gender,
        "translate_service": "azure",
        "formality": None,
    }

    with open(srt_file, "r", encoding="utf-8-sig") as f:
        originalSubLines = f.readlines()

    originalLanguageSubsDict = parse_srt_file(originalSubLines)

    totalAudioLength = get_duration(video_file)

    # Use the video file name in the name of the translated srt file, and include the human-readable language name
    lang = langcodes.get(to_lang).display_name()
    translatedSrtFileName = pathlib.Path(video_file).stem + f" - {lang} - {to_lang}.srt"
    # Set path to save translated srt file
    translatedSrtFileName = f"{output_folder}/{translatedSrtFileName}"

    lang = langcodes.get(langData['synth_language_code'])
    langName = langcodes.get(langData['synth_language_code']).get(lang.to_alpha3()).display_name()

    outputFileName = pathlib.Path(video_file).stem + f" - {langName} - {langData['synth_language_code']}."
    # Set output path
    outputFileName = os.path.join(output_folder, outputFileName)

    process_language(
        langData,
        originalLanguageSubsDict,
        totalAudioLength,
        translatedSrtFileName,
        outputFileName,
        output_folder
    )
    return {"translated_subtitle": translatedSrtFileName, "translated_audio": outputFileName + "mp3"}

app/scripts/audio.py
ADDED
@@ -0,0 +1,62 @@
import copy
from .TTS import synthesize_dictionary_batch, synthesize_dictionary
from .translate import translate_dictionary
from .audio_builder import build_audio


original_language = "en"
batch_tts_synthesize = False
skip_translation = False
stop_after_translation = False
skip_synthesize = False

two_pass_voice_synth = False  # Azure doesn't need two pass voice synth, so disable it


def manually_prepare_dictionary(dictionaryToPrep):
    ### Do additional processing to match the format produced by the translation function
    # Create new key 'translated_text' and set it to the value of 'text'
    for key, value in dictionaryToPrep.items():
        dictionaryToPrep[key]['translated_text'] = value['text']

    # Convert the keys to integers and return the dictionary
    return {int(k): v for k, v in dictionaryToPrep.items()}


# Process a language: Translate, Synthesize, and Build Audio
def process_language(langData, originalLanguageSubsDict, totalAudioLength, translatedSrtFileName, outputFileName, outputFolder):
    langDict = {
        'targetLanguage': langData['translation_target_language'],
        'sourceLanguage': langData['translation_source_language'],
        'voiceName': langData['synth_voice_name'],
        'languageCode': langData['synth_language_code'],
        'voiceGender': langData['synth_voice_gender'],
        'translateService': langData['translate_service'],
        'formality': langData['formality']
    }

    individualLanguageSubsDict = copy.deepcopy(originalLanguageSubsDict)

    # Check for special case where original language is the same as the target language
    if langDict['languageCode'].lower() == original_language.lower():
        print("Original language is the same as the target language. Skipping translation.")
        individualLanguageSubsDict = manually_prepare_dictionary(individualLanguageSubsDict)

    elif skip_translation == False:
        # Translate
        individualLanguageSubsDict = translate_dictionary(individualLanguageSubsDict, langDict, translatedSrtFileName, skipTranslation=skip_translation)
        if stop_after_translation:
            print("Stopping at translation is enabled. Skipping TTS and building audio.")
            return

    # Synthesize
    if batch_tts_synthesize == True:
        individualLanguageSubsDict = synthesize_dictionary_batch(individualLanguageSubsDict, langDict, skipSynthesize=skip_synthesize)
    else:
        individualLanguageSubsDict = synthesize_dictionary(individualLanguageSubsDict, langDict, outputFolder, skipSynthesize=skip_synthesize)
    print(individualLanguageSubsDict)

    # Build audio
    individualLanguageSubsDict = build_audio(individualLanguageSubsDict, langDict, totalAudioLength, outputFileName, two_pass_voice_synth)

app/scripts/audio_builder.py
ADDED
@@ -0,0 +1,181 @@
import soundfile
import pyrubberband
import pathlib
import os
import io


from . import TTS

from pydub import AudioSegment
from pydub.silence import detect_leading_silence
import langcodes


# Set working folder
workingFolder = "workingFolder"

synth_sample_rate = 24000
debug_mode = False
tts_service = "azure"
batch_tts_synthesize = False
skip_translation = False
stop_after_translation = False
skip_synthesize = False
force_stretch_with_twopass = False
output_format = "mp3"


def trim_clip(inputSound):
    trim_leading_silence: AudioSegment = lambda x: x[detect_leading_silence(x) :]
    trim_trailing_silence: AudioSegment = lambda x: trim_leading_silence(x.reverse()).reverse()
    strip_silence: AudioSegment = lambda x: trim_trailing_silence(trim_leading_silence(x))
    strippedSound = strip_silence(inputSound)
    return strippedSound

# Function to insert audio into canvas at a specific point
def insert_audio(canvas, audioToOverlay, startTimeMs):
    # Create a copy of the canvas
    canvasCopy = canvas
    # Overlay the audio onto the copy
    canvasCopy = canvasCopy.overlay(audioToOverlay, position=int(startTimeMs))
    # Return the copy
    return canvasCopy

# Function to create a canvas of a specific duration in milliseconds
def create_canvas(canvasDuration, frame_rate=int(synth_sample_rate)):
    canvas = AudioSegment.silent(duration=canvasDuration, frame_rate=frame_rate)
    return canvas

def get_speed_factor(subsDict, trimmedAudio, desiredDuration, num):
    virtualTempFile = AudioSegment.from_file(trimmedAudio, format="wav")
    rawDuration = virtualTempFile.duration_seconds
    trimmedAudio.seek(0)  # This MUST be done to reset the file pointer to the start of the file, otherwise errors occur the next time the virtual file is accessed
    # Calculate the speed factor, put into dictionary
    desiredDuration = float(desiredDuration)
    speedFactor = (rawDuration * 1000) / desiredDuration
    subsDict[num]['speed_factor'] = speedFactor
    return subsDict

def stretch_audio(audioFileToStretch, speedFactor, num):
    virtualTempAudioFile = io.BytesIO()
    # Read the raw audio data to be stretched
    y, sampleRate = soundfile.read(audioFileToStretch)

    streched_audio = pyrubberband.time_stretch(y, sampleRate, speedFactor, rbargs={'--fine': '--fine'})  # rbargs must be passed as a dictionary, hence the odd format
    #soundfile.write(f'{workingFolder}\\temp_stretched.wav', streched_audio, sampleRate)
    soundfile.write(virtualTempAudioFile, streched_audio, sampleRate, format='wav')
    if debug_mode:
        soundfile.write(os.path.join(workingFolder, f'{num}_s.wav'), streched_audio, sampleRate)  # For debugging, saves the stretched audio files
    #return AudioSegment.from_file(f'{workingFolder}\\temp_stretched.wav', format="wav")
    return AudioSegment.from_file(virtualTempAudioFile, format="wav")


def build_audio(subsDict, langDict, totalAudioLength, outputFileName, twoPassVoiceSynth=False):
    if tts_service == 'azure':
        twoPassVoiceSynth = False  # Azure doesn't need two pass voice synth, so disable it

    virtualTrimmedFileDict = {}
    # First trim silence off the audio files
    for key, value in subsDict.items():
        filePathTrimmed = os.path.join(workingFolder, str(key)) + "_t.wav"
        subsDict[key]['TTS_FilePath_Trimmed'] = filePathTrimmed

        # Trim the clip and re-write file
        rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
        trimmedClip = trim_clip(rawClip)
        if debug_mode:
            trimmedClip.export(filePathTrimmed, format="wav")

        # Create virtual file in dictionary with audio to be read later
        tempTrimmedFile = io.BytesIO()
        trimmedClip.export(tempTrimmedFile, format="wav")
        virtualTrimmedFileDict[key] = tempTrimmedFile
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Trimmed Audio: {keyIndex+1} of {len(subsDict)}", end="\r")
    print("\n")

    # Calculate speed factors if necessary. Azure doesn't need this, so skip it
    if not tts_service == 'azure':
        # Calculate speed factors for each clip, aka how much to stretch the audio
        for key, value in subsDict.items():
            #subsDict = get_speed_factor(subsDict, value['TTS_FilePath_Trimmed'], value['duration_ms'], num=key)
            subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Calculated Speed Factor: {keyIndex+1} of {len(subsDict)}", end="\r")
        print("\n")

    # If two pass voice synth is enabled, have the API re-synthesize the clips at the new speed
    # Azure allows direct specification of audio duration, so no need to re-synthesize
    if twoPassVoiceSynth == True and not tts_service == 'azure':
        if batch_tts_synthesize == True and tts_service == 'azure':
            subsDict = TTS.synthesize_dictionary_batch(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)
        else:
            subsDict = TTS.synthesize_dictionary(subsDict, langDict, skipSynthesize=skip_synthesize, secondPass=True)

        for key, value in subsDict.items():
            # Trim the clip and re-write file
            rawClip = AudioSegment.from_file(value['TTS_FilePath'], format="mp3", frame_rate=int(synth_sample_rate))
            trimmedClip = trim_clip(rawClip)
            if debug_mode:
                # Remove '.wav' from the end of the file path
                secondPassTrimmedFile = value['TTS_FilePath_Trimmed'][:-4] + "_p2_t.wav"
                trimmedClip.export(secondPassTrimmedFile, format="wav")
            trimmedClip.export(virtualTrimmedFileDict[key], format="wav")
            keyIndex = list(subsDict.keys()).index(key)
            print(f" Trimmed Audio (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
        print("\n")

        if force_stretch_with_twopass == True:
            for key, value in subsDict.items():
                subsDict = get_speed_factor(subsDict, virtualTrimmedFileDict[key], value['duration_ms'], num=key)
                keyIndex = list(subsDict.keys()).index(key)
                print(f" Calculated Speed Factor (2nd Pass): {keyIndex+1} of {len(subsDict)}", end="\r")
            print("\n")

    # Create canvas to overlay audio onto
    canvas = create_canvas(totalAudioLength)

    # Stretch audio and insert into canvas
    for key, value in subsDict.items():
        if (not twoPassVoiceSynth or force_stretch_with_twopass == True) and not tts_service == 'azure':  # Don't stretch if azure is used
            #stretchedClip = stretch_audio(value['TTS_FilePath_Trimmed'], speedFactor=subsDict[key]['speed_factor'], num=key)
            stretchedClip = stretch_audio(virtualTrimmedFileDict[key], speedFactor=subsDict[key]['speed_factor'], num=key)
        else:
            #stretchedClip = AudioSegment.from_file(value['TTS_FilePath_Trimmed'], format="wav")
            stretchedClip = AudioSegment.from_file(virtualTrimmedFileDict[key], format="wav")
            virtualTrimmedFileDict[key].seek(0)  # Not 100% sure if this is necessary, but it was done in the other place it is used

        canvas = insert_audio(canvas, stretchedClip, value['start_ms'])
        keyIndex = list(subsDict.keys()).index(key)
        print(f" Final Audio Processed: {keyIndex+1} of {len(subsDict)}", end="\r")
    print("\n")


    # Determine string to use for output format and file extension based on config setting
    outputFormat = output_format.lower()
    if outputFormat == "mp3":
        outputFileName += "mp3"
        formatString = "mp3"
    elif outputFormat == "wav":
        outputFileName += "wav"
        formatString = "wav"
    elif outputFormat == "aac":
        #outputFileName += "m4a"
        #formatString = "mp4"
        outputFileName += "aac"
        formatString = "adts"  # Pydub doesn't accept "aac" as a format, so use "adts" with the "aac" file extension (or alternatively "mp4" with "m4a")

    canvas = canvas.set_channels(2)  # Change from mono to stereo
    try:
        print("\nExporting audio file...")
        canvas.export(outputFileName, format=formatString, bitrate="192k")
    except:
        outputFileName = outputFileName + ".bak"
        canvas.export(outputFileName, format=formatString, bitrate="192k")
        print("\nThere was an issue exporting the audio, it might be a permission error. The file was saved as a backup with the extension .bak")
        print("Try removing the .bak extension then listen to the file to see if it worked.\n")
        input("Press Enter to exit...")

    return subsDict

app/scripts/azure_batch.py
ADDED
@@ -0,0 +1,78 @@
#!/usr/bin/env python
# coding: utf-8

# Based on Microsoft Azure sample code found here: https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/batch-synthesis/python/synthesis.py
# Original License Info Below:
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
#--------------------------------------------------------------------------------------------------------
import os
import json
import logging
import sys

import requests


logging.basicConfig(stream=sys.stdout, level=logging.ERROR,
                    format="[%(asctime)s] %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")
logger = logging.getLogger(__name__)

# Your Speech resource key and region
# This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"

AZURE_SPEECH_KEY = os.environ.get('SPEECH_KEY')
AZURE_SPEECH_REGION = os.environ.get('SPEECH_REGION')

NAME = "Simple synthesis"
DESCRIPTION = "Simple synthesis description"

# The service host suffix.
# For azure.cn the host suffix is "customvoice.api.speech.azure.cn"
SERVICE_HOST = "customvoice.api.speech.microsoft.com"


def submit_synthesis(payload):
    url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis'
    header = {
        'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY,
        'Content-Type': 'application/json'
    }

    response = requests.post(url, json.dumps(payload), headers=header)
    if response.status_code < 400:
        logger.info('Batch synthesis job submitted successfully')
        logger.info(f'Job ID: {response.json()["id"]}')
        return response.json()["id"]
    else:
        logger.error(f'Failed to submit batch synthesis job: {response.text}')


def get_synthesis(job_id):
    url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis/{job_id}'
    header = {
        'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
    }
    response = requests.get(url, headers=header)
    if response.status_code < 400:
        logger.info('Got batch synthesis job successfully')
        logger.info(response.json())
        #return response.json()['status']
        return response
    else:
        logger.error(f'Failed to get batch synthesis job: {response.text}')


def list_synthesis_jobs(skip: int = 0, top: int = 100):
    """List all batch synthesis jobs in the subscription"""
    url = f'https://{AZURE_SPEECH_REGION}.{SERVICE_HOST}/api/texttospeech/3.1-preview1/batchsynthesis?skip={skip}&top={top}'
    header = {
        'Ocp-Apim-Subscription-Key': AZURE_SPEECH_KEY
    }
    response = requests.get(url, headers=header)
    if response.status_code < 400:
        logger.info(f'Listed batch synthesis jobs successfully, got {len(response.json()["values"])} jobs')
        logger.info(response.json())
    else:
        logger.error(f'Failed to list batch synthesis jobs: {response.text}')

app/scripts/azure_translate.py
ADDED
@@ -0,0 +1,28 @@
import requests, uuid, json, os


def azure_translate_text(text_list, from_lang="en", to_lang="hi"):
    TRANSLATE_API_ENDPOINT = os.environ.get("TRANSLATE_API_ENDPOINT")
    url = f"{TRANSLATE_API_ENDPOINT}/translate"

    params = {
        'api-version': '3.0',
        'from': from_lang,
        'to': [to_lang]
    }

    TRANSLATE_KEY = os.environ.get("TRANSLATE_KEY")
    LOCATION = os.environ.get("SPEECH_REGION")

    headers = {
        'Ocp-Apim-Subscription-Key': TRANSLATE_KEY,
        'Ocp-Apim-Subscription-Region': LOCATION,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }
    body = [{"text": text} for text in text_list]

    request = requests.post(url, params=params, headers=headers, json=body)
    response = request.json()
    response = [{"text": text["translations"][0]["text"]} for text in response]
    return response

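A minimal usage sketch of this helper (the input strings are placeholders; it assumes TRANSLATE_API_ENDPOINT, TRANSLATE_KEY and SPEECH_REGION are set in the environment):

from app.scripts.azure_translate import azure_translate_text

# One entry is returned per input string, in the same order
translated = azure_translate_text(["How are you?", "Good morning"], from_lang="en", to_lang="hi")
# translated looks like [{"text": "..."}, {"text": "..."}]
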
app/scripts/srt.py
ADDED
@@ -0,0 +1,86 @@
import re

def parse_srt_file(srtFileLines, preTranslated=False):
    # Matches the following example with regex: 00:00:20,130 --> 00:00:23,419
    subtitleTimeLineRegex = re.compile(r'\d\d:\d\d:\d\d,\d\d\d --> \d\d:\d\d:\d\d,\d\d\d')

    # Create a dictionary
    subsDict = {}

    # Will add this many milliseconds of extra silence before and after each audio clip / spoken subtitle line
    addBufferMilliseconds = 0

    # Enumerate lines, and if a line contains only an integer, put that number in the key, and a dictionary in the value
    # The dictionary contains the start, ending, and duration of the subtitles as well as the text
    # The next line uses the syntax HH:MM:SS,MMM --> HH:MM:SS,MMM . Get the difference between the two times and put that in the dictionary
    # For the line after that, put the text in the dictionary
    for lineNum, line in enumerate(srtFileLines):
        line = line.strip()
        if line.isdigit() and subtitleTimeLineRegex.match(srtFileLines[lineNum + 1]):
            lineWithTimestamps = srtFileLines[lineNum + 1].strip()
            lineWithSubtitleText = srtFileLines[lineNum + 2].strip()

            # If there are more lines after the subtitle text, add them to the text
            count = 3
            while True:
                # Check if the next line is blank or not
                if (lineNum + count) < len(srtFileLines) and srtFileLines[lineNum + count].strip():
                    lineWithSubtitleText += ' ' + srtFileLines[lineNum + count].strip()
                    count += 1
                else:
                    break

            # Create empty dictionary with keys for start and end times and subtitle text
            subsDict[line] = {'start_ms': '', 'end_ms': '', 'duration_ms': '', 'text': '', 'break_until_next': '', 'srt_timestamps_line': lineWithTimestamps}

            time = lineWithTimestamps.split(' --> ')
            time1 = time[0].split(':')
            time2 = time[1].split(':')

            # Convert the times to milliseconds
            processedTime1 = int(time1[0]) * 3600000 + int(time1[1]) * 60000 + int(time1[2].split(',')[0]) * 1000 + int(time1[2].split(',')[1])  #/ 1000 #Uncomment to turn into seconds
            processedTime2 = int(time2[0]) * 3600000 + int(time2[1]) * 60000 + int(time2[2].split(',')[0]) * 1000 + int(time2[2].split(',')[1])  #/ 1000 #Uncomment to turn into seconds
            timeDifferenceMs = str(processedTime2 - processedTime1)

            # Adjust times with buffer
            if addBufferMilliseconds > 0 and not preTranslated:
                subsDict[line]['start_ms_buffered'] = str(processedTime1 + addBufferMilliseconds)
                subsDict[line]['end_ms_buffered'] = str(processedTime2 - addBufferMilliseconds)
                subsDict[line]['duration_ms_buffered'] = str((processedTime2 - addBufferMilliseconds) - (processedTime1 + addBufferMilliseconds))
            else:
                subsDict[line]['start_ms_buffered'] = str(processedTime1)
                subsDict[line]['end_ms_buffered'] = str(processedTime2)
                subsDict[line]['duration_ms_buffered'] = str(processedTime2 - processedTime1)

            # Set the keys in the dictionary to the values
            subsDict[line]['start_ms'] = str(processedTime1)
            subsDict[line]['end_ms'] = str(processedTime2)
            subsDict[line]['duration_ms'] = timeDifferenceMs
            subsDict[line]['text'] = lineWithSubtitleText
            if lineNum > 0:
                # Goes back to the previous line's dictionary and writes the gap in time before the current line
                subsDict[str(int(line) - 1)]['break_until_next'] = processedTime1 - int(subsDict[str(int(line) - 1)]['end_ms'])
            else:
                subsDict[line]['break_until_next'] = 0


    # Apply the buffer to the start and end times by copying the buffered values over the main values
    if addBufferMilliseconds > 0 and not preTranslated:
        for key, value in subsDict.items():
            subsDict[key]['start_ms'] = value['start_ms_buffered']
            subsDict[key]['end_ms'] = value['end_ms_buffered']
            subsDict[key]['duration_ms'] = value['duration_ms_buffered']

    return subsDict


def get_duration(filename):
    import subprocess, json
    result = subprocess.check_output(f'ffprobe -i {filename} -show_entries format=duration -v quiet -of csv="p=0" -of json', shell=True).decode()

    try:
        duration = json.loads(result)['format']["duration"]
    except KeyError:
        print("Error: Could not get duration of video file. Please check the file path and try again.")
    durationMS = round(float(duration) * 1000)  # Convert to milliseconds
    return durationMS

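For reference, a minimal sketch (with made-up subtitle content) of the structure parse_srt_file builds, which the translation, TTS, and audio-building steps consume:

from app.scripts.srt import parse_srt_file

srt_lines = [
    "1\n",
    "00:00:01,000 --> 00:00:03,500\n",
    "Hello world\n",
    "\n",
]
subs = parse_srt_file(srt_lines)
# subs["1"] contains (all times are strings of milliseconds):
#   "start_ms": "1000", "end_ms": "3500", "duration_ms": "2500",
#   "text": "Hello world", "break_until_next": 0,
#   "srt_timestamps_line": "00:00:01,000 --> 00:00:03,500",
#   plus the *_buffered variants, which equal the unbuffered values when addBufferMilliseconds is 0
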
app/scripts/translate.py
ADDED
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: UTF-8 -*-
|
3 |
+
|
4 |
+
# Imports
|
5 |
+
import re, regex
|
6 |
+
from . import utils
|
7 |
+
from .azure_translate import azure_translate_text
|
8 |
+
|
9 |
+
|
10 |
+
from operator import itemgetter
|
11 |
+
import sys
|
12 |
+
import copy
|
13 |
+
import os
|
14 |
+
import html
|
15 |
+
from pathlib import Path
|
16 |
+
|
17 |
+
|
18 |
+
combine_subtitles_max_chars = 200
|
19 |
+
translate_service = 'azure'
|
20 |
+
# -------------------------------- No Translate and Manual Translation Functions -----------------------------------
|
21 |
+
BASE_DIR = Path(__file__).resolve().parent.parent / 'SSML_Customization'
|
22 |
+
|
23 |
+
# Import files and put into dictionaries
|
24 |
+
noTranslateOverrideFile = os.path.join(BASE_DIR, 'dont_translate_phrases.txt')
|
25 |
+
dontTranslateList = utils.txt_to_list(noTranslateOverrideFile)
|
26 |
+
manualTranslationOverrideFile = os.path.join(BASE_DIR, 'Manual_Translations.csv')
|
27 |
+
manualTranslationsDict = utils.csv_to_dict(manualTranslationOverrideFile)
|
28 |
+
urlListFile = os.path.join(BASE_DIR, 'url_list.txt')
|
29 |
+
urlList = utils.txt_to_list(urlListFile)
|
30 |
+
|
31 |
+
# Add span tags around certain words to exclude them from being translated
|
32 |
+
def add_notranslate_tags_from_notranslate_file(text, phraseList):
|
33 |
+
for word in phraseList:
|
34 |
+
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{word}[.,!?()]?["\']?)(\p{{Z}}|$)' #\p ensures it works with unicode characters
|
35 |
+
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
|
36 |
+
# Find the word, with optional punctuation after, and optional quotes before or after
|
37 |
+
text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
|
38 |
+
return text
|
39 |
+
|
40 |
+
def remove_notranslate_tags(text):
|
41 |
+
text = text.replace('<span class="notranslate">', '').replace('</span>', '')
|
42 |
+
return text
|
43 |
+
|
44 |
+
def add_notranslate_tags_for_manual_translations(text, langcode):
|
45 |
+
for manualTranslatedText in manualTranslationsDict:
|
46 |
+
# Only replace text if the language matches the entry in the manual translations file
|
47 |
+
if manualTranslatedText['Language Code'] == langcode:
|
48 |
+
originalText = manualTranslatedText['Original Text']
|
49 |
+
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
|
50 |
+
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
|
51 |
+
text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
|
52 |
+
return text
|
53 |
+
|
54 |
+
# Replace certain words or phrases with their manual translation
|
55 |
+
def replace_manual_translations(text, langcode):
|
56 |
+
for manualTranslatedText in manualTranslationsDict:
|
57 |
+
# Only replace text if the language matches the entry in the manual translations file
|
58 |
+
if manualTranslatedText['Language Code'] == langcode:
|
59 |
+
originalText = manualTranslatedText['Original Text']
|
60 |
+
translatedText = manualTranslatedText['Translated Text']
|
61 |
+
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
|
62 |
+
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
|
63 |
+
# Substitute the matched word with the translated text
|
64 |
+
text = findWordRegexCompiled.sub(rf'\1{translatedText}\3', text)
|
65 |
+
return text
|
66 |
+
|
67 |
+
|
68 |
+
|
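For illustration (not part of the commit), this is roughly how the tagging helpers above behave, assuming the package imports as app.scripts.translate, the SSML_Customization files are present, and "OpenAI" is a hypothetical entry in dont_translate_phrases.txt:

from app.scripts.translate import (
    add_notranslate_tags_from_notranslate_file,
    remove_notranslate_tags,
)

line = "Subscribe to OpenAI updates."
# Wrap the protected phrase so the translation API leaves it untouched
tagged = add_notranslate_tags_from_notranslate_file(line, ["OpenAI"])
print(tagged)
# -> Subscribe to <span class="notranslate">OpenAI</span> updates.
# Strip the tags again once the translated text comes back
print(remove_notranslate_tags(tagged))
# -> Subscribe to OpenAI updates.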
69 |
+
#======================================== Translate Text ================================================
|
70 |
+
# Note: This function was almost entirely written by GPT-3 after feeding it my original code and asking it to change it so it
|
71 |
+
# would break up the text into chunks if it was too long. It appears to work
|
72 |
+
|
73 |
+
def process_response_text(text, targetLanguage):
|
74 |
+
text = html.unescape(text)
|
75 |
+
text = remove_notranslate_tags(text)
|
76 |
+
text = replace_manual_translations(text, targetLanguage)
|
77 |
+
return text
|
78 |
+
|
79 |
+
def split_transcript_chunks(text, max_length=5000):
|
80 |
+
# Calculate the total number of utf-8 codepoints
|
81 |
+
#totalCodepoints = len(text.encode("utf-8"))
|
82 |
+
|
83 |
+
# Split the transcript into sentences
|
84 |
+
sentences = re.split(r'(?<=[.!?])\s+', text)
|
85 |
+
|
86 |
+
# Initialize a list to store the chunks of text
|
87 |
+
chunks = []
|
88 |
+
|
89 |
+
# Initialize a string to store a chunk of text
|
90 |
+
chunk = ""
|
91 |
+
|
92 |
+
# For each sentence in the list of sentences
|
93 |
+
for sentence in sentences:
|
94 |
+
# If adding the sentence to the chunk would keep it within the maximum length
|
95 |
+
if len(chunk.encode("utf-8")) + len(sentence.encode("utf-8")) + 1 <= max_length: # Adding 1 to account for space
|
96 |
+
# Add the sentence to the chunk
|
97 |
+
chunk += sentence + " "
|
98 |
+
else:
|
99 |
+
# If adding the sentence would exceed the maximum length and chunk is not empty
|
100 |
+
if chunk:
|
101 |
+
# Add the chunk to the list of chunks
|
102 |
+
chunks.append(chunk.strip())
|
103 |
+
# Start a new chunk with the current sentence
|
104 |
+
chunk = sentence + " "
|
105 |
+
|
106 |
+
# Add the last chunk to the list of chunks (if it's not empty)
|
107 |
+
if chunk:
|
108 |
+
chunks.append(chunk.strip())
|
109 |
+
|
110 |
+
# Return the list of chunks
|
111 |
+
return chunks
|
112 |
+
|
113 |
+
def convertChunkListToCompatibleDict(chunkList):
|
114 |
+
# Create dictionary with numbers as keys and chunks as values
|
115 |
+
chunkDict = {}
|
116 |
+
for i, chunk in enumerate(chunkList, 1):
|
117 |
+
chunkDict[i] = {'text': chunk}
|
118 |
+
return chunkDict
|
119 |
+
|
120 |
+
|
121 |
+
# Translate the text entries of the dictionary
|
122 |
+
def translate_dictionary(inputSubsDict, langDict, translatedSrtFileName, skipTranslation=False):
|
123 |
+
targetLanguage = langDict['targetLanguage']
|
124 |
+
sourceLanguage = langDict['sourceLanguage']
|
125 |
+
translateService = langDict['translateService']
|
126 |
+
|
127 |
+
# Create a container for all the text to be translated
|
128 |
+
textToTranslate = []
|
129 |
+
|
130 |
+
for key in inputSubsDict:
|
131 |
+
originalText = inputSubsDict[key]['text']
|
132 |
+
# Add any 'notranslate' tags to the text
|
133 |
+
processedText = add_notranslate_tags_from_notranslate_file(originalText, dontTranslateList)
|
134 |
+
processedText = add_notranslate_tags_from_notranslate_file(processedText, urlList)
|
135 |
+
processedText = add_notranslate_tags_for_manual_translations(processedText, targetLanguage)
|
136 |
+
|
137 |
+
# Add the text to the list of text to be translated
|
138 |
+
textToTranslate.append(processedText)
|
139 |
+
|
140 |
+
# Calculate the total number of utf-8 codepoints
|
141 |
+
codepoints = 0
|
142 |
+
for text in textToTranslate:
|
143 |
+
codepoints += len(text.encode("utf-8"))
|
144 |
+
|
145 |
+
# Note: in this Azure-only version the whole batch is sent as a single request below
|
146 |
+
# (Other services cap request size - Google allows 30000 UTF-8 codepoints per request and DeepL 130000 - so very large batches would need to be split into chunks)
|
147 |
+
if skipTranslation == False:
|
148 |
+
if translateService == 'azure':
|
149 |
+
print("Translating text using Azure...")
|
150 |
+
result = azure_translate_text(textToTranslate, sourceLanguage, targetLanguage)
|
151 |
+
|
152 |
+
# Add the translated texts to the dictionary
|
153 |
+
for i, key in enumerate(inputSubsDict):
|
154 |
+
inputSubsDict[key]['translated_text'] = process_response_text(result[i]["text"], targetLanguage)
|
155 |
+
# Print progress, overwrite the same line
|
156 |
+
print(f' Translated: {key} of {len(inputSubsDict)}', end='\r')
|
157 |
+
else:
|
158 |
+
print("Error: Invalid translate_service setting. Only 'Azure' is supported.")
|
159 |
+
sys.exit()
|
160 |
+
else:
|
161 |
+
for key in inputSubsDict:
|
162 |
+
inputSubsDict[key]['translated_text'] = process_response_text(inputSubsDict[key]['text'], targetLanguage) # Skips translating, such as for testing
|
163 |
+
print(" ")
|
164 |
+
|
165 |
+
|
166 |
+
combinedProcessedDict = combine_subtitles_advanced(inputSubsDict, int(combine_subtitles_max_chars))
|
167 |
+
|
168 |
+
if skipTranslation == False:
|
169 |
+
# Write new srt file with translated text
|
170 |
+
with open(translatedSrtFileName, 'w', encoding='utf-8-sig') as f:
|
171 |
+
for key in combinedProcessedDict:
|
172 |
+
f.write(str(key) + '\n')
|
173 |
+
f.write(combinedProcessedDict[key]['srt_timestamps_line'] + '\n')
|
174 |
+
f.write(combinedProcessedDict[key]['translated_text'] + '\n')
|
175 |
+
f.write('\n')
|
176 |
+
|
177 |
+
return combinedProcessedDict
|
178 |
+
|
179 |
+
|
180 |
+
##### Add additional info to the dictionary for each language #####
|
181 |
+
def set_translation_info(languageBatchDict):
|
182 |
+
newBatchSettingsDict = copy.deepcopy(languageBatchDict)
|
183 |
+
|
184 |
+
# If using Azure, set all languages to use Azure in dictionary
|
185 |
+
if translate_service == 'azure':
|
186 |
+
for langNum, langInfo in languageBatchDict.items():
|
187 |
+
newBatchSettingsDict[langNum]['translate_service'] = 'azure'
|
188 |
+
newBatchSettingsDict[langNum]['formality'] = None
|
189 |
+
|
190 |
+
else:
|
191 |
+
print("Error: No valid translation service selected. Please choose a valid service or enable 'skip_translation' in config.")
|
192 |
+
sys.exit()
|
193 |
+
|
194 |
+
return newBatchSettingsDict
|
195 |
+
|
196 |
+
|
197 |
+
#======================================== Combine Subtitle Lines ================================================
|
198 |
+
def combine_subtitles_advanced(inputDict, maxCharacters=200):
|
199 |
+
charRateGoal = 20 #20
|
200 |
+
gapThreshold = 100 # The maximum gap between subtitles to combine
|
201 |
+
noMorePossibleCombines = False
|
202 |
+
# Convert dictionary to list of dictionaries of the values
|
203 |
+
entryList = []
|
204 |
+
|
205 |
+
for key, value in inputDict.items():
|
206 |
+
value['originalIndex'] = int(key)-1
|
207 |
+
entryList.append(value)
|
208 |
+
|
209 |
+
while not noMorePossibleCombines:
|
210 |
+
entryList, noMorePossibleCombines = combine_single_pass(entryList, charRateGoal, gapThreshold, maxCharacters)
|
211 |
+
|
212 |
+
# Convert the list back to a dictionary then return it
|
213 |
+
return dict(enumerate(entryList, start=1))
|
214 |
+
|
215 |
+
def combine_single_pass(entryListLocal, charRateGoal, gapThreshold, maxCharacters):
|
216 |
+
# Want to restart the loop if a change is made, so use this variable, otherwise break only if the end is reached
|
217 |
+
reachedEndOfList = False
|
218 |
+
noMorePossibleCombines = True # Will be set to False if a combination is made
|
219 |
+
|
220 |
+
# Use while loop because the list is being modified
|
221 |
+
while not reachedEndOfList:
|
222 |
+
|
223 |
+
# Need to update original index in here
|
224 |
+
for entry in entryListLocal:
|
225 |
+
entry['originalIndex'] = entryListLocal.index(entry)
|
226 |
+
|
227 |
+
# Will use later to check if an entry is the last one in the list, because the last entry will have originalIndex equal to the length of the list - 1
|
228 |
+
originalNumberOfEntries = len(entryListLocal)
|
229 |
+
|
230 |
+
# Need to calculate the char_rate for each entry, any time something changes, so put it at the top of this loop
|
231 |
+
entryListLocal = calc_list_speaking_rates(entryListLocal, charRateGoal)
|
232 |
+
|
233 |
+
# Sort the list by the difference in speaking speed from charRateGoal
|
234 |
+
priorityOrderedList = sorted(entryListLocal, key=itemgetter('char_rate_diff'), reverse=True)
|
235 |
+
|
236 |
+
# Iterates through the list in order of priority, and uses that index to operate on entryListLocal
|
237 |
+
# For loop is broken after a combination is made, so that the list can be re-sorted and re-iterated
|
238 |
+
for progress, data in enumerate(priorityOrderedList):
|
239 |
+
i = data['originalIndex']
|
240 |
+
# Check if last entry, and therefore will end loop when done with this iteration
|
241 |
+
if progress == len(priorityOrderedList) - 1:
|
242 |
+
reachedEndOfList = True
|
243 |
+
|
244 |
+
# Check if the current entry is outside the upper and lower bounds
|
245 |
+
if (data['char_rate'] > charRateGoal or data['char_rate'] < charRateGoal):
|
246 |
+
|
247 |
+
# Check if the entry is the first in entryListLocal, if so do not consider the previous entry
|
248 |
+
if data['originalIndex'] == 0:
|
249 |
+
considerPrev = False
|
250 |
+
else:
|
251 |
+
considerPrev = True
|
252 |
+
|
253 |
+
# Check if the entry is the last in entryListLocal, if so do not consider the next entry
|
254 |
+
if data['originalIndex'] == originalNumberOfEntries - 1:
|
255 |
+
considerNext = False
|
256 |
+
else:
|
257 |
+
considerNext = True
|
258 |
+
|
259 |
+
# Check if current entry is still in the list - if it has been combined with another entry, it will not be
|
260 |
+
|
261 |
+
|
262 |
+
# Get the char_rate of the next and previous entries, if they exist, and calculate the difference
|
263 |
+
# If the diff is positive, then it is lower than the current char_rate
|
264 |
+
try:
|
265 |
+
nextCharRate = entryListLocal[i+1]['char_rate']
|
266 |
+
nextDiff = data['char_rate'] - nextCharRate
|
267 |
+
except IndexError:
|
268 |
+
considerNext = False
|
269 |
+
nextCharRate = None
|
270 |
+
nextDiff = None
|
271 |
+
try:
|
272 |
+
prevCharRate = entryListLocal[i-1]['char_rate']
|
273 |
+
prevDiff = data['char_rate'] - prevCharRate
|
274 |
+
except IndexError:
|
275 |
+
considerPrev = False
|
276 |
+
prevCharRate = None
|
277 |
+
prevDiff = None
|
278 |
+
|
279 |
+
else:
|
280 |
+
continue
|
281 |
+
|
282 |
+
# Define functions for combining with previous or next entries - Generated with copilot, it's possible this isn't perfect
|
283 |
+
def combine_with_next():
|
284 |
+
entryListLocal[i]['text'] = entryListLocal[i]['text'] + ' ' + entryListLocal[i+1]['text']
|
285 |
+
entryListLocal[i]['translated_text'] = entryListLocal[i]['translated_text'] + ' ' + entryListLocal[i+1]['translated_text']
|
286 |
+
entryListLocal[i]['end_ms'] = entryListLocal[i+1]['end_ms']
|
287 |
+
entryListLocal[i]['end_ms_buffered'] = entryListLocal[i+1]['end_ms_buffered']
|
288 |
+
entryListLocal[i]['duration_ms'] = int(entryListLocal[i+1]['end_ms']) - int(entryListLocal[i]['start_ms'])
|
289 |
+
entryListLocal[i]['duration_ms_buffered'] = int(entryListLocal[i+1]['end_ms_buffered']) - int(entryListLocal[i]['start_ms_buffered'])
|
290 |
+
entryListLocal[i]['srt_timestamps_line'] = entryListLocal[i]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i+1]['srt_timestamps_line'].split(' --> ')[1]
|
291 |
+
del entryListLocal[i+1]
|
292 |
+
|
293 |
+
def combine_with_prev():
|
294 |
+
entryListLocal[i-1]['text'] = entryListLocal[i-1]['text'] + ' ' + entryListLocal[i]['text']
|
295 |
+
entryListLocal[i-1]['translated_text'] = entryListLocal[i-1]['translated_text'] + ' ' + entryListLocal[i]['translated_text']
|
296 |
+
entryListLocal[i-1]['end_ms'] = entryListLocal[i]['end_ms']
|
297 |
+
entryListLocal[i-1]['end_ms_buffered'] = entryListLocal[i]['end_ms_buffered']
|
298 |
+
entryListLocal[i-1]['duration_ms'] = int(entryListLocal[i]['end_ms']) - int(entryListLocal[i-1]['start_ms'])
|
299 |
+
entryListLocal[i-1]['duration_ms_buffered'] = int(entryListLocal[i]['end_ms_buffered']) - int(entryListLocal[i-1]['start_ms_buffered'])
|
300 |
+
entryListLocal[i-1]['srt_timestamps_line'] = entryListLocal[i-1]['srt_timestamps_line'].split(' --> ')[0] + ' --> ' + entryListLocal[i]['srt_timestamps_line'].split(' --> ')[1]
|
301 |
+
del entryListLocal[i]
|
302 |
+
|
303 |
+
|
304 |
+
# Choose whether to consider next and previous entries, and if neither then continue to next loop
|
305 |
+
if data['char_rate'] > charRateGoal:
|
306 |
+
# Check to ensure next/previous rates are lower than current rate, and the combined entry is not too long, and the gap between entries is not too large
|
307 |
+
# Need to add check for considerNext and considerPrev first, because if run other checks when there is no next/prev value to check, it will throw an error
|
308 |
+
if considerNext == False or not nextDiff or nextDiff < 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
|
309 |
+
considerNext = False
|
310 |
+
try:
|
311 |
+
if considerPrev == False or not prevDiff or prevDiff < 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
|
312 |
+
considerPrev = False
|
313 |
+
except TypeError:
|
314 |
+
considerPrev = False
|
315 |
+
|
316 |
+
elif data['char_rate'] < charRateGoal:
|
317 |
+
# Check to ensure next/previous rates are higher than current rate
|
318 |
+
if considerNext == False or not nextDiff or nextDiff > 0 or (entryListLocal[i]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i]['translated_text']) + len(entryListLocal[i+1]['translated_text']) > maxCharacters):
|
319 |
+
considerNext = False
|
320 |
+
try:
|
321 |
+
if considerPrev == False or not prevDiff or prevDiff > 0 or (entryListLocal[i-1]['break_until_next'] >= gapThreshold) or (len(entryListLocal[i-1]['translated_text']) + len(entryListLocal[i]['translated_text']) > maxCharacters):
|
322 |
+
considerPrev = False
|
323 |
+
except TypeError:
|
324 |
+
considerPrev = False
|
325 |
+
else:
|
326 |
+
continue
|
327 |
+
|
328 |
+
# Continue to next loop if neither are considered
|
329 |
+
if not considerNext and not considerPrev:
|
330 |
+
continue
|
331 |
+
|
332 |
+
# Should only reach this point if two entries are to be combined
|
333 |
+
if data['char_rate'] > charRateGoal:
|
334 |
+
# If both are to be considered, then choose the one with the lower char_rate
|
335 |
+
if considerNext and considerPrev:
|
336 |
+
if nextDiff < prevDiff:
|
337 |
+
combine_with_next()
|
338 |
+
noMorePossibleCombines = False
|
339 |
+
break
|
340 |
+
else:
|
341 |
+
combine_with_prev()
|
342 |
+
noMorePossibleCombines = False
|
343 |
+
break
|
344 |
+
# If only one is to be considered, then combine with that one
|
345 |
+
elif considerNext:
|
346 |
+
combine_with_next()
|
347 |
+
noMorePossibleCombines = False
|
348 |
+
break
|
349 |
+
elif considerPrev:
|
350 |
+
combine_with_prev()
|
351 |
+
noMorePossibleCombines = False
|
352 |
+
break
|
353 |
+
else:
|
354 |
+
print(f"Error U: Should not reach this point! Current entry = {i}")
|
355 |
+
print(f"Current Entry Text = {data['text']}")
|
356 |
+
continue
|
357 |
+
|
358 |
+
elif data['char_rate'] < charRateGoal:
|
359 |
+
# If both are to be considered, then choose the one with the higher char_rate
|
360 |
+
if considerNext and considerPrev:
|
361 |
+
if nextDiff > prevDiff:
|
362 |
+
combine_with_next()
|
363 |
+
noMorePossibleCombines = False
|
364 |
+
break
|
365 |
+
else:
|
366 |
+
combine_with_prev()
|
367 |
+
noMorePossibleCombines = False
|
368 |
+
break
|
369 |
+
# If only one is to be considered, then combine with that one
|
370 |
+
elif considerNext:
|
371 |
+
combine_with_next()
|
372 |
+
noMorePossibleCombines = False
|
373 |
+
break
|
374 |
+
elif considerPrev:
|
375 |
+
combine_with_prev()
|
376 |
+
noMorePossibleCombines = False
|
377 |
+
break
|
378 |
+
else:
|
379 |
+
print(f"Error L: Should not reach this point! Index = {i}")
|
380 |
+
print(f"Current Entry Text = {data['text']}")
|
381 |
+
continue
|
382 |
+
return entryListLocal, noMorePossibleCombines
|
383 |
+
|
384 |
+
#-- End of combine_single_pass --
|
385 |
+
|
386 |
+
#----------------------------------------------------------------------
|
387 |
+
|
388 |
+
# Calculate the number of characters per second for each subtitle entry
|
389 |
+
def calc_dict_speaking_rates(inputDict, dictKey='translated_text'):
|
390 |
+
tempDict = copy.deepcopy(inputDict)
|
391 |
+
for key, value in tempDict.items():
|
392 |
+
tempDict[key]['char_rate'] = round(len(value[dictKey]) / (int(value['duration_ms']) / 1000), 2)
|
393 |
+
return tempDict
|
394 |
+
|
395 |
+
def calc_list_speaking_rates(inputList, charRateGoal, dictKey='translated_text'):
|
396 |
+
tempList = copy.deepcopy(inputList)
|
397 |
+
for i in range(len(tempList)):
|
398 |
+
# Calculate the number of characters per second based on the duration of the entry
|
399 |
+
tempList[i]['char_rate'] = round(len(tempList[i][dictKey]) / (int(tempList[i]['duration_ms']) / 1000), 2)
|
400 |
+
# Calculate the difference between the current char_rate and the goal char_rate - Absolute Value
|
401 |
+
tempList[i]['char_rate_diff'] = abs(round(tempList[i]['char_rate'] - charRateGoal, 2))
|
402 |
+
return tempList
|
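A minimal sketch (not part of the commit) of how this module might be driven end to end, assuming it is importable as app.scripts.translate and the SSML_Customization files are in place. The subtitle dictionary below is hypothetical but uses the keys this code reads (text, start_ms, end_ms, duration_ms, srt_timestamps_line, break_until_next, and the buffered variants); skipTranslation=True exercises the combining pass without calling Azure:

from app.scripts.translate import translate_dictionary

langDict = {
    'targetLanguage': 'es',      # hypothetical target language
    'sourceLanguage': 'en',
    'translateService': 'azure',
}

subsDict = {
    '1': {
        'text': 'Hello and welcome to the channel.',
        'start_ms': '0', 'end_ms': '1800',
        'start_ms_buffered': '0', 'end_ms_buffered': '1900',
        'duration_ms': '1800', 'duration_ms_buffered': '1900',
        'srt_timestamps_line': '00:00:00,000 --> 00:00:01,800',
        'break_until_next': 50,
    },
    '2': {
        'text': 'Today we are testing the subtitle combiner.',
        'start_ms': '1900', 'end_ms': '4000',
        'start_ms_buffered': '1850', 'end_ms_buffered': '4100',
        'duration_ms': '2100', 'duration_ms_buffered': '2250',
        'srt_timestamps_line': '00:00:01,900 --> 00:00:04,000',
        'break_until_next': 0,
    },
}

# With skipTranslation=True the original text is copied into 'translated_text',
# no SRT file is written, and only the subtitle-combining pass runs.
combined = translate_dictionary(subsDict, langDict, 'translated.srt', skipTranslation=True)
for num, entry in combined.items():
    print(num, entry['srt_timestamps_line'], entry['translated_text'])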
app/scripts/utils.py
ADDED
@@ -0,0 +1,56 @@
1 |
+
import csv
|
2 |
+
|
3 |
+
# Interprets a string as a boolean. Returns True or False
|
4 |
+
def parseBool(string, silent=False):
|
5 |
+
if type(string) == str:
|
6 |
+
if string.lower() == 'true':
|
7 |
+
return True
|
8 |
+
elif string.lower() == 'false':
|
9 |
+
return False
|
10 |
+
else:
|
11 |
+
if not silent:
|
12 |
+
raise ValueError(f'Invalid value "{string}". Must be "True" or "False"')
|
13 |
+
elif silent:
|
14 |
+
return string
|
15 |
+
elif type(string) == bool:
|
16 |
+
if string == True:
|
17 |
+
return True
|
18 |
+
elif string == False:
|
19 |
+
return False
|
20 |
+
else:
|
21 |
+
raise ValueError('Not a valid boolean string')
|
22 |
+
|
23 |
+
def parseConfigSetting(setting):
|
24 |
+
# Remove any quotes user may have added in config file
|
25 |
+
setting = setting.strip("\"").strip("\'")
|
26 |
+
|
27 |
+
# Check if it is a boolean
|
28 |
+
if type(parseBool(setting, silent=True)) == bool:
|
29 |
+
return parseBool(setting, silent=True)
|
30 |
+
|
31 |
+
# Check if it is an integer
|
32 |
+
try:
|
33 |
+
return int(setting)
|
34 |
+
except ValueError:
|
35 |
+
pass
|
36 |
+
|
37 |
+
# Otherwise return the string in lower case
|
38 |
+
return setting.lower()
|
39 |
+
|
40 |
+
# Returns a list of dictionaries from a csv file. Where the key is the column name and the value is the value in that column
|
41 |
+
# The column names are set by the first row of the csv file
|
42 |
+
def csv_to_dict(csvFilePath):
|
43 |
+
with open(csvFilePath, "r", encoding='utf-8-sig') as data:
|
44 |
+
entriesDictsList = []
|
45 |
+
for line in csv.DictReader(data):
|
46 |
+
entriesDictsList.append(line)
|
47 |
+
return entriesDictsList
|
48 |
+
|
49 |
+
# Returns a list of strings from a txt file. Ignores empty lines and lines that start with '#'
|
50 |
+
def txt_to_list(txtFilePath):
|
51 |
+
with open(txtFilePath, "r", encoding='utf-8-sig') as data:
|
52 |
+
entriesList = []
|
53 |
+
for line in data:
|
54 |
+
if line.strip() != '' and line.strip()[0] != '#':
|
55 |
+
entriesList.append(line.strip())
|
56 |
+
return entriesList
|
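A short sketch (not part of the commit) of how these helpers might be used, assuming the package imports as app.scripts; the file names are hypothetical:

from app.scripts import utils

print(utils.parseBool('True'))              # True
print(utils.parseConfigSetting('"25"'))     # 25  (quotes stripped, parsed as an int)
print(utils.parseConfigSetting("'Azure'"))  # azure  (falls through to a lower-cased string)

# Assuming these example files exist next to the caller:
rows = utils.csv_to_dict('example.csv')     # list of {column name: value} dicts, header row as keys
phrases = utils.txt_to_list('phrases.txt')  # stripped lines, skipping blanks and '#' comments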
requirements.txt
ADDED
@@ -0,0 +1,43 @@
1 |
+
annotated-types==0.5.0
|
2 |
+
anyio==3.7.1
|
3 |
+
azure-cognitiveservices-speech==1.30.0
|
4 |
+
boto3==1.28.18
|
5 |
+
botocore==1.31.18
|
6 |
+
Brotli==1.0.9
|
7 |
+
certifi==2023.7.22
|
8 |
+
cffi==1.15.1
|
9 |
+
charset-normalizer==3.2.0
|
10 |
+
click==8.0.3
|
11 |
+
essentials==1.1.4
|
12 |
+
fastapi==0.100.1
|
13 |
+
h11==0.14.0
|
14 |
+
idna==3.4
|
15 |
+
jmespath==1.0.1
|
16 |
+
langcodes==3.3.0
|
17 |
+
language-data==1.1
|
18 |
+
marisa-trie==0.7.8
|
19 |
+
mutagen==1.46.0
|
20 |
+
numpy==1.25.2
|
21 |
+
pycparser==2.21
|
22 |
+
pycryptodomex==3.18.0
|
23 |
+
pydantic==2.1.1
|
24 |
+
pydantic_core==2.4.0
|
25 |
+
pydub==0.25.1
|
26 |
+
pyrubberband==0.3.0
|
27 |
+
PySoundFile==0.9.0.post1
|
28 |
+
python-dateutil==2.8.2
|
29 |
+
python-dotenv==0.19.2
|
30 |
+
python-multipart==0.0.6
|
31 |
+
regex==2023.6.3
|
32 |
+
requests==2.31.0
|
33 |
+
s3transfer==0.6.1
|
34 |
+
six==1.16.0
|
35 |
+
sniffio==1.3.0
|
36 |
+
soundfile==0.12.1
|
37 |
+
starlette==0.27.0
|
38 |
+
typing_extensions==4.7.1
|
39 |
+
urllib3==1.26.16
|
40 |
+
uvicorn==0.23.2
|
41 |
+
websockets==11.0.3
|
42 |
+
yt-dlp==2023.7.6
|
43 |
+
youtube_transcript_api
|