drewThomasson committed aecf640
Parent(s): b4a108c
Update app.py
app.py CHANGED
@@ -5,6 +5,48 @@ import argparse
 language_options = [
     "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
 ]
+char_limits = {
+    "en": 250,      # English
+    "es": 239,      # Spanish
+    "fr": 273,      # French
+    "de": 253,      # German
+    "it": 213,      # Italian
+    "pt": 203,      # Portuguese
+    "pl": 224,      # Polish
+    "tr": 226,      # Turkish
+    "ru": 182,      # Russian
+    "nl": 251,      # Dutch
+    "cs": 186,      # Czech
+    "ar": 166,      # Arabic
+    "zh-cn": 82,    # Chinese (Simplified)
+    "ja": 71,       # Japanese
+    "hu": 224,      # Hungarian
+    "ko": 95,       # Korean
+}
+
+# Mapping of language codes to NLTK's supported language names
+language_mapping = {
+    "en": "english",
+    "de": "german",
+    "fr": "french",
+    "es": "spanish",
+    "it": "italian",
+    "pt": "portuguese",
+    "nl": "dutch",
+    "pl": "polish",
+    "cs": "czech",
+    "ru": "russian",
+    "tr": "turkish",
+    "el": "greek",
+    "et": "estonian",
+    "no": "norwegian",
+    "ml": "malayalam",
+    "sl": "slovene",
+    "da": "danish",
+    "fi": "finnish",
+    "sv": "swedish"
+}
+
 
 # Convert the list of languages to a string to display in the help text
 language_options_str = ", ".join(language_options)
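The two new module-level tables are plain dict lookups keyed by the same codes as language_options. A minimal sketch of how downstream code reads them with safe defaults; the helper name and the excerpted tables below are illustrative only, not part of the commit:

# Sketch only: shows the lookup pattern the new tables enable.
# describe_language is a hypothetical helper, not defined in app.py.
char_limits = {"en": 250, "zh-cn": 82, "ja": 71}        # excerpt of the full table
language_mapping = {"en": "english", "de": "german"}    # excerpt of the full table

def describe_language(code: str) -> str:
    max_chars = char_limits.get(code, 250)     # fall back to 250 for unknown codes
    nltk_name = language_mapping.get(code)     # None when NLTK has no Punkt model
    tokenizer = nltk_name if nltk_name else "none (text kept as a single chunk)"
    return f"{code}: TTS chunk limit {max_chars} chars, NLTK tokenizer: {tokenizer}"

print(describe_language("en"))   # en: TTS chunk limit 250 chars, NLTK tokenizer: english
print(describe_language("ja"))   # ja: TTS chunk limit 71 chars, NLTK tokenizer: none (...)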
@@ -492,17 +534,26 @@ def split_long_sentence(sentence, language='en', max_pauses=10):
     :param max_pauses: Maximum allowed number of pauses in a sentence.
     :return: A list of sentence parts that meet the criteria.
     """
-    #
+    #Get the Max character length for the selected language -2 : with a default of 248 if no language is found
+    max_length = (char_limits.get(language, 250)-2)
+
+    # Adjust the pause punctuation symbols based on language
     if language == 'zh-cn':
-
-
-
-
-        punctuation = [',', '
+        punctuation = [',', '。', ';', '?', '!']  # Chinese-specific pause punctuation including sentence-ending marks
+    elif language == 'ja':
+        punctuation = ['、', '。', ';', '?', '!']  # Japanese-specific pause punctuation
+    elif language == 'ko':
+        punctuation = [',', '。', ';', '?', '!']  # Korean-specific pause punctuation
+    elif language == 'ar':
+        punctuation = ['،', '؛', '؟', '!', '·', '؛', '.']  # Arabic-specific punctuation
+    elif language == 'en':
+        punctuation = [',', ';', '.']  # English-specific pause punctuation
     else:
-
-        punctuation = [',', ';', '
+        # Default pause punctuation for other languages (es, fr, de, it, pt, pl, cs, ru, nl, tr, hu)
+        punctuation = [',', '.', ';', ':', '?', '!']
 
+
+
     parts = []
     while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses:
         possible_splits = [i for i, char in enumerate(sentence) if char in punctuation and i < max_length]
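This hunk ties the splitter to the new char_limits table: the budget becomes char_limits.get(language, 250) - 2, and the pause characters now vary per language. A self-contained sketch of that splitting idea, assuming only the excerpted limits shown inline; split_for_tts is an illustrative name, and app.py's real split_long_sentence additionally enforces the max_pauses cap:

# Minimal re-implementation sketch of the splitting idea in the hunk above.
def split_for_tts(sentence: str, language: str = "en") -> list[str]:
    char_limits = {"en": 250, "zh-cn": 82, "ja": 71}   # excerpt; 250 is the default
    max_length = char_limits.get(language, 250) - 2
    if language == "zh-cn":
        punctuation = [',', '。', ';', '?', '!']
    elif language == "en":
        punctuation = [',', ';', '.']
    else:
        punctuation = [',', '.', ';', ':', '?', '!']

    parts = []
    while len(sentence) > max_length:
        # Cut at the last pause mark that still fits inside the limit,
        # otherwise fall back to a hard cut at max_length.
        splits = [i for i, ch in enumerate(sentence) if ch in punctuation and i < max_length]
        cut = (splits[-1] + 1) if splits else max_length
        parts.append(sentence[:cut].strip())
        sentence = sentence[cut:].strip()
    parts.append(sentence)
    return parts

print(split_for_tts("A short sentence."))   # ['A short sentence.'] -- under the limit, left intact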
@@ -568,7 +619,15 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
 
         with open(chapter_path, 'r', encoding='utf-8') as file:
             chapter_text = file.read()
-            sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
+            # Check if the language code is supported
+            nltk_language = language_mapping.get(language)
+            if nltk_language:
+                # If the language is supported, tokenize using sent_tokenize
+                sentences = sent_tokenize(chapter_text, language=nltk_language)
+            else:
+                # If the language is not supported, handle it (e.g., return the text unchanged)
+                sentences = [chapter_text]  # No tokenization, just wrap the text in a list
+            #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
             for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
                 fragments = split_long_sentence(sentence, language=language)
                 for fragment in fragments:
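The added block prefers NLTK's Punkt sentence tokenizer whenever language_mapping has an entry and otherwise passes the chapter through untouched. A minimal sketch of that fallback, assuming nltk is installed and its 'punkt' data has been downloaded (e.g. via nltk.download('punkt')); to_sentences and the excerpted mapping are illustrative, not app.py names:

# Sketch of the tokenize-or-passthrough pattern added in the hunk above.
from nltk.tokenize import sent_tokenize

language_mapping = {"en": "english", "it": "italian"}   # excerpt of the table above

def to_sentences(text: str, language: str) -> list[str]:
    nltk_language = language_mapping.get(language)
    if nltk_language:
        return sent_tokenize(text, language=nltk_language)
    # No Punkt model for this code (e.g. 'ja', 'zh-cn'): keep the text whole
    # and let the character-limit splitter chop it instead.
    return [text]

print(to_sentences("Hello there. General Kenobi.", "en"))   # two sentences
print(to_sentences("こんにちは。元気ですか。", "ja"))          # one untouched string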
@@ -615,7 +674,15 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
 
         with open(chapter_path, 'r', encoding='utf-8') as file:
             chapter_text = file.read()
-            sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
+            # Check if the language code is supported
+            nltk_language = language_mapping.get(language)
+            if nltk_language:
+                # If the language is supported, tokenize using sent_tokenize
+                sentences = sent_tokenize(chapter_text, language=nltk_language)
+            else:
+                # If the language is not supported, handle it (e.g., return the text unchanged)
+                sentences = [chapter_text]  # No tokenization, just wrap the text in a list
+            #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
             for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
                 fragments = split_long_sentence(sentence, language=language)
                 for fragment in fragments:
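The same tokenize-or-passthrough block is applied here to the standard-model path. A quick self-contained check of how the two changes interact for a language without a Punkt model (Japanese): the whole chapter flows through as one "sentence", so the char_limits entry alone has to bound the TTS chunk size. naive_split below only mirrors the splitting idea; split_long_sentence in app.py is the real implementation:

def naive_split(text: str, max_length: int) -> list[str]:
    # Illustrative splitter: cut at the last Japanese pause mark before the limit.
    pauses = {'、', '。'}
    parts = []
    while len(text) > max_length:
        cuts = [i for i, ch in enumerate(text) if ch in pauses and i < max_length]
        cut = (cuts[-1] + 1) if cuts else max_length
        parts.append(text[:cut])
        text = text[cut:]
    parts.append(text)
    return parts

chapter = "むかしむかし、あるところにおじいさんとおばあさんが住んでいました。" * 20
sentences = [chapter]          # fallback path: no Punkt model for 'ja'
max_length = 71 - 2            # char_limits['ja'] minus the 2-character margin
fragments = [f for s in sentences for f in naive_split(s, max_length)]
assert all(len(f) <= max_length for f in fragments)
print(f"{len(fragments)} fragments, longest {max(map(len, fragments))} chars")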