drewThomasson committed on
Commit
aecf640
1 Parent(s): b4a108c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -10
app.py CHANGED
@@ -5,6 +5,48 @@ import argparse
5
  language_options = [
6
  "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
7
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Convert the list of languages to a string to display in the help text
10
  language_options_str = ", ".join(language_options)
@@ -492,17 +534,26 @@ def split_long_sentence(sentence, language='en', max_pauses=10):
492
  :param max_pauses: Maximum allowed number of pauses in a sentence.
493
  :return: A list of sentence parts that meet the criteria.
494
  """
495
- # Adjust the max_length and punctuation symbols based on language
 
 
 
496
  if language == 'zh-cn':
497
- max_length = 82 # Chinese-specific max length
498
- punctuation = [',', '。', ';', '!', '?'] # Chinese-specific punctuation
499
- elif language == 'it':
500
- max_length = 213 # Italian-specific max length
501
- punctuation = [',', ';', '.'] # Standard punctuation
 
 
 
 
502
  else:
503
- max_length = 249 # Default max length for other languages
504
- punctuation = [',', ';', '.'] # Default punctuation
505
 
 
 
506
  parts = []
507
  while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses:
508
  possible_splits = [i for i, char in enumerate(sentence) if char in punctuation and i < max_length]
@@ -568,7 +619,15 @@ def convert_chapters_to_audio_custom_model(chapters_dir, output_audio_dir, targe
568
 
569
  with open(chapter_path, 'r', encoding='utf-8') as file:
570
  chapter_text = file.read()
571
- sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
 
 
 
 
 
 
 
 
572
  for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
573
  fragments = split_long_sentence(sentence, language=language)
574
  for fragment in fragments:
@@ -615,7 +674,15 @@ def convert_chapters_to_audio_standard_model(chapters_dir, output_audio_dir, tar
615
 
616
  with open(chapter_path, 'r', encoding='utf-8') as file:
617
  chapter_text = file.read()
618
- sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
 
 
 
 
 
 
 
 
619
  for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
620
  fragments = split_long_sentence(sentence, language=language)
621
  for fragment in fragments:
 
5
  language_options = [
6
  "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"
7
  ]
8
+ char_limits = {
9
+ "en": 250, # English
10
+ "es": 239, # Spanish
11
+ "fr": 273, # French
12
+ "de": 253, # German
13
+ "it": 213, # Italian
14
+ "pt": 203, # Portuguese
15
+ "pl": 224, # Polish
16
+ "tr": 226, # Turkish
17
+ "ru": 182, # Russian
18
+ "nl": 251, # Dutch
19
+ "cs": 186, # Czech
20
+ "ar": 166, # Arabic
21
+ "zh-cn": 82, # Chinese (Simplified)
22
+ "ja": 71, # Japanese
23
+ "hu": 224, # Hungarian
24
+ "ko": 95, # Korean
25
+ }
26
+
27
+ # Mapping of language codes to NLTK's supported language names
28
+ language_mapping = {
29
+ "en": "english",
30
+ "de": "german",
31
+ "fr": "french",
32
+ "es": "spanish",
33
+ "it": "italian",
34
+ "pt": "portuguese",
35
+ "nl": "dutch",
36
+ "pl": "polish",
37
+ "cs": "czech",
38
+ "ru": "russian",
39
+ "tr": "turkish",
40
+ "el": "greek",
41
+ "et": "estonian",
42
+ "no": "norwegian",
43
+ "ml": "malayalam",
44
+ "sl": "slovene",
45
+ "da": "danish",
46
+ "fi": "finnish",
47
+ "sv": "swedish"
48
+ }
49
+
50
 
51
  # Convert the list of languages to a string to display in the help text
52
  language_options_str = ", ".join(language_options)
 
534
  :param max_pauses: Maximum allowed number of pauses in a sentence.
535
  :return: A list of sentence parts that meet the criteria.
536
  """
537
+ # Get the max character length for the selected language, minus 2; defaults to 248 (250 - 2) if the language is not found
538
+ max_length = (char_limits.get(language, 250)-2)
539
+
540
+ # Adjust the pause punctuation symbols based on language
541
  if language == 'zh-cn':
542
+ punctuation = [',', '。', ';', '?', '!'] # Chinese-specific pause punctuation including sentence-ending marks
543
+ elif language == 'ja':
544
+ punctuation = ['、', '。', ';', '?', '!'] # Japanese-specific pause punctuation
545
+ elif language == 'ko':
546
+ punctuation = [',', '。', '？', '！', '!'] # Korean-specific pause punctuation
547
+ elif language == 'ar':
548
+ punctuation = ['،', '؛', '؟', '!', '·', '؛', '.'] # Arabic-specific punctuation
549
+ elif language == 'en':
550
+ punctuation = [',', ';', '.'] # English-specific pause punctuation
551
  else:
552
+ # Default pause punctuation for other languages (es, fr, de, it, pt, pl, cs, ru, nl, tr, hu)
553
+ punctuation = [',', '.', ';', ':', '?', '!']
554
 
555
+
556
+
557
  parts = []
558
  while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses:
559
  possible_splits = [i for i, char in enumerate(sentence) if char in punctuation and i < max_length]
 
619
 
620
  with open(chapter_path, 'r', encoding='utf-8') as file:
621
  chapter_text = file.read()
622
+ # Check if the language code is supported
623
+ nltk_language = language_mapping.get(language)
624
+ if nltk_language:
625
+ # If the language is supported, tokenize using sent_tokenize
626
+ sentences = sent_tokenize(chapter_text, language=nltk_language)
627
+ else:
628
+ # If the language is not supported, handle it (e.g., return the text unchanged)
629
+ sentences = [chapter_text] # No tokenization, just wrap the text in a list
630
+ #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
631
  for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
632
  fragments = split_long_sentence(sentence, language=language)
633
  for fragment in fragments:
 
674
 
675
  with open(chapter_path, 'r', encoding='utf-8') as file:
676
  chapter_text = file.read()
677
+ # Check if the language code is supported
678
+ nltk_language = language_mapping.get(language)
679
+ if nltk_language:
680
+ # If the language is supported, tokenize using sent_tokenize
681
+ sentences = sent_tokenize(chapter_text, language=nltk_language)
682
+ else:
683
+ # If the language is not supported, handle it (e.g., return the text unchanged)
684
+ sentences = [chapter_text] # No tokenization, just wrap the text in a list
685
+ #sentences = sent_tokenize(chapter_text, language='italian' if language == 'it' else 'english')
686
  for sentence in tqdm(sentences, desc=f"Chapter {chapter_num}"):
687
  fragments = split_long_sentence(sentence, language=language)
688
  for fragment in fragments: