Plachta committed
Commit c0d010f
1 Parent(s): f32f220

Replaced Encodec with Vocos
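For context: Vocos is a vocoder that can reconstruct a waveform directly from EnCodec tokens, which is what makes this drop-in replacement possible. A minimal sketch of the decode path, following the vocos README (the token shape and bandwidth_id value below are illustrative assumptions, not values taken from this repo):

import torch
from vocos import Vocos

# Vocos checkpoint trained to invert EnCodec tokens at 24 kHz.
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")

# Placeholder EnCodec codes: 8 codebooks x 200 frames (illustrative shape).
codes = torch.randint(0, 1024, (8, 200))

# Map discrete codes to continuous features, then vocode to audio.
features = vocos.codes_to_features(codes)
bandwidth_id = torch.tensor([2])  # index into [1.5, 3.0, 6.0, 12.0] kbps
audio = vocos.decode(features, bandwidth_id=bandwidth_id)  # (1, num_samples)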

app.py CHANGED
@@ -36,7 +36,7 @@ import gradio as gr
 from vocos import Vocos
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-from utils.sentence_cutter import split_text_into_sentences
+
 
 torch._C._jit_set_profiling_executor(False)
 torch._C._jit_set_profiling_mode(False)
@@ -331,6 +331,7 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
     fixed-prompt: This mode will keep using the same prompt the user has provided, and generate audio sentence by sentence.
     sliding-window: This mode will use the last sentence as the prompt for the next sentence, but has some concern on speaker maintenance.
     """
+    from utils.sentence_cutter import split_text_into_sentences
     if len(text) > 1000:
         return "Rejected, Text too long (should be less than 1000 characters)", None
     mode = 'fixed-prompt'
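The two modes named in the infer_long_text docstring differ only in how the prompt is updated between sentences. A minimal sketch, assuming hypothetical helpers generate_one (per-sentence synthesis) and make_prompt_from (prompt extraction), neither of which is part of app.py:

import numpy as np

def infer_long_text_sketch(text, prompt, mode='fixed-prompt'):
    sentences = split_text_into_sentences(text)
    chunks = []
    for sentence in sentences:
        audio = generate_one(sentence, prompt)  # hypothetical synthesis call
        chunks.append(audio)
        if mode == 'sliding-window':
            # Reuse the sentence just generated as the next prompt; this adapts
            # over long passages but can drift away from the original speaker.
            prompt = make_prompt_from(sentence, audio)  # hypothetical
        # 'fixed-prompt' keeps the user's prompt unchanged for every sentence.
    return np.concatenate(chunks)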
utils/g2p/english.py CHANGED
@@ -19,7 +19,6 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 import re
 from unidecode import unidecode
 import inflect
-import eng_to_ipa as ipa
 _inflect = inflect.engine()
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -158,6 +157,7 @@ def mark_dark_l(text):
 
 
 def english_to_ipa(text):
+    import eng_to_ipa as ipa
     text = unidecode(text).lower()
     text = expand_abbreviations(text)
     text = normalize_numbers(text)
utils/g2p/japanese.py CHANGED
@@ -1,6 +1,5 @@
 import re
 from unidecode import unidecode
-import pyopenjtalk
 
 
 
@@ -74,7 +73,7 @@ def symbols_to_japanese(text):
 
 def japanese_to_romaji_with_accent(text):
     '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
-
+    import pyopenjtalk
     text = symbols_to_japanese(text)
     sentences = re.split(_japanese_marks, text)
     marks = re.findall(_japanese_marks, text)
utils/g2p/mandarin.py CHANGED
@@ -4,7 +4,6 @@ import re
 import jieba
 import cn2an
 import logging
-from pypinyin import lazy_pinyin, BOPOMOFO
 
 
 # List of (Latin alphabet, bopomofo) pairs:
@@ -241,7 +240,7 @@ def number_to_chinese(text):
 
 
 def chinese_to_bopomofo(text):
-
+    from pypinyin import lazy_pinyin, BOPOMOFO
     text = text.replace('、', ',').replace(';', ',').replace(':', ',')
     words = jieba.lcut(text, cut_all=False)
     text = ''
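All three g2p edits above apply the same pattern: a heavy, language-specific dependency (eng_to_ipa, pyopenjtalk, pypinyin) moves from module scope into the one function that needs it, so importing the module no longer pays for backends a caller never uses. The general shape, with my_heavy_dep as a placeholder:

import re  # cheap, always-needed imports stay at module scope

def convert(text):
    # Deferred import: resolved on first call, then cached in sys.modules,
    # and skipped entirely if this code path is never exercised.
    import my_heavy_dep
    return my_heavy_dep.process(text)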
utils/sentence_cutter.py CHANGED
@@ -40,16 +40,4 @@ def split_text_into_sentences(text):
 
     return sentences
 
-    raise RuntimeError("It is impossible to reach here.")
-
-long_text = """
-This is a very long paragraph, so most TTS model is unable to handle it. Hence, we have to split it into several sentences. With the help of NLTK, we can split it into sentences. However, the punctuation is not preserved, so we have to add it back. How are we going to do write this code? Let's see.
-"""
-
-long_text = """
-现在我们要来尝试一下中文分句。因为很不幸的是,NLTK不支持中文分句。幸运的是,我们可以使用jieba来分句。但是,jieba分句后,标点符号会丢失,所以我们要手动添加回去。我现在正在想办法把这个例句写的更长更复杂一点,来测试jieba分句的性能。嗯......省略号,感觉不太好,因为省略号不是句号,所以jieba不会把它当作句子的结尾。会这样吗?我们来试试看。
-"""
-
-long_text = """
-これなら、英語と中国語の分句もできる。でも、日本語はどうする?まつわ、ChatGPTに僕と教えてください。ちょーと待ってください。あ、出来た!
-"""
+    raise RuntimeError("It is impossible to reach here.")