Spaces:

Flux9665
/

PoeticTTS

Running

App Files

Flux9665 committed on Nov 3, 2022

Commit

7466150

1 Parent(s): 5489b55

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -56

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ import numpy as np
 import torch
 from InferenceInterfaces.UtteranceCloner import UtteranceCloner
-from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
@@ -38,11 +37,6 @@ class TTS_Interface:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
-        # for simplicity, since we are using an oracle for this demo, and we have seen enough German data to get by without word boundaries
-        self.utterance_cloner.tf.use_word_boundaries = False
-        self.utterance_cloner.tts.text2phone.use_word_boundaries = False
         self.utterance_cloner.tts.set_language("de")
         self.acoustic_model = Aligner()
         self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
@@ -52,6 +46,9 @@ class TTS_Interface:
         reference_audio = "reference_audios/2.wav"
         self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
         self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
         #######
         self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
@@ -95,35 +92,35 @@ class TTS_Interface:
         duration = self.duration.clone()
         # lengthening
         lenghtening_candidates = [
-            # ('f', 27),
-            # ('l', 28),
-            ('ʏ', 29),
-            ('ç', 30),
-            # ('t', 31),
-            ('ɪ', 32),
-            # ('ɡ', 33),
-            ('ə', 34),
-            ('n', 35),
-            # ('z', 66),
-            ('ɑ', 67),
-            # ('ə', 68),
-            ('n', 69),
-            # ('b', 84),
-            ('e', 85),
-            # ('p', 86),
-            # ('t', 87),
-            ('ə', 88)
             ]
         for lenghtening_candidate in lenghtening_candidates:
             duration[lenghtening_candidate[1]] = duration[lenghtening_candidate[1]] + lengthening
         # pauses
-        pause_candidates = [('~', 36),
-                            ('~', 70),
-                            ('~', 89)]
         for pause_candidate in pause_candidates:
             duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
@@ -132,38 +129,38 @@ class TTS_Interface:
         # pitch raise
         pitch_candidates = [
-            # ('k', 37),
-            ('y', 38),
-            ('l', 39),
-            ('ə', 40),
-            ('ʃ', 41),
-            ('a', 42),
-            ('t', 43),
-            # ('ə', 44),
-            # ('n', 45),
-            ('a', 71),
-            ('l', 72),
-            ('v', 96),
-            ('ɛ', 97),
-            ('l', 98),
-            # ('ə', 99),
-            # ('n', 100)
             ]
         for pitch_candidate in pitch_candidates:
             pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
-        fixme = [('f', 27),
-                 ('l', 28),
-                 ('ʏ', 29),
-                 ('ç', 30),
-                 ('t', 31),
-                 ('ɪ', 32),
-                 ('ɡ', 33),
-                 ('ə', 34),
-                 ('n', 35)
                  ]
         for pitch_candidate in fixme:
             pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)

 import torch
 from InferenceInterfaces.UtteranceCloner import UtteranceCloner
 from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
 from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
         self.utterance_cloner.tts.set_language("de")
         self.acoustic_model = Aligner()
         self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
         reference_audio = "reference_audios/2.wav"
         self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
         self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
+        print(self.phones)
+        for index, phone in enumerate(self.phones):
+            print(index, phone)
         #######
         self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
         duration = self.duration.clone()
         # lengthening
         lenghtening_candidates = [
+            # ('f', 33),
+            # ('l', 34),
+            ('ʏ', 35),
+            ('ç', 36),
+            # ('t', 37),
+            ('ɪ', 38),
+            # ('ɡ', 39),
+            ('ə', 40),
+            ('n', 41),
+            # ('z', 79),
+            ('ɑ', 80),
+            # ('ə', 81),
+            ('n', 82),
+            # ('b', 103),
+            ('e', 104),
+            # ('p', 105),
+            # ('t', 106),
+            ('ə', 107)
             ]
         for lenghtening_candidate in lenghtening_candidates:
             duration[lenghtening_candidate[1]] = duration[lenghtening_candidate[1]] + lengthening
         # pauses
+        pause_candidates = [('~', 42),
+                            ('~', 83),
+                            ('~', 108)]
         for pause_candidate in pause_candidates:
             duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
         # pitch raise
         pitch_candidates = [
+            # ('k', 44),
+            ('y', 45),
+            ('l', 46),
+            ('ə', 47),
+            ('ʃ', 49),
+            ('a', 50),
+            ('t', 51),
+            # ('ə', 52),
+            # ('n', 53),
+            ('a', 85),
+            ('l', 86),
+            ('v', 118),
+            ('ɛ', 119),
+            ('l', 120),
+            # ('ə', 121),
+            # ('n', 122)
             ]
         for pitch_candidate in pitch_candidates:
             pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
+        fixme = [('f', 33),
+                 ('l', 34),
+                 ('ʏ', 35),
+                 ('ç', 36),
+                 ('t', 37),
+                 ('ɪ', 38),
+                 ('ɡ', 39),
+                 ('ə', 40),
+                 ('n', 41)
                  ]
         for pitch_candidate in fixme:
             pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)