Flux9665 committed on
Commit
7466150
1 Parent(s): 5489b55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -56
app.py CHANGED
@@ -12,7 +12,6 @@ import numpy as np
12
  import torch
13
 
14
  from InferenceInterfaces.UtteranceCloner import UtteranceCloner
15
- from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
16
  from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
17
  from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
18
 
@@ -38,11 +37,6 @@ class TTS_Interface:
38
  def __init__(self):
39
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
40
  self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
41
-
42
- # for simplicity, since we are using an oracle for this demo, and we have seen enough German data to get by without word boundaries
43
- self.utterance_cloner.tf.use_word_boundaries = False
44
- self.utterance_cloner.tts.text2phone.use_word_boundaries = False
45
-
46
  self.utterance_cloner.tts.set_language("de")
47
  self.acoustic_model = Aligner()
48
  self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
@@ -52,6 +46,9 @@ class TTS_Interface:
52
  reference_audio = "reference_audios/2.wav"
53
  self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
54
  self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
 
 
 
55
 
56
  #######
57
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
@@ -95,35 +92,35 @@ class TTS_Interface:
95
  duration = self.duration.clone()
96
  # lengthening
97
  lenghtening_candidates = [
98
- # ('f', 27),
99
- # ('l', 28),
100
- ('ʏ', 29),
101
- ('ç', 30),
102
- # ('t', 31),
103
- ('ɪ', 32),
104
- # ('ɡ', 33),
105
- ('ə', 34),
106
- ('n', 35),
107
-
108
- # ('z', 66),
109
- ('ɑ', 67),
110
- # ('ə', 68),
111
- ('n', 69),
112
-
113
- # ('b', 84),
114
- ('e', 85),
115
- # ('p', 86),
116
- # ('t', 87),
117
- ('ə', 88)
118
  ]
119
 
120
  for lenghtening_candidate in lenghtening_candidates:
121
  duration[lenghtening_candidate[1]] = duration[lenghtening_candidate[1]] + lengthening
122
 
123
  # pauses
124
- pause_candidates = [('~', 36),
125
- ('~', 70),
126
- ('~', 89)]
127
 
128
  for pause_candidate in pause_candidates:
129
  duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
@@ -132,38 +129,38 @@ class TTS_Interface:
132
  # pitch raise
133
 
134
  pitch_candidates = [
135
- # ('k', 37),
136
- ('y', 38),
137
- ('l', 39),
138
- ('ə', 40),
139
- ('ʃ', 41),
140
- ('a', 42),
141
- ('t', 43),
142
- # ('ə', 44),
143
- # ('n', 45),
144
-
145
- ('a', 71),
146
- ('l', 72),
147
-
148
- ('v', 96),
149
- ('ɛ', 97),
150
- ('l', 98),
151
- # ('ə', 99),
152
- # ('n', 100)
153
  ]
154
 
155
  for pitch_candidate in pitch_candidates:
156
  pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
157
 
158
- fixme = [('f', 27),
159
- ('l', 28),
160
- ('ʏ', 29),
161
- ('ç', 30),
162
- ('t', 31),
163
- ('ɪ', 32),
164
- ('ɡ', 33),
165
- ('ə', 34),
166
- ('n', 35)
167
  ]
168
  for pitch_candidate in fixme:
169
  pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)
 
12
  import torch
13
 
14
  from InferenceInterfaces.UtteranceCloner import UtteranceCloner
 
15
  from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
16
  from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
17
 
 
37
  def __init__(self):
38
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
39
  self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
 
 
 
 
 
40
  self.utterance_cloner.tts.set_language("de")
41
  self.acoustic_model = Aligner()
42
  self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
 
46
  reference_audio = "reference_audios/2.wav"
47
  self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
48
  self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
49
+ print(self.phones)
50
+ for index, phone in enumerate(self.phones):
51
+ print(index, phone)
52
 
53
  #######
54
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
 
92
  duration = self.duration.clone()
93
  # lengthening
94
  lenghtening_candidates = [
95
+ # ('f', 33),
96
+ # ('l', 34),
97
+ ('ʏ', 35),
98
+ ('ç', 36),
99
+ # ('t', 37),
100
+ ('ɪ', 38),
101
+ # ('ɡ', 39),
102
+ ('ə', 40),
103
+ ('n', 41),
104
+
105
+ # ('z', 79),
106
+ ('ɑ', 80),
107
+ # ('ə', 81),
108
+ ('n', 82),
109
+
110
+ # ('b', 103),
111
+ ('e', 104),
112
+ # ('p', 105),
113
+ # ('t', 106),
114
+ ('ə', 107)
115
  ]
116
 
117
  for lenghtening_candidate in lenghtening_candidates:
118
  duration[lenghtening_candidate[1]] = duration[lenghtening_candidate[1]] + lengthening
119
 
120
  # pauses
121
+ pause_candidates = [('~', 42),
122
+ ('~', 83),
123
+ ('~', 108)]
124
 
125
  for pause_candidate in pause_candidates:
126
  duration[pause_candidate[1]] = duration[pause_candidate[1]] + pause_dur
 
129
  # pitch raise
130
 
131
  pitch_candidates = [
132
+ # ('k', 44),
133
+ ('y', 45),
134
+ ('l', 46),
135
+ ('ə', 47),
136
+ ('ʃ', 49),
137
+ ('a', 50),
138
+ ('t', 51),
139
+ # ('ə', 52),
140
+ # ('n', 53),
141
+
142
+ ('a', 85),
143
+ ('l', 86),
144
+
145
+ ('v', 118),
146
+ ('ɛ', 119),
147
+ ('l', 120),
148
+ # ('ə', 121),
149
+ # ('n', 122)
150
  ]
151
 
152
  for pitch_candidate in pitch_candidates:
153
  pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] + pitch_up
154
 
155
+ fixme = [('f', 33),
156
+ ('l', 34),
157
+ ('ʏ', 35),
158
+ ('ç', 36),
159
+ ('t', 37),
160
+ ('ɪ', 38),
161
+ ('ɡ', 39),
162
+ ('ə', 40),
163
+ ('n', 41)
164
  ]
165
  for pitch_candidate in fixme:
166
  pitch[pitch_candidate[1]] = pitch[pitch_candidate[1]] - abs(pitch_up)