Flux9665 committed on
Commit
0d40f57
1 Parent(s): 52c413f

Switch to an oracle-style demo rather than re-running everything every time, because Hugging Face gives a lot less compute than it used to.

Browse files
Files changed (1) hide show
  1. app.py +68 -20
app.py CHANGED
@@ -96,29 +96,77 @@ class TTS_Interface:
96
  self.split_audio(reference_audio, text_list)
97
  # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def read(self, _, speaker_1, speaker_2, speaker_3):
100
  reference_audio = "reference_audios/clone_me_5.wav"
101
- prompt = "Betty Botter bought some butter, but she said the butters bitter. If I put it in my batter, it will make my batter bitter. But a bit of better butter will make my batter better."
102
- text_list = prompt.replace(".", ".|").replace("?", "?|").replace("!", "!|").split("|")
103
- # we don't split on the punctuation marks because we want to retain them.
104
 
105
- self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_1])
106
- part_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
107
- reference_transcription=text_list[0],
108
- clone_speaker_identity=False,
109
- lang="en")
110
-
111
- self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_2])
112
- part_2 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_2.wav",
113
- reference_transcription=text_list[1],
114
- clone_speaker_identity=False,
115
- lang="en")
116
-
117
- self.utterance_cloner.tts.set_utterance_embedding(self.speaker_path_lookup[speaker_3])
118
- part_3 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_3.wav",
119
- reference_transcription=text_list[2],
120
- clone_speaker_identity=False,
121
- lang="en")
 
 
 
122
 
123
  return "alignment.png", \
124
  reference_audio, \
96
  self.split_audio(reference_audio, text_list)
97
  # at this point, split_1.wav, split_2.wav and split_3.wav should exist.
98
 
99
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_1.flac")
100
+ self.part_1_voice_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
101
+ reference_transcription=text_list[0],
102
+ clone_speaker_identity=False,
103
+ lang="en")
104
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_2.wav")
105
+ self.part_1_voice_2 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
106
+ reference_transcription=text_list[0],
107
+ clone_speaker_identity=False,
108
+ lang="en")
109
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_3.wav")
110
+ self.part_1_voice_3 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_1.wav",
111
+ reference_transcription=text_list[0],
112
+ clone_speaker_identity=False,
113
+ lang="en")
114
+
115
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_1.flac")
116
+ self.part_2_voice_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_2.wav",
117
+ reference_transcription=text_list[1],
118
+ clone_speaker_identity=False,
119
+ lang="en")
120
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_2.wav")
121
+ self.part_2_voice_2 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_2.wav",
122
+ reference_transcription=text_list[1],
123
+ clone_speaker_identity=False,
124
+ lang="en")
125
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_3.wav")
126
+ self.part_2_voice_3 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_2.wav",
127
+ reference_transcription=text_list[1],
128
+ clone_speaker_identity=False,
129
+ lang="en")
130
+
131
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_1.flac")
132
+ self.part_3_voice_1 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_3.wav",
133
+ reference_transcription=text_list[2],
134
+ clone_speaker_identity=False,
135
+ lang="en")
136
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_2.wav")
137
+ self.part_3_voice_2 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_3.wav",
138
+ reference_transcription=text_list[2],
139
+ clone_speaker_identity=False,
140
+ lang="en")
141
+ self.utterance_cloner.tts.set_utterance_embedding("reference_audios/voice_3.wav")
142
+ self.part_3_voice_3 = self.utterance_cloner.clone_utterance(path_to_reference_audio="split_3.wav",
143
+ reference_transcription=text_list[2],
144
+ clone_speaker_identity=False,
145
+ lang="en")
146
+
147
  def read(self, _, speaker_1, speaker_2, speaker_3):
148
  reference_audio = "reference_audios/clone_me_5.wav"
 
 
 
149
 
150
+ if speaker_1 == "Voice 1":
151
+ part_1 = self.part_1_voice_1
152
+ elif speaker_1 == "Voice 2":
153
+ part_1 = self.part_1_voice_2
154
+ elif speaker_1 == "Voice 3":
155
+ part_1 = self.part_1_voice_3
156
+
157
+ if speaker_2 == "Voice 1":
158
+ part_2 = self.part_2_voice_1
159
+ elif speaker_2 == "Voice 2":
160
+ part_2 = self.part_2_voice_2
161
+ elif speaker_2 == "Voice 3":
162
+ part_2 = self.part_2_voice_3
163
+
164
+ if speaker_3 == "Voice 1":
165
+ part_3 = self.part_3_voice_1
166
+ elif speaker_3 == "Voice 2":
167
+ part_3 = self.part_3_voice_2
168
+ elif speaker_3 == "Voice 3":
169
+ part_3 = self.part_3_voice_3
170
 
171
  return "alignment.png", \
172
  reference_audio, \