Flux9665 committed on
Commit
1f52d1a
1 Parent(s): 509ace1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -8
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
 
3
  os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
@@ -6,14 +8,15 @@ os.system("mv toucan_codebase/* .")
6
  from run_model_downloader import download_models
7
 
8
  download_models()
9
-
10
  import gradio as gr
11
  import numpy as np
12
  import torch
 
 
13
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
14
  from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
15
  from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
16
- from InferenceInterfaces.UtteranceCloner import UtteranceCloner
17
 
18
 
19
  def float2pcm(sig, dtype='int16'):
@@ -42,10 +45,10 @@ class TTS_Interface:
42
  self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
43
  self.acoustic_model = self.acoustic_model.to(self.device)
44
  self.dc = DurationCalculator(reduction_factor=1)
45
- self.tf = ArticulatoryCombinedTextFrontend(language="en")
46
  self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
47
  reference_audio = "reference_audios/2.wav"
48
- self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=False)
 
49
  self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
50
 
51
  #######
@@ -56,7 +59,7 @@ class TTS_Interface:
56
  durations=self.duration,
57
  pitch=self.pitch,
58
  energy=self.energy,
59
- input_is_phones=True).cpu().numpy()
60
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
61
  self.current_voice = "female"
62
  self.cloned_speech_female = self.utterance_cloner.tts(self.phones,
@@ -64,7 +67,7 @@ class TTS_Interface:
64
  durations=self.duration,
65
  pitch=self.pitch,
66
  energy=self.energy,
67
- input_is_phones=True).cpu().numpy()
68
 
69
  #######
70
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
@@ -89,7 +92,8 @@ class TTS_Interface:
89
 
90
  duration = self.duration.clone()
91
  # lengthening
92
- lenghtening_candidates = [ # ('f', 27),
 
93
  # ('l', 28),
94
  ('ʏ', 29),
95
  ('ç', 30),
@@ -125,7 +129,8 @@ class TTS_Interface:
125
  pitch = self.pitch.clone()
126
  # pitch raise
127
 
128
- pitch_candidates = [ # ('k', 37),
 
129
  ('y', 38),
130
  ('l', 39),
131
  ('ə', 40),
 
1
+ """
2
+
3
  import os
4
 
5
  os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
 
8
  from run_model_downloader import download_models
9
 
10
  download_models()
11
+ """
12
  import gradio as gr
13
  import numpy as np
14
  import torch
15
+
16
+ from InferenceInterfaces.UtteranceCloner import UtteranceCloner
17
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
18
  from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
19
  from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
 
20
 
21
 
22
  def float2pcm(sig, dtype='int16'):
 
45
  self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
46
  self.acoustic_model = self.acoustic_model.to(self.device)
47
  self.dc = DurationCalculator(reduction_factor=1)
 
48
  self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
49
  reference_audio = "reference_audios/2.wav"
50
+ self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=True)
51
+ self.utterance_cloner.tts.text2phone.use_word_boundaries = False
52
  self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)
53
 
54
  #######
 
59
  durations=self.duration,
60
  pitch=self.pitch,
61
  energy=self.energy,
62
+ phones=True).cpu().numpy()
63
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_female.wav")
64
  self.current_voice = "female"
65
  self.cloned_speech_female = self.utterance_cloner.tts(self.phones,
 
67
  durations=self.duration,
68
  pitch=self.pitch,
69
  energy=self.energy,
70
+ phones=True).cpu().numpy()
71
 
72
  #######
73
  self.utterance_cloner.tts.set_utterance_embedding("reference_audios/german_male.wav")
 
92
 
93
  duration = self.duration.clone()
94
  # lengthening
95
+ lenghtening_candidates = [
96
+ # ('f', 27),
97
  # ('l', 28),
98
  ('ʏ', 29),
99
  ('ç', 30),
 
129
  pitch = self.pitch.clone()
130
  # pitch raise
131
 
132
+ pitch_candidates = [
133
+ # ('k', 37),
134
  ('y', 38),
135
  ('l', 39),
136
  ('ə', 40),