skytnt committed on
Commit
8935672
•
1 Parent(s): 7f43d7a
app.py CHANGED
@@ -14,7 +14,7 @@ from text import text_to_sequence
 from mel_processing import spectrogram_torch
 
 
-def get_text(text):
+def get_text(text, hps):
     text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
     if hps.data.add_blank:
         text_norm = commons.intersperse(text_norm, 0)
@@ -22,10 +22,12 @@ def get_text(text):
     return text_norm
 
 
-def tts_fn(text, speaker_id):
+def tts_fn(text, speaker):
     if len(text) > 150:
         return "Error: Text is too long", None
-    stn_tst = get_text(text)
+    model, hps = models[model_idx[speaker]]
+    speaker_id = speaker_idx[speaker]
+    stn_tst = get_text(text, hps)
     with no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = LongTensor([stn_tst.size(0)])
@@ -35,13 +37,20 @@ def tts_fn(text, speaker_id):
     return "Success", (hps.data.sampling_rate, audio)
 
 
-def vc_fn(original_speaker_id, target_speaker_id, input_audio):
+def vc_fn(original_speaker, target_speaker, input_audio):
     if input_audio is None:
         return "You need to upload an audio", None
     sampling_rate, audio = input_audio
     duration = audio.shape[0] / sampling_rate
     if duration > 30:
         return "Error: Audio is too long", None
+    if model_idx[original_speaker] != model_idx[target_speaker]:
+        return "Error: Can not convert voice between different model", None
+
+    model, hps = models[model_idx[original_speaker]]
+    original_speaker_id = speaker_idx[original_speaker]
+    target_speaker_id = speaker_idx[target_speaker]
+
     audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
@@ -62,17 +71,26 @@ def vc_fn(original_speaker_id, target_speaker_id, input_audio):
 
 
 if __name__ == '__main__':
-    config_path = "saved_model/config.json"
-    model_path = "saved_model/model.pth"
-    hps = utils.get_hparams_from_file(config_path)
-    model = SynthesizerTrn(
-        len(hps.symbols),
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        n_speakers=hps.data.n_speakers,
-        **hps.model)
-    utils.load_checkpoint(model_path, model, None)
-    model.eval()
+    models = []
+    model_idx = []
+    speaker_idx = []
+    speakers = []
+    for i in range(0, 2):
+        config_path = f"saved_model/{i}/config.json"
+        model_path = f"saved_model/{i}/model.pth"
+        hps = utils.get_hparams_from_file(config_path)
+        model = SynthesizerTrn(
+            len(hps.symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model)
+        utils.load_checkpoint(model_path, model, None)
+        model.eval()
+        models.append((model, hps))
+        speakers = speakers + [f"model{i}/{x}" for x in hps.speakers]
+        model_idx = model_idx + [i] * len(hps.speakers)
+        speaker_idx = speaker_idx + list(range(0, len(hps.speakers)))
 
     app = gr.Blocks()
 
@@ -85,16 +103,16 @@ if __name__ == '__main__':
         with gr.TabItem("TTS"):
             with gr.Column():
                 tts_input1 = gr.TextArea(label="Text (150 words limitation)", value="こんにちは。")
-                tts_input2 = gr.Dropdown(label="Speaker", choices=hps.speakers, type="index", value=hps.speakers[0])
+                tts_input2 = gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[0])
                 tts_submit = gr.Button("Generate", variant="primary")
                 tts_output1 = gr.Textbox(label="Output Message")
                 tts_output2 = gr.Audio(label="Output Audio")
         with gr.TabItem("Voice Conversion"):
             with gr.Column():
-                vc_input1 = gr.Dropdown(label="Original Speaker", choices=hps.speakers, type="index",
-                                        value=hps.speakers[0])
-                vc_input2 = gr.Dropdown(label="Target Speaker", choices=hps.speakers, type="index",
-                                        value=hps.speakers[1])
+                vc_input1 = gr.Dropdown(label="Original Speaker", choices=speakers, type="index",
+                                        value=speakers[0])
+                vc_input2 = gr.Dropdown(label="Target Speaker", choices=speakers, type="index",
+                                        value=speakers[1])
                 vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
                 vc_submit = gr.Button("Convert", variant="primary")
                 vc_output1 = gr.Textbox(label="Output Message")
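
This change generalizes the demo from a single VITS checkpoint to several: weights move to saved_model/{i}/, and the UI offers one flattened speaker list spanning all models. Since the Gradio dropdowns use type="index", tts_fn and vc_fn receive a position in that flattened list and look up the owning model and the model-local speaker id through model_idx and speaker_idx. A minimal, self-contained sketch of that mapping, using hypothetical speaker names rather than the real checkpoints:

# Hypothetical stand-ins for each model's hps.speakers.
speaker_lists = [["A", "B"], ["C"]]

speakers, model_idx, speaker_idx = [], [], []
for i, names in enumerate(speaker_lists):
    speakers += [f"model{i}/{x}" for x in names]  # flattened dropdown choices
    model_idx += [i] * len(names)                 # which model owns each entry
    speaker_idx += list(range(len(names)))        # speaker id within that model

# Dropdown index 2 ("model1/C") resolves to model 1, local speaker id 0.
assert (model_idx[2], speaker_idx[2]) == (1, 0)

This mapping is also why vc_fn now rejects conversion across models: a single checkpoint performs the conversion, so both speaker ids must belong to the same model.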
saved_model/{config.json → 0/config.json} RENAMED
File without changes
saved_model/{model.pth → 0/model.pth} RENAMED
File without changes
saved_model/1/config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8022ffb2ae81ff2c84edde380bbdfc60b9ad933f767c5187d4fcfd5c964315b1
+size 1302
saved_model/1/model.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f07377ad8af65adaad59315b40efe67c020f51dc526da66f4e11f812687392e
+size 158884173
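
Both ADDED files are Git LFS pointers rather than the payloads themselves: the repository records only version/oid/size metadata, and git lfs pull fetches the real 1.3 KB config and roughly 150 MB checkpoint. A small sketch of reading this pointer format, with a hypothetical helper that is not part of the commit:

# Hypothetical helper: parse a Git LFS pointer file into a dict.
def read_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Before `git lfs pull` swaps in the real weights:
# read_lfs_pointer("saved_model/1/model.pth")
#   -> {"version": "https://git-lfs.github.com/spec/v1",
#       "oid": "sha256:8f07...", "size": "158884173"}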
text/cleaners.py CHANGED
@@ -1,51 +1,58 @@
 import re
 from unidecode import unidecode
 import pyopenjtalk
+
 pyopenjtalk._lazy_init()
 
 # Regular expression matching Japanese without punctuation marks:
-_japanese_characters = re.compile(r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+_japanese_characters = re.compile(
+    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
 
 # Regular expression matching non-Japanese characters or punctuation marks:
-_japanese_marks = re.compile(r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+_japanese_marks = re.compile(
+    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
 
 
 def japanese_cleaners(text):
-    '''Pipeline for notating accent in Japanese text.'''
-    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
-    sentences = re.split(_japanese_marks, text)
-    marks = re.findall(_japanese_marks, text)
-    text = ''
-    for i, sentence in enumerate(sentences):
-        if re.match(_japanese_characters, sentence):
-            if text!='':
-                text+=' '
-            labels = pyopenjtalk.extract_fullcontext(sentence)
-            for n, label in enumerate(labels):
-                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
-                if phoneme not in ['sil','pau']:
-                    text += phoneme.replace('ch','ʧ').replace('sh','ʃ').replace('cl','Q')
-                else:
-                    continue
-                n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
-                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
-                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
-                a3 = int(re.search(r"\+(\d+)/", label).group(1))
-                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil','pau']:
-                    a2_next=-1
-                else:
-                    a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
-                # Accent phrase boundary
-                if a3 == 1 and a2_next == 1:
-                    text += ' '
-                # Falling
-                elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
-                    text += '↓'
-                # Rising
-                elif a2 == 1 and a2_next == 2:
-                    text += '↑'
-        if i<len(marks):
-            text += unidecode(marks[i]).replace(' ','')
-    if re.match('[A-Za-z]',text[-1]):
-        text += '.'
-    return text
+    '''Pipeline for notating accent in Japanese text.'''
+    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
+    sentences = re.split(_japanese_marks, text)
+    marks = re.findall(_japanese_marks, text)
+    text = ''
+    for i, sentence in enumerate(sentences):
+        if re.match(_japanese_characters, sentence):
+            if text != '':
+                text += ' '
+            labels = pyopenjtalk.extract_fullcontext(sentence)
+            for n, label in enumerate(labels):
+                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
+                if phoneme not in ['sil', 'pau']:
+                    text += phoneme.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')
+                else:
+                    continue
+                n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
+                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
+                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
+                a3 = int(re.search(r"\+(\d+)/", label).group(1))
+                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
+                    a2_next = -1
+                else:
+                    a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
+                # Accent phrase boundary
+                if a3 == 1 and a2_next == 1:
+                    text += ' '
+                # Falling
+                elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
+                    text += '↓'
+                # Rising
+                elif a2 == 1 and a2_next == 2:
+                    text += '↑'
+        if i < len(marks):
+            text += unidecode(marks[i]).replace(' ', '')
+    if re.match('[A-Za-z]', text[-1]):
+        text += '.'
+    return text
+
+
+def japanese_cleaners2(text):
+    return japanese_cleaners(text).replace('ts', 'ʦ')
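
The body of japanese_cleaners is only reformatted here (PEP 8 spacing, split regex literals); its behavior is unchanged. The one functional addition is japanese_cleaners2, which reuses the accent-annotation pipeline and collapses the digraph ts into the single symbol ʦ, mirroring how ch and sh already map to the one-character ʧ and ʃ. A minimal illustration on a hypothetical phoneme string (not real pyopenjtalk output):

# Only the new 'ts' -> 'ʦ' step is shown; the input is a made-up phoneme string.
phonemes = "kʧi tsuzuki"
assert phonemes.replace('ts', 'ʦ') == "kʧi ʦuzuki"

To enable the new cleaner, a model's config would list it in data.text_cleaners, the field that get_text passes through to text_to_sequence.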