mzltest committed on
Commit
0072f65
1 Parent(s): 86f6316
.ipynb_checkpoints/README-checkpoint.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: green
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.24.1
 
8
  app_file: app.py
9
  pinned: false
10
  ---
 
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.24.1
8
+ python_version: 3.9.16
9
  app_file: app.py
10
  pinned: false
11
  ---
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from torch import no_grad, LongTensor
5
+ import argparse
6
+ import commons
7
+ from mel_processing import spectrogram_torch
8
+ import utils
9
+ from models import SynthesizerTrn
10
+ import gradio as gr
11
+ import librosa
12
+ import webbrowser
13
+
14
+ from text import text_to_sequence, _clean_text
15
# Inference device: the first CUDA device when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Marker string wrapped around the input text for each language choice.
# An empty marker means the text is passed through without any tag.
language_marks = {"Japanese": "", "日本語": "[JA]", "简体中文": "[ZH]", "English": "[EN]", "Mix": ""}

# Language choices offered in the UI dropdown (every key above except "Japanese").
lang = [choice for choice in language_marks if choice != "Japanese"]
24
def get_text(text, hps, is_symbol):
    """Turn ``text`` into a ``LongTensor`` of symbol ids.

    ``text_to_sequence`` is called with ``hps.symbols`` and either an empty
    cleaner list (when ``is_symbol`` is true) or ``hps.data.text_cleaners``.
    When ``hps.data.add_blank`` is set, the sequence is additionally passed
    through ``commons.intersperse(sequence, 0)``.
    """
    sequence = text_to_sequence(
        text,
        hps.symbols,
        hps.data.text_cleaners if not is_symbol else [],
    )
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence)
30
+
31
def create_tts_fn(model, hps, speaker_ids):
    """Return a text-to-speech callback bound to ``model``, ``hps`` and ``speaker_ids``.

    The returned ``tts_fn(text, speaker, language, speed)`` yields
    ``("Success", (hps.data.sampling_rate, audio))`` where ``audio`` is the
    NumPy array taken from ``model.infer``.
    """

    def tts_fn(text, speaker, language, speed):
        # Wrap the text in the marker of the chosen language (may be empty).
        if language is not None:
            mark = language_marks[language]
            text = mark + text + mark
        chosen_id = speaker_ids[speaker]
        token_ids = get_text(text, hps, False)
        with no_grad():
            batch = token_ids.unsqueeze(0).to(device)
            batch_lengths = LongTensor([token_ids.size(0)]).to(device)
            speaker_tensor = LongTensor([chosen_id]).to(device)
            # ``speed`` is applied as the reciprocal length scale.
            inferred = model.infer(
                batch,
                batch_lengths,
                sid=speaker_tensor,
                noise_scale=.667,
                noise_scale_w=0.8,
                length_scale=1.0 / speed,
            )
            audio = inferred[0][0, 0].data.cpu().float().numpy()
        # Drop the tensor references before returning.
        del token_ids, batch, batch_lengths, speaker_tensor
        sample_rate = hps.data.sampling_rate
        return "Success", (sample_rate, audio)

    return tts_fn
47
+
48
def create_vc_fn(model, hps, speaker_ids):
    """Return a voice-conversion callback bound to ``model``, ``hps`` and ``speaker_ids``.

    The returned ``vc_fn(original_speaker, target_speaker, record_audio,
    upload_audio)`` takes each audio input as a ``(sampling_rate, ndarray)``
    pair or ``None``. The recorded audio wins when both are supplied. It
    returns ``(message, (hps.data.sampling_rate, audio))`` on success, or
    ``(message, None)`` when there is no usable input.

    Raises:
        KeyError: if a speaker name is not a key of ``speaker_ids``.
    """

    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
        input_audio = record_audio if record_audio is not None else upload_audio
        if input_audio is None:
            return "You need to record or upload an audio", None
        sampling_rate, audio = input_audio
        # A zero-length clip cannot be normalized or converted; treat it like
        # a missing input instead of failing later on an empty min()/max().
        if audio.size == 0:
            return "You need to record or upload an audio", None
        original_speaker_id = speaker_ids[original_speaker]
        target_speaker_id = speaker_ids[target_speaker]

        if np.issubdtype(audio.dtype, np.integer):
            # Integer PCM: scale by the dtype's full-scale value.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            # np.iinfo raises ValueError for floating dtypes; float samples
            # are kept as-is and only converted to float32.
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with no_grad():
            y = torch.FloatTensor(audio)
            peak = max(-y.min(), y.max())
            # Skip peak normalization for an all-zero (silent) clip: dividing
            # by a zero peak would fill the signal with NaNs.
            if peak > 0:
                # NOTE(review): dividing by 0.99 leaves the peak slightly
                # above 1.0; kept as in the original — confirm whether
                # ``* 0.99`` was intended.
                y = y / peak / 0.99
            y = y.to(device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(device)
            spec_lengths = LongTensor([spec.size(-1)]).to(device)
            sid_src = LongTensor([original_speaker_id]).to(device)
            sid_tgt = LongTensor([target_speaker_id]).to(device)
            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        # Drop the tensor references before returning.
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn
79
# Script entry point: parse CLI options, load the fine-tuned synthesizer and
# serve a two-tab Gradio UI (text-to-speech and voice conversion).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
    parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
    # Parse the value as a real boolean: without a ``type`` any supplied
    # string (including "False") is truthy. ``nargs="?"`` keeps both
    # ``--share True`` and a bare ``--share`` working.
    parser.add_argument("--share", nargs="?", const=True, default=False,
                        type=lambda value: value.strip().lower() in ("1", "true", "yes", "y", "on"),
                        help="make link public (used in colab)")

    args = parser.parse_args()
    hps = utils.get_hparams_from_file(args.config_dir)

    # Build the synthesizer from the loaded hyper-parameters and switch it to eval mode.
    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    # Load the checkpoint weights into the model (no optimizer is passed).
    _ = utils.load_checkpoint(args.model_dir, net_g, None)
    speaker_ids = hps.speakers
    speakers = list(hps.speakers.keys())
    tts_fn = create_tts_fn(net_g, hps, speaker_ids)
    vc_fn = create_vc_fn(net_g, hps, speaker_ids)
    app = gr.Blocks()
    with app:
        with gr.Tab("Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    textbox = gr.TextArea(label="Text",
                                          placeholder="Type your sentence here",
                                          value="こんにちわ。", elem_id="tts-input")
                    # select character
                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
                                                label='速度 Speed')
                with gr.Column():
                    text_output = gr.Textbox(label="Message")
                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                    btn = gr.Button("Generate!")
                    btn.click(tts_fn,
                              inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
                              outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
                            录制或上传声音,并选择要转换的音色。
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
                converted_audio = gr.Audio(label='converted audio')
            btn = gr.Button("Convert!")
            btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                      outputs=[message_box, converted_audio])
    # NOTE(review): the URL is hard-coded and assumes the app is served
    # locally on port 7860 — confirm against the launch configuration.
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
139
+
README.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: green
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.24.1
 
8
  app_file: app.py
9
  pinned: false
10
  ---
 
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.24.1
8
+ python_version: 3.9.16
9
  app_file: app.py
10
  pinned: false
11
  ---
monotonic_align/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/monotonic_align/__pycache__/__init__.cpython-39.pyc and b/monotonic_align/__pycache__/__init__.cpython-39.pyc differ