fhieni committed on
Commit
a448668
1 Parent(s): 3d6e41f

Upload app.py

Files changed (1)
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
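+ # Gradio demo for Vietnamese voice cloning: a VITS synthesizer (VIVOS config)
+ # generates speech in the voice of a reference recording, conditioned on a
+ # Resemblyzer speaker embedding.
+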
+ import gradio as gr
+ import torch
+ import numpy as np
+ import sys
+ from vinorm import TTSnorm
+ from utils_audio import convert_to_wav
+
+ # make the vendored vits/ checkout importable
+ sys.path.append("vits")
+ import commons
+ import utils
+ from models import SynthesizerTrn
+ from text.symbols import symbols
+ from text import text_to_sequence
+ from scipy.io.wavfile import write
+ import logging
+
+ # keep numba (pulled in via resemblyzer/librosa) from flooding the log
+ numba_logger = logging.getLogger("numba")
+ numba_logger.setLevel(logging.WARNING)
+
+ from resemblyzer import preprocess_wav, VoiceEncoder
+
+ # inference runs on CPU
+ device = "cpu"
+
+
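+ # Convert normalised text into a list of phoneme-ID tensors: split on commas,
+ # then into chunks of at most 30 words, so long inputs are synthesised piecewise.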
+ def get_text(texts, hps):
+     text_norm_list = []
+     for text in texts.split(","):
+         # break each comma-separated phrase into chunks of at most 30 words
+         words = text.split()
+         chunk_len = 30
+         chunk_strings = [
+             " ".join(words[i : i + chunk_len])
+             for i in range(0, len(words), chunk_len)
+         ]
+         for chunk_string in chunk_strings:
+             text_norm = text_to_sequence(chunk_string, hps.data.text_cleaners)
+             if hps.data.add_blank:
+                 # interleave blank tokens between symbols, as in VITS training
+                 text_norm = commons.intersperse(text_norm, 0)
+             text_norm_list.append(torch.LongTensor(text_norm))
+     return text_norm_list
+
+
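+ # Embed the reference speaker with Resemblyzer (a d-vector); the input is
+ # converted to WAV first so preprocess_wav can read it.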
+ def get_speaker_embedding(path):
+     encoder = VoiceEncoder(device="cpu")
+     path = convert_to_wav(path)
+     wav = preprocess_wav(path)
+     embed = encoder.embed_utterance(wav)
+     return embed
+
+
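+ # Wraps the VITS generator: loads hyperparameters and a checkpoint once, then
+ # synthesises each text chunk conditioned on the reference speaker embedding.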
+ class VoiceClone:
+     def __init__(self, checkpoint_path):
+         hps = utils.get_hparams_from_file("./vits/configs/vivos.json")
+         self.net_g = SynthesizerTrn(
+             len(symbols),
+             hps.data.filter_length // 2 + 1,
+             hps.train.segment_size // hps.data.hop_length,
+             n_speakers=hps.data.n_speakers,
+             **hps.model
+         ).to(device)
+         self.net_g.eval()
+
+         utils.load_checkpoint(checkpoint_path, self.net_g, None)
+
+         self.hps = hps
+
+     def infer(self, text, ref_audio):
+         text_norm = TTSnorm(text)
+         stn_tst_list = get_text(text_norm, self.hps)
+
+         # the reference speaker is fixed for the whole request, so embed it
+         # once instead of once per text chunk
+         speaker_embedding = get_speaker_embedding(ref_audio)
+         speaker_embedding = (
+             torch.from_numpy(speaker_embedding).float().unsqueeze(0).to(device)
+         )
+
+         with torch.no_grad():
+             audios = []
+             for stn_tst in stn_tst_list:
+                 x_tst = stn_tst.to(device).unsqueeze(0)
+                 x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
+
+                 audio = self.net_g.infer(
+                     x_tst,
+                     x_tst_lengths,
+                     speaker_embedding=speaker_embedding,
+                     noise_scale=0.667,
+                     noise_scale_w=0.8,
+                     length_scale=1,
+                 )
+
+                 audio = audio[0][0, 0].data.cpu().float().numpy()
+                 audios.append(audio)
+                 print(audio.shape)
+
+             # stitch the per-chunk waveforms together and write the result next
+             # to the reference file; 22050 should match hps.data.sampling_rate
+             audios = np.concatenate(audios, axis=0)
+             out_path = ref_audio.replace(".wav", "_clone.wav")
+             write(out_path, 22050, audios)
+             return out_path, text_norm
+
+
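+ # Build the model once at start-up so every Gradio request reuses it.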
+ # voice_cloner = VoiceClone("vits/logs/vivos/G_7700000.pth")
+ voice_cloner = VoiceClone("vits/logs/vivos/G_150000.pth")
+
+
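+ # Gradio callback: pick the reference audio (microphone first, then upload,
+ # then a bundled sample) and run inference.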
+ def clonevoice(text: str, speaker_wav, file_upload, language: str):
+     # `language` comes from the UI but is unused: the model is Vietnamese-only.
+     if speaker_wav is not None:
+         speaker_source = speaker_wav
+     elif file_upload is not None:
+         speaker_source = file_upload
+     else:
+         speaker_source = "vits/audio/sontung.wav"
+
+     print(speaker_source)
+
+     outfile, text_norm = voice_cloner.infer(text, speaker_source)
+
+     return [outfile, text_norm]
+
+
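+ # UI: input text, two optional reference-audio sources, and a language
+ # selector (Vietnamese only).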
+ inputs = [
+     gr.Textbox(
+         label="Input",
+         value="muốn ngồi ở một vị trí không ai ngồi được thì phải chịu cảm giác không ai chịu được",
+         max_lines=3,
+     ),
+     gr.Audio(label="Speaker Wav (microphone)", source="microphone", type="filepath"),
+     gr.Audio(label="Speaker Wav (upload)", source="upload", type="filepath"),
+     # the default value must be one of the listed choices
+     gr.Radio(label="Language", choices=["Vietnamese"], value="Vietnamese"),
+ ]
+ outputs = [gr.Audio(label="Output"), gr.TextArea(label="Normalized text")]
+
+ demo = gr.Interface(fn=clonevoice, inputs=inputs, outputs=outputs)
+
+ demo.launch(debug=True)