truong-xuan-linh committed
Commit 5c60553
1 Parent(s): 2b52fe2
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ *test*
+ temp
README.md CHANGED
@@ -1 +1,12 @@
- # VTTS-speechT5
+ ---
+ title: VTTS speecht5
+ emoji: 🗣️
+ colorFrom: green
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.29.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # multilingual_speecht5
app.py ADDED
@@ -0,0 +1,47 @@
+ import streamlit as st
+ st.set_page_config(page_title="SpeechT5", page_icon="static/images/PLAYGROUND_LOGO_REDESIGN_IMAGE.png")
+ 
+ # Hide the default Streamlit footer
+ hide_menu_style = """
+     <style>
+     footer {visibility: hidden;}
+     </style>
+ """
+ st.markdown(hide_menu_style, unsafe_allow_html=True)
+ 
+ import glob
+ from src.model import Model, dataset_dict
+ 
+ if "model_name" not in st.session_state:
+     st.session_state.model_name = None
+     st.session_state.audio = None
+     st.session_state.wav_file = None
+ 
+ with st.sidebar.form("my_form"):
+ 
+     text = st.text_input("Your input: ")
+     model_name = st.selectbox(label="Model: ", options=["truong-xuan-linh/speecht5-vietnamese-commonvoice",
+                                                         "truong-xuan-linh/speecht5-vietnamese-voiceclone-lsvsc",
+                                                         "truong-xuan-linh/speecht5-vietnamese-hlpcvoice",
+                                                         "truong-xuan-linh/speecht5-vietnamese-vstnvoice",
+                                                         "truong-xuan-linh/speecht5-vietnamese-kcbnvoice",
+                                                         "truong-xuan-linh/speecht5-irmvivoice",
+                                                         "truong-xuan-linh/speecht5-vietnamese-voiceclone",
+                                                         "truong-xuan-linh/speecht5-multilingual-voiceclone-speechbrain",
+                                                         "truong-xuan-linh/speecht5-vietnamese-voiceclone-v3",
+                                                         "truong-xuan-linh/speecht5-multilingual-voiceclone-pynote",
+                                                         "truong-xuan-linh/speecht5-multilingual-voiceclone-speechbrain-nonverbal"])
+ 
+     speaker_id = st.selectbox("source voice", options=list(dataset_dict.keys()))
+     speaker_url = st.text_input("speaker url", value="")
+     # speaker_id = st.selectbox("source voice", options=glob.glob("voices/*.wav"))
+ 
+     # Reload the checkpoint only when the model selection changes
+     if st.session_state.model_name != model_name:
+         st.session_state.model_name = model_name
+         st.session_state.model = Model(model_name=model_name)
+         st.session_state.speaker_id = speaker_id
+ 
+     # Every form must have a submit button.
+     submitted = st.form_submit_button("Submit")
+     if submitted:
+         st.session_state.audio = st.session_state.model.inference(text=text, speaker_id=speaker_id, speaker_url=speaker_url)
+ 
+ audio_holder = st.empty()
+ if st.session_state.audio is not None:  # nothing to play before the first submission
+     audio_holder.audio(st.session_state.audio, sample_rate=16000)
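The form above caches the loaded checkpoint by hand: the last `model_name` is kept in `st.session_state` and `Model` is rebuilt only when the selection changes. For comparison only (this is not what the commit does), a roughly equivalent pattern using Streamlit's built-in `st.cache_resource`, which is available in the pinned `streamlit==1.29.0`, might look like this:

```python
import streamlit as st
from src.model import Model

@st.cache_resource  # keeps one loaded Model per distinct model_name
def get_model(model_name: str) -> Model:
    return Model(model_name=model_name)

# Inside the form, instead of checking st.session_state.model_name manually:
# model = get_model(model_name)
```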
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch==2.1.2
+ numpy==1.23.5
+ transformers==4.38.2
+ uroman-python==1.2.8.1
+ datasets==2.16.1
+ deepfilternet==0.5.6
+ torchaudio==2.1.2
+ librosa==0.10.0
+ streamlit==1.29.0
+ pydub==0.25.1
+ speechbrain==0.5.16
src/model.py ADDED
@@ -0,0 +1,127 @@
+ import re
+ import torch
+ import requests
+ import torchaudio
+ import numpy as np
+ from src.reduce_noise import smooth_and_reduce_noise, model_remove_noise, model, df_state
+ import io
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from pydub import AudioSegment
+ from uroman import uroman
+ # from src.pynote_speaker_embedding import create_speaker_embedding
+ from src.speechbrain_speaker_embedding import create_speaker_embedding
+ 
+ from datasets import load_dataset
+ 
+ # Speaker x-vectors used for voice cloning, keyed by speaker_id
+ dataset = load_dataset("truong-xuan-linh/vi-xvector-speechbrain",
+                        download_mode="force_redownload",
+                        verification_mode="no_checks",
+                        cache_dir="temp/",
+                        revision="5ea5e4345258333cbc6d1dd2544f6c658e66a634")
+ dataset = dataset["train"].to_list()
+ 
+ dataset_dict = {}
+ for rc in dataset:
+     dataset_dict[rc["speaker_id"]] = rc["embedding"]
+ 
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ 
+ def remove_special_characters(sentence):
+     # Use a regular expression to keep only letters (including Vietnamese characters), periods, and commas
+     sentence_after_removal = re.sub(r'[^a-zA-Z\s,.\u00C0-\u1EF9]', ' ', sentence)
+     return sentence_after_removal
+ 
+ from scipy.signal import butter, lfilter
+ 
+ def butter_bandpass(lowcut, highcut, fs, order=5):
+     nyq = 0.5 * fs
+     low = lowcut / nyq
+     high = highcut / nyq
+     b, a = butter(order, [low, high], btype='band')
+     return b, a
+ 
+ def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
+     b, a = butter_bandpass(lowcut, highcut, fs, order=order)
+     y = lfilter(b, a, data)
+     return y
+ 
+ def korean_splitter(string):
+     pattern = re.compile('[가-힣]+')
+     matches = pattern.findall(string)
+     return matches
+ 
+ def uroman_normalization(string):
+     # Romanize any Korean substrings so the tokenizer can handle them
+     korean_inputs = korean_splitter(string)
+     for korean_input in korean_inputs:
+         korean_roman = uroman(korean_input)
+         string = string.replace(korean_input, korean_roman)
+     return string
+ 
+ class Model():
+ 
+     def __init__(self, model_name):
+         self.model_name = model_name
+         self.processor = SpeechT5Processor.from_pretrained(model_name)
+         self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
+         # self.model.generate = partial(self.model.generate, use_cache=True)
+ 
+         self.model.eval()
+         if model_name == "truong-xuan-linh/speecht5-vietnamese-commonvoice" or model_name == "truong-xuan-linh/speecht5-irmvivoice":
+             self.speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file
+         else:
+             self.speaker_embeddings = torch.ones((1, 512))  # or load xvectors from a file
+ 
+     def inference(self, text, speaker_id=None, speaker_url=""):
+         # if self.model_name == "truong-xuan-linh/speecht5-vietnamese-voiceclone-v2":
+         #     # self.speaker_embeddings = torch.tensor(dataset_dict_v2[speaker_id])
+         #     wavform, _ = torchaudio.load(speaker_id)
+         #     self.speaker_embeddings = create_speaker_embedding(wavform)[0]
+ 
+         if "voiceclone" in self.model_name:
+             if not speaker_url:
+                 # Use a pre-computed x-vector from the dataset
+                 self.speaker_embeddings = torch.tensor(dataset_dict[speaker_id])
+             else:
+                 # Download the reference audio and compute an x-vector on the fly
+                 response = requests.get(speaker_url)
+                 audio_stream = io.BytesIO(response.content)
+                 audio_segment = AudioSegment.from_file(audio_stream, format="wav")
+                 audio_segment = audio_segment.set_channels(1)
+                 audio_segment = audio_segment.set_frame_rate(16000)
+                 audio_segment = audio_segment.set_sample_width(2)
+                 wavform, _ = torchaudio.load(audio_segment.export())
+                 self.speaker_embeddings = create_speaker_embedding(wavform)[0]
+             # self.speaker_embeddings = create_speaker_embedding(speaker_id)[0]
+             # wavform, _ = torchaudio.load("voices/kcbn1.wav")
+             # self.speaker_embeddings = create_speaker_embedding(wavform)[0]
+             # wavform, _ = torchaudio.load(wav_file)
+             # self.speaker_embeddings = create_speaker_embedding(wavform)[0]
+ 
+         with torch.no_grad():
+             full_speech = []
+             separators = r";|\.|!|\?|\n"
+             text = uroman_normalization(text)
+             text = text.replace(" ", "▁")
+             split_texts = re.split(separators, text)
+ 
+             for split_text in split_texts:
+                 if split_text != "▁":
+                     # split_text = remove_special_characters(" ," + split_text) + " ,"
+                     split_text = split_text.lower() + "▁"
+                     print(split_text)
+                     inputs = self.processor.tokenizer(text=split_text, return_tensors="pt")
+                     speech = self.model.generate_speech(inputs["input_ids"], threshold=0.5, speaker_embeddings=self.speaker_embeddings, vocoder=vocoder)
+                     full_speech.append(speech.numpy())
+                     # full_speech.append(butter_bandpass_filter(speech.numpy(), lowcut=10, highcut=5000, fs=16000, order=2))
+ 
+             # Concatenate the per-sentence chunks and run DeepFilterNet noise removal
+             out_audio = model_remove_noise(model, df_state, np.concatenate(full_speech))
+             return out_audio
+ 
+     @staticmethod
+     def moving_average(data, window_size):
+         return np.convolve(data, np.ones(window_size)/window_size, mode='same')
+ 
+ # woman: VIVOSSPK26, VIVOSSPK02, VIVOSSPK40
+ # man: VIVOSSPK28, VIVOSSPK36, VIVOSDEV09, VIVOSSPK33, VIVOSSPK23
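`src/model.py` does the heavy lifting: importing it downloads the speaker x-vector dataset and the HiFi-GAN vocoder, and `Model.inference` splits the input on sentence separators, synthesizes each chunk with `generate_speech`, concatenates the chunks, and runs the result through the DeepFilterNet cleanup in `src/reduce_noise.py`. A minimal standalone sketch, assuming the repository root is on `PYTHONPATH` and `scipy` is available (it is pulled in transitively by `librosa`); the model name is taken from the options in `app.py` and the output path is illustrative:

```python
from scipy.io import wavfile
from src.model import Model, dataset_dict  # importing this module triggers the dataset download

model = Model(model_name="truong-xuan-linh/speecht5-vietnamese-voiceclone-lsvsc")
speaker_id = next(iter(dataset_dict))  # any speaker_id present in the x-vector dataset

# inference() returns a 16 kHz waveform as a NumPy array
audio = model.inference(text="xin chào các bạn", speaker_id=speaker_id)
wavfile.write("sample.wav", 16000, audio.squeeze())
```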
src/pynote_speaker_embedding.py ADDED
@@ -0,0 +1,12 @@
+ # import torch
+ # from pyannote.audio import Model, Inference
+ # speaker_model = Model.from_pretrained("pyannote/embedding",
+ #                                        use_auth_token="")
+ # inference = Inference(speaker_model, window="whole")
+ 
+ # def create_speaker_embedding(audio_dir):
+ #     with torch.no_grad():
+ #         embedding = inference(audio_dir)
+ #         embedding = torch.tensor([[embedding]])
+ #         speaker_embeddings = torch.nn.functional.normalize(embedding, dim=-1)
+ #     return speaker_embeddings
src/reduce_noise.py ADDED
@@ -0,0 +1,43 @@
+ import scipy.signal
+ import librosa
+ from df.enhance import enhance, init_df, load_audio, save_audio
+ import torch
+ from torchaudio.functional import resample
+ 
+ # Load the default DeepFilterNet model
+ model, df_state, _ = init_df()
+ 
+ def smooth_and_reduce_noise(audio_signal, sampling_rate):
+     # Apply a low-pass filter for smoothing
+     cutoff_frequency = 1700  # Adjust as needed
+     nyquist = 0.5 * sampling_rate
+     normal_cutoff = cutoff_frequency / nyquist
+     b, a = scipy.signal.butter(4, normal_cutoff, btype='low', analog=False)
+     smoothed_signal = scipy.signal.filtfilt(b, a, audio_signal)
+ 
+     # Apply a pre-emphasis filter as a simple additional cleanup step
+     denoised_signal = librosa.effects.preemphasis(smoothed_signal, coef=0.95)
+ 
+     return denoised_signal
+ 
+ def model_remove_noise(model, df_state, np_audio):
+     # Resample the 16 kHz synthesis output to the DeepFilterNet sample rate
+     audio = torch.tensor([np_audio])
+     audio = resample(audio, 16000, df_state.sr())
+ 
+     # Denoise with DeepFilterNet
+     enhanced = enhance(model, df_state, audio).cpu()
+ 
+     # Resample the enhanced signal (not the noisy input) back to 16 kHz
+     out_audio = torch.as_tensor(enhanced)
+     if out_audio.ndim == 1:
+         out_audio.unsqueeze_(0)
+     out_audio = resample(out_audio, df_state.sr(), 16000)
+ 
+     # Convert to 16-bit PCM for playback
+     if out_audio.dtype != torch.int16:
+         out_audio = (out_audio * (1 << 15)).to(torch.int16)
+ 
+     return out_audio.cpu().numpy()
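`model_remove_noise` upsamples the 16 kHz synthesis output to DeepFilterNet's native sample rate (`df_state.sr()`, 48 kHz for the default model), runs `enhance`, and resamples the enhanced signal back to 16 kHz. A small sketch exercising it on a synthetic tone (the input here is purely illustrative):

```python
import numpy as np
from src.reduce_noise import model, df_state, model_remove_noise

# One second of a 440 Hz tone with a little noise, standing in for synthesized speech
t = np.linspace(0, 1.0, 16000, endpoint=False, dtype=np.float32)
noisy = 0.5 * np.sin(2 * np.pi * 440 * t) + 0.01 * np.random.randn(16000).astype(np.float32)

cleaned = model_remove_noise(model, df_state, noisy)  # (1, num_samples) array at 16 kHz
```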
src/speechbrain_speaker_embedding.py ADDED
@@ -0,0 +1,19 @@
+ import torch
+ 
+ import os
+ from speechbrain.pretrained import EncoderClassifier
+ 
+ spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+ 
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ speaker_model = EncoderClassifier.from_hparams(
+     source=spk_model_name,
+     run_opts={"device": device},
+     savedir=os.path.join("/tmp", spk_model_name),
+ )
+ 
+ def create_speaker_embedding(waveform):
+     with torch.no_grad():
+         speaker_embeddings = speaker_model.encode_batch(waveform)
+         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=-1)
+     return speaker_embeddings
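`create_speaker_embedding` wraps the SpeechBrain x-vector encoder; `encode_batch` returns a tensor of shape `(batch, 1, 512)`, which is why `src/model.py` indexes the result with `[0]`. A short sketch computing an embedding from a local WAV file (the path is illustrative):

```python
import torchaudio
from src.speechbrain_speaker_embedding import create_speaker_embedding

waveform, sample_rate = torchaudio.load("voices/example_speaker.wav")  # illustrative path
if sample_rate != 16000:
    # the x-vector model expects 16 kHz audio
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

speaker_embedding = create_speaker_embedding(waveform)[0]  # shape (1, 512), L2-normalized
```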