Alidr79 committed
Commit 84a8863 · verified · 1 Parent(s): e5273b4

Create app.py

Files changed (1): app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
+ import torch
+ import numpy as np
+ from transformers import SpeechT5HifiGan
+ from datasets import load_dataset
+ import soundfile as sf
+ import librosa
+
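+ # The first 100 test-split clips serve as candidate reference voices for
+ # speaker-embedding extraction.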
+ dataset = load_dataset('SeyedAli/Persian-Speech-Dataset')
+
+ dataset = dataset["test"].select(range(100))
+
+
+ def set_seed(seed):
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+
+ set_seed(1)
+ # Load the SpeechT5 text-to-spectrogram checkpoint and its processor.
+ from transformers import AutoProcessor, AutoModelForTextToSpectrogram
+
+ processor = AutoProcessor.from_pretrained("Alidr79/speecht5_v2_best")
+ model = AutoModelForTextToSpectrogram.from_pretrained("Alidr79/speecht5_v2_best")
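+ # Assumption: Alidr79/speecht5_v2_best is a SpeechT5 checkpoint fine-tuned on
+ # Persian speech, hence the Persian grapheme-to-phoneme step further below.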
+
+
+ from speechbrain.inference.classifiers import EncoderClassifier
+ import os
+
+ spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ speaker_model = EncoderClassifier.from_hparams(
+     source=spk_model_name,
+     run_opts={"device": device},
+     savedir=os.path.join("/tmp", spk_model_name),
+ )
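+ # The x-vector model produces 512-dim speaker embeddings, the dimensionality
+ # SpeechT5's speaker-embedding input expects.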
+
+
+ def create_speaker_embedding(waveform):
+     with torch.no_grad():
+         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
+         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+         speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+     return speaker_embeddings
+
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
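+ # HiFi-GAN vocoder that converts the predicted log-mel spectrogram into a
+ # 16 kHz waveform.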
+
+ from PersianG2p import Persian_g2p_converter
+ from scipy.io import wavfile
+
+
+ PersianG2Pconverter = Persian_g2p_converter(use_large=True)
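+ # Assumption: the checkpoint was fine-tuned on phonemized text, so raw Persian
+ # input must be converted to phonemes before tokenization.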
+
+ import noisereduce as nr
+
+ def denoise_audio(audio, sr):
+     # Perform noise reduction
+     denoised_audio = nr.reduce_noise(y=audio, sr=sr)
+     return denoised_audio
+
+
+ from pydub import AudioSegment
+
+ def match_target_amplitude(sound, target_dBFS):
+     # Gain-adjust the clip to the target loudness in dBFS.
+     change_in_dBFS = target_dBFS - sound.dBFS
+     return sound.apply_gain(change_in_dBFS)
+
+ def tts_fn(slider_value, input_text):
+     # Reference clip for the selected speaker, resampled to the 16 kHz expected
+     # by the x-vector encoder.
+     audio_embedding = dataset[slider_value]['audio']['array']
+     sample_rate_embedding = dataset[slider_value]['audio']['sampling_rate']
+     if sample_rate_embedding != 16_000:
+         audio_embedding = librosa.resample(audio_embedding, orig_sr=sample_rate_embedding, target_sr=16_000)
+
+     with torch.no_grad():
+         speaker_embedding = create_speaker_embedding(audio_embedding)
+         speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
+
+     # Phonemize the input text and join the phonemes with <pad> separators,
+     # apparently matching the format used during fine-tuning.
+     phonemes = PersianG2Pconverter.transliterate(input_text, tidy=False, secret=True)
+     text = "</s>"
+     for i in phonemes.replace(' .', '').split(" "):
+         text += i + " <pad> "
+     text += "</s>"
+     print(text)
+
+     with torch.no_grad():
+         inputs = processor(text=text, return_tensors="pt")
+         spectrogram = model.generate_speech(inputs["input_ids"], speaker_embedding, minlenratio=2, maxlenratio=4, threshold=0.3)
+         speech = vocoder(spectrogram)
+
+     speech = speech.numpy().reshape(-1)
+     speech_denoised = denoise_audio(speech, 16000)
+     sf.write("in_speech.wav", speech_denoised, 16000)
+
+     # Loudness-normalize to -20 dBFS via pydub.
+     sound = AudioSegment.from_wav("in_speech.wav")
+     normalized_sound = match_target_amplitude(sound, -20.0)
+     normalized_sound.export("out_sound.wav", format="wav")
+
+     sample_rate_out, audio_out = wavfile.read("out_sound.wav")
+     assert sample_rate_out == 16_000
+
+     # Gradio's audio output expects a (sample_rate, int16 array) tuple.
+     return 16000, audio_out.reshape(-1).astype(np.int16)
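+ # Hypothetical usage: tts_fn(86, "سلام دنیا") synthesizes reference speaker 86
+ # saying "hello world" and returns a (16000, int16 waveform) tuple.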
+
+
+ import gradio as gr
+
+ # Speaker selector: indices 0-99 map to the 100 reference clips selected above.
+ slider = gr.Slider(
+     minimum=0,
+     maximum=99,
+     value=86,
+     step=1,
+     label="Select a speaker"
+ )
+
+ # Create the text input component
+ text_input = gr.Textbox(
+     label="Enter some text",
+     placeholder="Type something here..."
+ )
+
+
+ demo = gr.Interface(
+     fn=tts_fn,
+     inputs=[slider, text_input],
+     outputs="audio"
+ )
+
+ demo.launch()
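+ # launch() starts the Gradio server; on Hugging Face Spaces no extra arguments
+ # are needed.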