import os

import gradio as gr
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from PersianG2p import Persian_g2p_converter
from pydub import AudioSegment
from scipy.io import wavfile
from speechbrain.inference.classifiers import EncoderClassifier
from transformers import (
    AutoModelForTextToSpectrogram,
    AutoProcessor,
    SpeechT5HifiGan,
)

# Farsi speech dataset (YouTube audio chunked into <=10 s clips). The test
# split is used as a pool of reference voices for speaker embeddings.
dataset = load_dataset("pourmand1376/asr-farsi-youtube-chunked-10-seconds", split="test")
def resample_audio(example):
    # Resample the clip to the 16 kHz rate expected by SpeechT5 and the
    # x-vector speaker encoder.
    example["audio"]["array"] = librosa.resample(
        example["audio"]["array"],
        orig_sr=example["audio"]["sampling_rate"],
        target_sr=16000,
    )
    example["audio"]["sampling_rate"] = 16000
    return example
dataset = dataset.select(range(1000))
dataset = dataset.map(resample_audio)
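# Quick sanity check that the map above actually resampled the clips:
assert dataset[0]["audio"]["sampling_rate"] == 16000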
def set_seed(seed):
    # Seed torch (CPU and CUDA) and numpy so generation is reproducible.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False


set_seed(1)
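# Note: for fully deterministic cuDNN kernels one could also set
# torch.backends.cudnn.deterministic = True (at some speed cost).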
# Load the fine-tuned Farsi SpeechT5 acoustic model and its processor.
processor = AutoProcessor.from_pretrained("Alidr79/speecht5_v3_youtube")
model = AutoModelForTextToSpectrogram.from_pretrained("Alidr79/speecht5_v3_youtube")
# SpeechBrain x-vector encoder used to condition SpeechT5 on a reference voice.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
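# This x-vector model yields 512-dim embeddings, matching the speaker
# embedding size SpeechT5 expects.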
def create_speaker_embedding(waveform):
    # Encode a 16 kHz waveform into an x-vector and L2-normalize it,
    # following the SpeechT5 fine-tuning recipe.
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
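# Usage sketch (hypothetical index):
# emb = create_speaker_embedding(dataset[0]["audio"]["array"])  # numpy array of shape (512,)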
# HiFi-GAN vocoder that turns predicted mel spectrograms into waveforms.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Grapheme-to-phoneme converter for Persian text.
PersianG2Pconverter = Persian_g2p_converter(use_large=True)
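# The converter is called as in tts_fn below, e.g. (hypothetical input):
# PersianG2Pconverter.transliterate("سلام", tidy=False, secret=True)  # -> phoneme string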
def denoise_audio(audio, sr):
    # Spectral-gating noise reduction (noisereduce) on the raw vocoder output.
    return nr.reduce_noise(y=audio, sr=sr)
def match_target_amplitude(sound, target_dBFS):
    # Apply the gain needed to bring the pydub segment to the target loudness.
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)
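# Usage sketch (hypothetical file name):
# match_target_amplitude(AudioSegment.from_wav("clip.wav"), -20.0)  # -> segment at -20 dBFS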
def tts_fn(slider_value, input_text):
    # 1) Build a speaker embedding from the selected reference clip.
    audio_embedding = dataset[slider_value]["audio"]["array"]
    sample_rate_embedding = dataset[slider_value]["audio"]["sampling_rate"]
    if sample_rate_embedding != 16000:
        audio_embedding = librosa.resample(
            audio_embedding, orig_sr=sample_rate_embedding, target_sr=16_000
        )
    with torch.no_grad():
        speaker_embedding = create_speaker_embedding(audio_embedding)
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    # 2) Convert the Persian text to phonemes and synthesize a spectrogram.
    text = PersianG2Pconverter.transliterate(input_text, tidy=False, secret=True)
    print("sentence:", input_text)
    print("sentence phonemes:", text)
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt")
        spectrogram = model.generate_speech(
            inputs["input_ids"],
            speaker_embedding,
            minlenratio=2,
            maxlenratio=4,
            threshold=0.3,
        )
        # 3) Vocode, then denoise and loudness-normalize the waveform.
        speech = vocoder(spectrogram)
    speech = speech.numpy().reshape(-1)
    speech_denoised = denoise_audio(speech, 16000)
    sf.write("in_speech.wav", speech_denoised, 16000)
    sound = AudioSegment.from_wav("in_speech.wav")
    normalized_sound = match_target_amplitude(sound, -20.0)
    normalized_sound.export("out_sound.wav", format="wav")
    sample_rate_out, audio_out = wavfile.read("out_sound.wav")
    assert sample_rate_out == 16_000
    return 16000, audio_out.reshape(-1).astype(np.int16)
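# Example call (hypothetical sentence):
# sr, wav = tts_fn(600, "سلام دنیا")  # -> (16000, int16 numpy array)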
def master_fn(slider_value, input_text):
    # Split the input into sentences on "." and synthesize each one,
    # then concatenate the chunks into a single waveform.
    if "." not in input_text:
        input_text += "."
    print(f"speaker_id = {slider_value}")
    all_speech = []
    for sentence in input_text.split("."):
        if sentence.strip():
            sampling_rate_response, audio_chunk_response = tts_fn(slider_value, sentence)
            all_speech.append(audio_chunk_response)
    audio_response = np.concatenate(all_speech)
    return sampling_rate_response, audio_response
# Gradio UI: a speaker-selection slider and a text input box.
slider = gr.Slider(
    minimum=0,
    maximum=len(dataset) - 1,
    value=600,
    step=1,
    label="Select a speaker (good examples: 600, 604, 910, 7, 13)",
)
text_input = gr.Textbox(
    label="Enter some text",
    placeholder="Type something here...",
)
demo = gr.Interface(
    fn=master_fn,
    inputs=[slider, text_input],  # speaker slider + text box
    outputs="audio",
)

demo.launch()
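# Tip: demo.launch(share=True) exposes a temporary public URL when running
# outside a hosted Space.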