|
import os
import wave

import gradio as gr
import numpy as np
import scipy.io.wavfile as wav
import soundfile as sf
import torch
from datasets import load_dataset, Audio, config
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
|
|
|
|
|
# One checkpoint supplies both the processor and the text-to-speech model.
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"

processor = SpeechT5Processor.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# Vocoder handed to model.generate_speech() in generate_audio().
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
|
|
def prepare_dataset(example):
    """Convert one dataset row into the features stored for that row.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"``
    entries) and ``example["sentence"]``. The sentence is passed through
    ``transString`` and then, together with the audio, through the
    module-level ``processor``.

    Returns:
        The processor output, with ``"labels"`` replaced by its first element
        and a ``"speaker_embeddings"`` entry added.
    """
    clip = example["audio"]
    waveform = clip["array"]

    features = processor(
        text=transString(example["sentence"]),
        audio_target=waveform,
        sampling_rate=clip["sampling_rate"],
        return_attention_mask=False,
    )

    # Keep only the first element of the labels (presumably the processor
    # returns them with a leading batch axis -- TODO confirm).
    features["labels"] = features["labels"][0]

    # NOTE(review): create_speaker_embedding is not defined in the visible
    # part of this file -- confirm it is provided elsewhere.
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features
|
|
|
|
|
# Credentials must never live in source control. The access token is read
# from the HF_TOKEN environment variable instead; the token that used to be
# hard-coded here has been exposed and should be revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    config.HF_DATASETS_CUSTOM_HEADERS = {
        "Authorization": "Bearer " + _hf_token
    }

from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
notebook_login()

test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")

# Ask the datasets library to deliver the audio column at 16000 Hz.
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

# NOTE(review): prepare_dataset calls transString, which this file only
# defines further down. As ordered, this map() runs before that definition
# has executed and would raise NameError -- confirm the intended order.
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
|
|
|
|
|
|
|
# Urdu/Arabic character -> Latin spelling, used by transString().
#
# Every key appears exactly once. The literal previously listed "\u0627"
# twice and mapped "\u0630" to both "A" and "Z"; Python keeps the last value
# for a repeated key, so the effective mapping ("\u0630" -> "Z") is unchanged.
# Several characters share one Latin spelling, so the table is not invertible
# without loss.
buck2uni = {
    "\u0627": "A",
    "\u0675": "A",
    "\u0673": "A",
    "\u0622": "AA",
    "\u0628": "B",
    "\u067E": "P",
    "\u062A": "T",
    "\u0637": "T",
    "\u0679": "T",
    "\u062C": "J",
    "\u0633": "S",
    "\u062B": "S",
    "\u0635": "S",
    "\u0686": "CH",
    "\u062D": "H",
    "\u0647": "H",
    "\u0629": "H",
    "\u06DF": "H",
    "\u062E": "KH",
    "\u062F": "D",
    "\u0688": "D",
    "\u0630": "Z",
    "\u0632": "Z",
    "\u0636": "Z",
    "\u0638": "Z",
    "\u068E": "Z",
    "\u0631": "R",
    "\u0691": "R",
    "\u0634": "SH",
    "\u063A": "GH",
    "\u0641": "F",
    "\u06A9": "K",
    "\u0642": "K",
    "\u06AF": "G",
    "\u0644": "L",
    "\u0645": "M",
    "\u0646": "N",
    "\u06BA": "N",
    "\u0648": "O",
    "\u0649": "Y",
    "\u0626": "Y",
    "\u06CC": "Y",
    "\u06D2": "E",
    "\u06C1": "H",
    "\u064A": "E",
    "\u06C2": "AH",
    "\u06BE": "H",
    "\u0639": "A",
    "\u0643": "K",
    "\u0621": "A",
    "\u0624": "O",
    # Arabic comma: mapped to the empty string, i.e. removed from the text.
    "\u060C": "",
}
|
|
|
def transString(string, reverse=0):
    """Transliterate Urdu text to Latin letters using ``buck2uni``, or back.

    Args:
        string: Text to convert.
        reverse: Falsy (the default) replaces each Urdu character with its
            Latin spelling; truthy replaces Latin spellings with Urdu
            characters.

    Returns:
        The converted string. Anything without an entry in ``buck2uni`` is
        left untouched.

    The reverse direction is lossy: several Urdu characters share one Latin
    spelling in ``buck2uni``, so each spelling maps back to the first
    character listed for it, and characters whose spelling is empty cannot be
    restored at all.
    """
    if not reverse:
        for urdu_char, latin in buck2uni.items():
            string = string.replace(urdu_char, latin)
        return string

    latin_to_urdu = {}
    for urdu_char, latin in buck2uni.items():
        # Skip empty spellings: str.replace("", x) would insert x between
        # every character of the text.
        if latin:
            latin_to_urdu.setdefault(latin, urdu_char)

    # Longest spellings first, so a digraph such as "SH" is consumed as a unit
    # before its single letters "S" and "H" are considered.
    for latin in sorted(latin_to_urdu, key=len, reverse=True):
        string = string.replace(latin, latin_to_urdu[latin])
    return string
|
|
|
|
|
def generate_audio(text, speaker_index=22):
    """Synthesise speech for ``text`` and return the generated waveform.

    The text is passed through ``transString``, tokenised by the module-level
    ``processor`` and handed to ``model.generate_speech`` together with a
    speaker embedding taken from ``test_dataset``.

    Args:
        text: Urdu text to speak.
        speaker_index: Row of ``test_dataset`` whose ``"speaker_embeddings"``
            entry is used for the voice. Defaults to 22, the row this
            function previously hard-coded.

    Returns:
        The value returned by ``model.generate_speech`` when called with
        ``vocoder``; callers in this file treat it as a tensor and call
        ``.numpy()`` on it.
    """
    roman_urdu = transString(text)
    token_ids = processor(text=roman_urdu, return_tensors="pt")["input_ids"]

    speaker_row = test_dataset[speaker_index]
    # unsqueeze(0) adds a leading axis of size 1 (presumably the batch axis
    # the model expects -- TODO confirm).
    speaker_embeddings = torch.tensor(speaker_row["speaker_embeddings"]).unsqueeze(0)

    return model.generate_speech(token_ids, speaker_embeddings, vocoder=vocoder)
|
|
|
def text_to_speech(text):
    """Synthesise ``text`` and wrap the waveform in an IPython audio widget.

    Calls ``generate_audio`` and returns an ``IPython.display.Audio`` built
    from the resulting samples with ``rate=16000``.
    """
    waveform = generate_audio(text)

    # Function-local import: at module level the name ``Audio`` refers to
    # ``datasets.Audio``, which must stay intact.
    from IPython.display import Audio

    return Audio(waveform.numpy(), rate=16000)
|
|
|
|
|
def _synthesize_for_gradio(text):
    """Run ``generate_audio`` and return ``(sample_rate, samples)``.

    A Gradio audio output takes a ``(sample_rate, numpy array)`` tuple or a
    file path; it cannot render the ``IPython.display.Audio`` object that
    ``text_to_speech`` returns. The 16000 Hz rate matches the rate
    ``text_to_speech`` uses for the same waveform.
    """
    waveform = generate_audio(text)
    return 16000, waveform.numpy()


# Top-level components: the gr.inputs / gr.outputs namespaces are deprecated
# in Gradio 3 and removed in Gradio 4.
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio")

interface = gr.Interface(fn=_synthesize_for_gradio, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()
|
|
|
|