"""Gradio demo: Urdu text-to-speech with a fine-tuned SpeechT5 checkpoint.

Urdu input is transliterated to Roman characters, tokenized, and fed to
SpeechT5; the HiFi-GAN vocoder turns the generated spectrogram into a waveform.
"""

import os
import wave

import gradio as gr
import numpy as np
import scipy.io.wavfile as wav
import soundfile as sf
import torch
from datasets import Audio, config, load_dataset
from huggingface_hub import notebook_login
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sampling rate used both when resampling the dataset audio and when handing
# the generated waveform to the UI.
SAMPLE_RATE = 16000

# Row of the test split whose speaker embedding is used for every request.
SPEAKER_EXAMPLE_INDEX = 22

# Load the TTS model from the Hugging Face Hub
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"  # Replace with your actual model name
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Urdu (Arabic-script) character -> Roman transliteration.
# Defined before the dataset is mapped because prepare_dataset() relies on it.
# Duplicate keys from the earlier revision were removed; where a key appeared
# twice, the value that was actually in effect (the last one) is kept.
buck2uni = {
    "\u0627": "A",
    "\u0675": "A",
    "\u0673": "A",
    "\u0622": "AA",
    "\u0628": "B",
    "\u067E": "P",
    "\u062A": "T",
    "\u0637": "T",
    "\u0679": "T",
    "\u062C": "J",
    "\u0633": "S",
    "\u062B": "S",
    "\u0635": "S",
    "\u0686": "CH",
    "\u062D": "H",
    "\u0647": "H",
    "\u0629": "H",
    "\u06DF": "H",
    "\u062E": "KH",
    "\u062F": "D",
    "\u0688": "D",
    "\u0630": "Z",
    "\u0632": "Z",
    "\u0636": "Z",
    "\u0638": "Z",
    "\u068E": "Z",
    "\u0631": "R",
    "\u0691": "R",
    "\u0634": "SH",
    "\u063A": "GH",
    "\u0641": "F",
    "\u06A9": "K",
    "\u0642": "K",
    "\u06AF": "G",
    "\u0644": "L",
    "\u0645": "M",
    "\u0646": "N",
    "\u06BA": "N",
    "\u0648": "O",
    "\u0649": "Y",
    "\u0626": "Y",
    "\u06CC": "Y",
    "\u06D2": "E",
    "\u06C1": "H",
    "\u064A": "E",
    "\u06C2": "AH",
    "\u06BE": "H",
    "\u0639": "A",
    "\u0643": "K",
    "\u0621": "A",
    "\u0624": "O",
    "\u060C": "",  # Arabic comma (separator) is dropped
}

# Every key is a single code point, so one translate() pass is equivalent to
# the chain of str.replace() calls it replaces.
_URDU_TO_ROMAN = str.maketrans(buck2uni)


def transString(string, reverse=0):
    """Transliterate ``string`` using the ``buck2uni`` table.

    Args:
        string: Text to convert.
        reverse: Falsy (default) maps Urdu characters to their Roman
            equivalents. Truthy applies the table backwards (Roman -> Urdu).

    Returns:
        The converted string.

    Note:
        The reverse direction is lossy: many Urdu characters share one Roman
        value, so the first table entry for a value wins, and multi-letter
        values whose prefix is also a value (e.g. "SH" vs "S") are never
        matched as a unit.
    """
    if not reverse:
        return string.translate(_URDU_TO_ROMAN)

    for urdu_char, roman in buck2uni.items():
        # An empty Roman value would make str.replace("", ...) insert the
        # character between every pair of characters, so skip it.
        if roman:
            string = string.replace(roman, urdu_char)
    return string


def prepare_dataset(example):
    """Convert one dataset row into model inputs plus a speaker embedding.

    Args:
        example: A dataset row with an ``"audio"`` entry (providing ``"array"``
            and ``"sampling_rate"``) and a ``"sentence"`` entry.

    Returns:
        The processor output with ``"labels"`` stripped of its batch dimension
        and a ``"speaker_embeddings"`` entry added.
    """
    audio = example["audio"]
    example = processor(
        text=transString(example["sentence"]),
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # strip off the batch dimension
    example["labels"] = example["labels"][0]

    # NOTE(review): create_speaker_embedding is not defined in this file as
    # seen; it must be provided (the original comment says it uses SpeechBrain
    # to obtain an x-vector) before the dataset is mapped -- confirm.
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example


# Set the authentication token.
# The token is read from the environment instead of being committed to source.
# The token that was previously hard-coded here must be considered leaked and
# should be revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    config.HF_DATASETS_CUSTOM_HEADERS = {"Authorization": f"Bearer {_hf_token}"}

notebook_login()

test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)

# The same speaker embedding is used for every request, so load it once
# instead of re-reading the dataset row on each call.
_SPEAKER_EMBEDDINGS = torch.tensor(
    test_dataset[SPEAKER_EXAMPLE_INDEX]["speaker_embeddings"]
).unsqueeze(0)


def generate_audio(text):
    """Synthesize speech for Urdu ``text``.

    Args:
        text: Urdu text to speak.

    Returns:
        The value returned by ``model.generate_speech`` when called with the
        vocoder (a tensor holding the generated waveform).
    """
    # Convert input text to Roman Urdu
    roman_urdu = transString(text)

    # Tokenize the input text
    inputs = processor(text=roman_urdu, return_tensors="pt")

    # Generate audio from the SpeechT5 model
    return model.generate_speech(inputs["input_ids"], _SPEAKER_EMBEDDINGS, vocoder=vocoder)


def text_to_speech(text):
    """Gradio callback: turn Urdu ``text`` into playable audio.

    Args:
        text: Urdu text entered in the UI.

    Returns:
        A ``(sample_rate, samples)`` tuple, the in-memory format the Gradio
        Audio output component accepts. (An ``IPython.display.Audio`` object
        only renders inside a notebook and is not usable by Gradio.)
    """
    audio_output = generate_audio(text)
    return SAMPLE_RATE, audio_output.detach().cpu().numpy()


# Define the Gradio interface.
# gr.Textbox / gr.Audio replace the gr.inputs / gr.outputs namespaces, which
# were deprecated in Gradio 3 and removed in Gradio 4.
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio")

interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()