import torch
import soundfile as sf
import gradio as gr
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Load the fine-tuned TTS model from the Hugging Face Hub
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"  # Replace with your actual model name
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
tokenizer = processor.tokenizer

# HiFi-GAN vocoder that converts the model's predicted spectrograms into a waveform
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# SpeechT5 expects a speaker embedding (x-vector); load one from the
# CMU ARCTIC x-vector dataset as an example voice
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Mapping between Urdu (Unicode) characters and their Buckwalter romanization
buck2uni = {
    u"\u0627": "A",
    u"\u0675": "A",
    u"\u0673": "A",
    u"\u0630": "A",
    u"\u0622": "AA",
    # Rest of the mapping...
}


def transString(string, reverse=0):
    """Given a Unicode string, transliterate into Buckwalter.

    To go from Buckwalter back to Unicode, set reverse=1.
    """
    for k, v in buck2uni.items():
        if not reverse:
            string = string.replace(k, v)
        else:
            string = string.replace(v, k)
    return string


def generate_audio(text):
    # Transliterate the Urdu input into Buckwalter (Roman Urdu)
    roman_urdu = transString(text)

    # Tokenize the transliterated text
    input_ids = tokenizer(roman_urdu, return_tensors="pt").input_ids

    # Generate the speech waveform; the vocoder turns the predicted
    # spectrogram into audio samples
    with torch.no_grad():
        speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    return speech


def text_to_speech(text):
    # Generate audio
    audio_output = generate_audio(text)

    # Save audio as a .wav file (SpeechT5 produces 16 kHz audio)
    sf.write("output.wav", audio_output.numpy(), samplerate=16000)

    return "output.wav"


# Define the Gradio interface
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio")

interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()
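
# --- Optional direct usage (illustrative sketch) ---
# These commented lines show how to call text_to_speech() without the Gradio UI,
# assuming the model, vocoder, and speaker-embedding downloads above succeeded.
# The Urdu sample string is only an example input, not part of the app itself.
# if __name__ == "__main__":
#     wav_path = text_to_speech("سلام")  # writes output.wav and returns its path
#     print("Saved:", wav_path)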