"""Romanize Urdu text and synthesize it to ``output.wav`` with a SpeechT5 model."""

import re
from io import BytesIO
from urllib.request import urlopen

import numpy as np
import soundfile as sf
import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    Wav2Vec2ForCTC,
    Wav2Vec2Tokenizer,
)

# Load the TTS model from the Hugging Face Hub
model_name = "arham061/speecht5_finetuned_voxpopuli_nl"  # Replace with your actual model name
vocoder_name = "microsoft/speecht5_hifigan"

# Judging by its name the checkpoint is a SpeechT5 text-to-speech model, so it
# is loaded with the SpeechT5 classes. The Wav2Vec2 CTC classes used before are
# speech-recognition (audio -> text) components and cannot emit a waveform.
model = SpeechT5ForTextToSpeech.from_pretrained(model_name)
processor = SpeechT5Processor.from_pretrained(model_name)
tokenizer = processor  # alias: keeps the original module-level name resolvable
# SpeechT5 predicts a spectrogram; the HiFi-GAN vocoder turns it into samples.
vocoder = SpeechT5HifiGan.from_pretrained(vocoder_name)

# Arabic-script (Urdu) character -> Latin romanization.
# NOTE: despite its name, this table maps Unicode *to* the romanized form.
buck2uni = {
    "\u0627": "A",
    "\u0675": "A",
    "\u0673": "A",
    "\u0622": "AA",
    "\u0628": "B",
    "\u067E": "P",
    "\u062A": "T",
    "\u0637": "T",
    "\u0679": "T",
    "\u062C": "J",
    "\u0633": "S",
    "\u062B": "S",
    "\u0635": "S",
    "\u0686": "CH",
    "\u062D": "H",
    "\u0647": "H",
    "\u0629": "H",
    "\u06DF": "H",
    "\u062E": "KH",
    "\u062F": "D",
    "\u0688": "D",
    # "\u0630" was previously also listed with the value "A"; that earlier
    # entry was dead because this later one overrode it in the dict literal.
    "\u0630": "Z",
    "\u0632": "Z",
    "\u0636": "Z",
    "\u0638": "Z",
    "\u068E": "Z",
    "\u0631": "R",
    "\u0691": "R",
    "\u0634": "SH",
    "\u063A": "GH",
    "\u0641": "F",
    "\u06A9": "K",
    "\u0642": "K",
    "\u06AF": "G",
    "\u0644": "L",
    "\u0645": "M",
    "\u0646": "N",
    "\u06BA": "N",
    "\u0648": "O",
    "\u0649": "Y",
    "\u0626": "Y",
    "\u06CC": "Y",
    "\u06D2": "E",
    "\u06C1": "H",
    "\u064A": "E",
    "\u06C2": "AH",
    "\u06BE": "H",
    "\u0639": "A",
    "\u0643": "K",
    "\u0621": "A",
    "\u0624": "O",
    "\u060C": "",  # Arabic comma: dropped from the romanized text
}


def _build_reverse_table():
    """Return a romanization -> Unicode dict derived from ``buck2uni``.

    Several characters share one romanization, so the first character listed
    in ``buck2uni`` wins. Empty romanizations are skipped because an empty
    string cannot be searched for.
    """
    table = {}
    for uni_char, roman in buck2uni.items():
        if roman:
            table.setdefault(roman, uni_char)
    return table


# Single-pass forward table: every key in buck2uni is one character.
_FORWARD_TABLE = str.maketrans(buck2uni)
_REVERSE_TABLE = _build_reverse_table()
# Longest alternatives first so digraphs ("KH", "SH", ...) beat single letters.
_REVERSE_PATTERN = re.compile(
    "|".join(
        re.escape(roman)
        for roman in sorted(_REVERSE_TABLE, key=len, reverse=True)
    )
)


def transString(string, reverse=0):
    """Transliterate between Arabic-script text and its Latin romanization.

    Args:
        string: Text to convert.
        reverse: Falsy (default) converts Arabic-script characters to their
            romanization. Truthy converts romanized text back to Arabic script.

    Returns:
        The converted string. Characters without a mapping are left untouched.

    The reverse direction is lossy: many characters share a romanization, so
    each romanization maps back to the first character listed for it in
    ``buck2uni``, and the Arabic comma (romanized to nothing) is not restored.
    """
    if not reverse:
        return string.translate(_FORWARD_TABLE)
    return _REVERSE_PATTERN.sub(
        lambda match: _REVERSE_TABLE[match.group(0)], string
    )


def generate_audio(text, speaker_embeddings=None):
    """Synthesize speech for Urdu ``text`` and return the waveform.

    Args:
        text: Urdu text in Arabic script; it is romanized with ``transString``
            before being handed to the model.
        speaker_embeddings: Optional speaker embedding tensor of shape
            ``(1, model.config.speaker_embedding_dim)``. When omitted, an
            all-zero placeholder is used.

    Returns:
        A 1-D NumPy float array of audio samples at
        ``vocoder.config.sampling_rate`` Hz.

    Raises:
        ValueError: If ``text`` is empty after transliteration.
    """
    # Convert input text to Roman Urdu
    roman_urdu = transString(text)
    if not roman_urdu.strip():
        raise ValueError("text is empty after transliteration")

    # Tokenize the romanized text
    inputs = processor(text=roman_urdu, return_tensors="pt")

    if speaker_embeddings is None:
        # NOTE(review): placeholder so the function runs without an external
        # x-vector source; voice quality with it is unverified. Pass a real
        # speaker embedding for production use.
        speaker_embeddings = torch.zeros(
            (1, model.config.speaker_embedding_dim)
        )

    # Generate the waveform (spectrogram -> vocoder) without tracking gradients
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    return speech.cpu().numpy()


# Example usage
def main():
    """Prompt for Urdu text, synthesize it, and save it as ``output.wav``."""
    # Get input text in Urdu
    input_text_urdu = input("Enter text in Urdu: ")

    # Generate audio
    try:
        audio_output = generate_audio(input_text_urdu)
    except ValueError as exc:
        print(f"Cannot generate audio: {exc}")
        return

    # Save audio as a .wav file at the rate the vocoder actually produces;
    # a mismatched rate would play back at the wrong speed and pitch.
    sf.write(
        "output.wav", audio_output, samplerate=vocoder.config.sampling_rate
    )
    print("Audio generated and saved as 'output.wav'")


if __name__ == "__main__":
    main()