# urdu_TTS / app.py
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
import soundfile as sf
import gradio as gr
# Load the TTS model from the Hugging Face Hub
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl" # Replace with your actual model name
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
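
# SpeechT5 does not output a waveform by itself: it predicts a spectrogram and
# needs (a) a HiFi-GAN vocoder and (b) a 512-dim speaker x-vector to condition the
# voice. The checkpoints below are the standard examples from the SpeechT5 docs and
# are assumptions here; swap in whatever this Space actually uses.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

from datasets import load_dataset  # only needed for the example speaker embedding

# Index 7306 is the sample speaker commonly used in the SpeechT5 examples.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)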
# Mapping between Urdu/Arabic Unicode characters and their Buckwalter-style
# romanization (used in both directions by transString below)
buck2uni = {
    u"\u0627": "A",
    u"\u0675": "A",
    u"\u0673": "A",
    u"\u0630": "A",
    u"\u0622": "AA",
    # Rest of the mapping...
}

def transString(string, reverse=0):
    """Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1."""
    for k, v in buck2uni.items():
        if not reverse:
            string = string.replace(k, v)
        else:
            string = string.replace(v, k)
    return string
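
# Illustrative usage (assumes the full mapping above is filled in):
#   transString("آباد")      -> "AAbAd"   (Urdu script -> romanized)
#   transString("AAbAd", 1)  -> Urdu script again (approximate, since several
#                               Urdu characters share the same roman letter)
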
def generate_audio(text):
    # Convert the Urdu input to Roman Urdu (Buckwalter-style transliteration)
    roman_urdu = transString(text)
    # Tokenize the romanized text with the SpeechT5 processor
    inputs = processor(text=roman_urdu, return_tensors="pt")
    # Generate speech: the model predicts a spectrogram and the vocoder
    # converts it into a waveform
    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
    return speech.numpy()

def text_to_speech(text):
    # Generate the audio waveform
    audio_output = generate_audio(text)
    # Save as a .wav file; SpeechT5's HiFi-GAN vocoder outputs 16 kHz audio
    sf.write("output.wav", audio_output, samplerate=16000)
    return "output.wav"

# Define the Gradio interface
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio", type="filepath")
interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()