|
import gradio as gr
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
|
|
|
|
|
# Hub identifier shared by the processor and the acoustic model.
# NOTE(review): the checkpoint name mentions "voxpopuli_nl" while the UI below
# is labelled as Urdu TTS -- confirm this is the intended model.
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"

# Load the text-to-speech model and its matching processor from the same
# checkpoint; this happens once, at import time.
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
processor = SpeechT5Processor.from_pretrained(checkpoint)

# Convenience alias for the text tokenizer bundled inside the processor.
tokenizer = processor.tokenizer
|
|
|
|
|
|
|
# Maps Arabic-script code points to the roman letters fed to the tokenizer.
# Several source characters share the same roman code, so the reverse
# direction is inherently lossy (see transString).
buck2uni = {
    "\u0627": "A",  # ARABIC LETTER ALEF
    "\u0675": "A",  # ARABIC LETTER HIGH HAMZA ALEF
    "\u0673": "A",  # ARABIC LETTER ALEF WITH WAVY HAMZA BELOW
    # NOTE(review): U+0630 is ARABIC LETTER THAL; mapping it to "A" looks
    # unintentional -- confirm against the intended romanization scheme.
    "\u0630": "A",
    "\u0622": "AA",  # ARABIC LETTER ALEF WITH MADDA ABOVE
}


def transString(string, reverse=0):
    """Transliterate *string* using the ``buck2uni`` table.

    Args:
        string: Text to convert.
        reverse: Falsy (default) converts Arabic-script characters to their
            roman codes. Truthy converts roman codes back to Arabic script.

    Returns:
        The converted string. Characters/codes not present in the table are
        left untouched.

    Note:
        The reverse mapping is ambiguous where several characters share one
        roman code; the first table entry carrying that code wins.
    """
    if not reverse:
        for source, target in buck2uni.items():
            string = string.replace(source, target)
        return string

    # Restore the longest codes first. Otherwise "A" would be consumed before
    # "AA" is ever looked at, and two-letter codes could never round-trip.
    # The sort is stable, so ties keep the table's insertion order.
    longest_first = sorted(buck2uni.items(), key=lambda item: -len(item[1]))
    for source, target in longest_first:
        string = string.replace(target, source)
    return string
|
|
|
|
|
# Vocoder that turns the model's spectrogram into a waveform.
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
_vocoder = None


def _get_vocoder():
    """Load the HiFi-GAN vocoder on first use and reuse it afterwards."""
    global _vocoder
    if _vocoder is None:
        _vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
    return _vocoder


def generate_audio(text):
    """Synthesize speech for *text* and return the waveform.

    The text is first romanized with ``transString``, tokenized, and passed to
    ``SpeechT5ForTextToSpeech.generate_speech`` together with a vocoder.

    Args:
        text: Input text as entered by the user.

    Returns:
        A 1-D NumPy float array holding the generated waveform.
    """
    roman_urdu = transString(text)

    # The SpeechT5 tokenizer yields ``input_ids``; ``input_values`` only exists
    # for audio inputs.
    input_ids = tokenizer(roman_urdu, return_tensors="pt")["input_ids"]

    # generate_speech requires a speaker embedding. A zero vector is used as a
    # neutral placeholder.
    # NOTE(review): replace with a real x-vector for the desired voice if one
    # is available -- output quality depends on it.
    speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))

    with torch.no_grad():
        speech = model.generate_speech(
            input_ids,
            speaker_embeddings=speaker_embeddings,
            vocoder=_get_vocoder(),
        )

    return speech.cpu().numpy()
|
|
|
|
|
# SpeechT5 with the HiFi-GAN vocoder produces 16 kHz audio; writing the file
# with any other rate changes playback speed and pitch.
_OUTPUT_SAMPLE_RATE = 16000


def text_to_speech(text):
    """Synthesize *text* to a WAV file and return the file's path.

    Args:
        text: Input text as entered by the user.

    Returns:
        The path of the written WAV file (``"output.wav"``), which the Gradio
        audio component plays back.
    """
    audio_output = generate_audio(text)

    # NOTE: the fixed file name is overwritten on every call.
    sf.write("output.wav", audio_output, samplerate=_OUTPUT_SAMPLE_RATE)

    return "output.wav"
|
|
|
|
|
|
|
# Components are taken from the top-level ``gr`` namespace: the legacy
# ``gr.inputs`` / ``gr.outputs`` modules are deprecated and no longer exist in
# current Gradio releases.
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio")

# Wire the text box to the synthesizer and play back the returned WAV path.
interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")

interface.launch()
|
|