from io import BytesIO
from urllib.request import urlopen

import numpy as np
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer


# Checkpoint identifier passed to both ``from_pretrained`` calls below; the
# model and tokenizer are loaded once, at import time, and reused by
# generate_audio().
# NOTE(review): the checkpoint name suggests a SpeechT5 text-to-speech
# fine-tune, yet it is loaded through the Wav2Vec2 CTC classes, which the
# transformers documentation describes as speech-to-text -- confirm that the
# checkpoint and the classes actually belong together.
model_name = "jjyaoao/speecht5_finetuned_voxpopuli_nl"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)


# Transliteration table: one Arabic-script code point -> its Latin
# (Roman Urdu) replacement text, as consumed by transString().
# Several code points share the same Latin value, so the table is lossy and
# cannot be inverted unambiguously.
buck2uni = {
    "\u0627": "A",
    "\u0675": "A",
    "\u0673": "A",
    "\u0622": "AA",
    "\u0628": "B",
    "\u067E": "P",
    "\u062A": "T",
    "\u0637": "T",
    "\u0679": "T",
    "\u062C": "J",
    "\u0633": "S",
    "\u062B": "S",
    "\u0635": "S",
    "\u0686": "CH",
    "\u062D": "H",
    "\u0647": "H",
    "\u0629": "H",
    "\u06DF": "H",
    "\u062E": "KH",
    "\u062F": "D",
    "\u0688": "D",
    # U+0630 used to be listed twice ("A" near the top, "Z" here). A repeated
    # dict key keeps the last value, so "Z" was always the effective mapping;
    # the dead "A" entry has been removed.
    "\u0630": "Z",
    "\u0632": "Z",
    "\u0636": "Z",
    "\u0638": "Z",
    "\u068E": "Z",
    "\u0631": "R",
    "\u0691": "R",
    "\u0634": "SH",
    "\u063A": "GH",
    "\u0641": "F",
    "\u06A9": "K",
    "\u0642": "K",
    "\u06AF": "G",
    "\u0644": "L",
    "\u0645": "M",
    "\u0646": "N",
    "\u06BA": "N",
    "\u0648": "O",
    "\u0649": "Y",
    "\u0626": "Y",
    "\u06CC": "Y",
    "\u06D2": "E",
    "\u06C1": "H",
    "\u064A": "E",
    "\u06C2": "AH",
    "\u06BE": "H",
    "\u0639": "A",
    "\u0643": "K",
    "\u0621": "A",
    "\u0624": "O",
    # U+060C (Arabic comma) maps to the empty string, i.e. it is deleted.
    "\u060C": "",
}


def transString(string: str, reverse: int = 0) -> str:
    """Transliterate *string* using the module-level ``buck2uni`` table.

    By default every character that is a key of ``buck2uni`` is replaced by
    its Latin value; characters not in the table are left untouched.

    With a truthy *reverse* the table is applied in the opposite direction
    (Latin value -> script character). Because several script characters
    share one Latin value and replacements are applied in table order, the
    reverse direction is only approximate and does not round-trip.

    Args:
        string: Text to transliterate.
        reverse: Falsy for script -> Latin (default), truthy for
            Latin -> script.

    Returns:
        The transliterated text.
    """
    if not reverse:
        for script_char, latin in buck2uni.items():
            string = string.replace(script_char, latin)
        return string

    for script_char, latin in buck2uni.items():
        # Skip empty Latin values: str.replace("", x) would insert x between
        # every character of the string instead of reversing a deletion.
        if latin:
            string = string.replace(latin, script_char)
    return string


def generate_audio(text):
    """Romanize *text* and run it through the module-level model.

    The text is transliterated with transString(), passed to the module-level
    ``tokenizer``, fed to ``model`` without gradient tracking, and the
    arg-max ids of the first batch item are decoded with the tokenizer.

    NOTE(review): despite the function name, the value returned is whatever
    ``tokenizer.decode`` produces. With the Wav2Vec2 classes loaded at module
    level that is presumably a text string, not audio samples, and the
    tokenizer call presumably expects raw audio rather than text -- confirm
    the intended model pipeline (text-to-speech vs. speech-to-text).

    Args:
        text: Input text in Urdu script.

    Returns:
        The result of decoding the model's highest-scoring ids.
    """
    roman_urdu = transString(text)

    inputs = tokenizer(roman_urdu, return_tensors="pt").input_values

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(inputs).logits

    # Pick the highest-scoring id along the last axis.
    predicted_ids = torch.argmax(logits, dim=-1)
    audio = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

    return audio


def main():
    """Prompt for Urdu text, generate output for it and save ``output.wav``.

    Reads one line from standard input, passes it to generate_audio(), writes
    the result to ``output.wav`` in the current directory and prints a
    confirmation message.
    """
    input_text_urdu = input("Enter text in Urdu: ")

    audio_output = generate_audio(input_text_urdu)

    # NOTE(review): the 22050 Hz sample rate is hard-coded; confirm it matches
    # what the model actually produces.
    sf.write("output.wav", audio_output, samplerate=22050)

    print("Audio generated and saved as 'output.wav'")


# Start the interactive prompt only when the file is executed as a script.
if __name__ == "__main__":
    main()