import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processorf, SpeechT5HifiGan
import soundfile as sf
import gradio as gr
import scipy.io.wavfile as wav
import numpy as np

# Load the TTS model from the Hugging Face Hub
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"  # Replace with your actual model name
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Buckwalter to Unicode mapping
buck2uni = {
            u"\u0627":"A",
            u"\u0627":"A",
            u"\u0675":"A",
            u"\u0673":"A",
            u"\u0630":"A",
            u"\u0622":"AA",
            u"\u0628":"B",
            u"\u067E":"P",
            u"\u062A":"T",
            u"\u0637":"T",
            u"\u0679":"T",
            u"\u062C":"J",
            u"\u0633":"S",
            u"\u062B":"S",
            u"\u0635":"S",
            u"\u0686":"CH",
            u"\u062D":"H",
            u"\u0647":"H",
            u"\u0629":"H",
            u"\u06DF":"H",
            u"\u062E":"KH",
            u"\u062F":"D",
            u"\u0688":"D",
            u"\u0630":"Z",
            u"\u0632":"Z",
            u"\u0636":"Z",
            u"\u0638":"Z",
            u"\u068E":"Z",
            u"\u0631":"R",
            u"\u0691":"R",
            u"\u0634":"SH",
            u"\u063A":"GH",
            u"\u0641":"F",
            u"\u06A9":"K",
            u"\u0642":"K",
            u"\u06AF":"G",
            u"\u0644":"L",
            u"\u0645":"M",
            u"\u0646":"N",
            u"\u06BA":"N",
            u"\u0648":"O",
            u"\u0649":"Y",
            u"\u0626":"Y",
            u"\u06CC":"Y",

            u"\u06D2":"E",
            u"\u06C1":"H",
            u"\u064A":"E"  ,
            u"\u06C2":"AH"  ,
            u"\u06BE":"H"  ,
            u"\u0639":"A"  ,
            u"\u0643":"K" ,
            u"\u0621":"A",
            u"\u0624":"O",
            u"\u060C":"" #seperator ulta comma
}

def transString(string, reverse=0):
    """Given a Unicode string, transliterate into Buckwalter. To go from
    Buckwalter back to Unicode, set reverse=1"""
    for k, v in buck2uni.items():
        if not reverse:
            string = string.replace(k, v)
        else:
            string = string.replace(v, k)
    return string


def generate_audio(text):
    # Convert input text to Roman Urdu
    roman_urdu = transString(text)

    # Tokenize the input text
    inputs = tokenizer(roman_urdu, return_tensors="pt")

    # Generate speech from the SpeechT5 model
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    return speech


def text_to_speech(text):
    # Generate audio
    audio_output = generate_audio(text)

    # Save audio as a .wav file
    sf.write("output.wav", audio_output.numpy(), samplerate=16000)

    return "output.wav"


# Define the Gradio interface
inputs = gr.inputs.Textbox(label="Enter text in Urdu")
outputs = gr.outputs.Audio(label="Audio")

interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()