import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
import soundfile as sf
import gradio as gr
import scipy.io.wavfile as wav
import numpy as np
import wave
from datasets import load_dataset, Audio, config

# Hugging Face Hub id of the fine-tuned SpeechT5 text-to-speech checkpoint.
checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl"  # Replace with your actual model name

# The processor and its tokenizer are loaded from the same checkpoint as the model.
processor = SpeechT5Processor.from_pretrained(checkpoint)
tokenizer = processor.tokenizer
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# Vocoder handed to model.generate_speech() when audio is synthesized.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

def prepare_dataset(example):
    """Convert one dataset row into the features used for SpeechT5.

    Reads ``example["sentence"]`` and ``example["audio"]`` (the latter must
    provide ``"array"`` and ``"sampling_rate"``), romanizes the sentence with
    ``transString`` and runs the module-level ``processor`` on the text and
    the audio target.

    Returns the processor output with two entries rewritten/added:
    ``"labels"`` (first element of the processor's labels) and
    ``"speaker_embeddings"``.

    NOTE(review): ``create_speaker_embedding`` is not defined in the visible
    part of this file; it has to be provided elsewhere before this function
    can run. The original comment says it uses SpeechBrain to produce an
    x-vector -- confirm against its definition.
    """
    clip = example["audio"]
    romanized = transString(example["sentence"])

    features = processor(
        text=romanized,
        audio_target=clip["array"],
        sampling_rate=clip["sampling_rate"],
        return_attention_mask=False,
    )

    # Keep only the first entry of the labels (drops the batch dimension).
    features["labels"] = features["labels"][0]

    # Speaker embedding computed from the raw waveform of this row.
    features["speaker_embeddings"] = create_speaker_embedding(clip["array"])

    return features

# Authenticate against the Hugging Face Hub.
#
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the source file: a token committed to a repository is
# leaked to everyone who can read it and must be revoked and re-issued.
import os

from huggingface_hub import login, notebook_login

_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    config.HF_DATASETS_CUSTOM_HEADERS = {
        "Authorization": f"Bearer {_hf_token}"
    }
    # Non-interactive login with the supplied token.
    login(token=_hf_token)
else:
    # No token in the environment: fall back to the interactive prompt.
    notebook_login()

# Urdu ("ur") test split of Common Voice 13, with audio cast to 16 kHz and
# every row converted by prepare_dataset (the original columns are dropped).
# NOTE(review): as the file is laid out, this map() call executes before
# transString is defined further down, so prepare_dataset would raise
# NameError at this point -- confirm the intended statement order.
test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)


# Urdu/Arabic-script character -> Roman (ASCII) replacement used by transString.
# Despite the name, the keys are the Unicode characters and the values are the
# Roman strings they are replaced with.
#
# Each key appears exactly once. The source previously listed U+0627 twice and
# mapped U+0630 to "A" before overriding it with "Z"; only the last value of a
# repeated key takes effect in a dict literal, so the resulting mapping is
# unchanged by removing the dead entries.
buck2uni = {
    "\u0627": "A",
    "\u0675": "A",
    "\u0673": "A",
    "\u0622": "AA",
    "\u0628": "B",
    "\u067E": "P",
    "\u062A": "T",
    "\u0637": "T",
    "\u0679": "T",
    "\u062C": "J",
    "\u0633": "S",
    "\u062B": "S",
    "\u0635": "S",
    "\u0686": "CH",
    "\u062D": "H",
    "\u0647": "H",
    "\u0629": "H",
    "\u06DF": "H",
    "\u062E": "KH",
    "\u062F": "D",
    "\u0688": "D",
    "\u0630": "Z",
    "\u0632": "Z",
    "\u0636": "Z",
    "\u0638": "Z",
    "\u068E": "Z",
    "\u0631": "R",
    "\u0691": "R",
    "\u0634": "SH",
    "\u063A": "GH",
    "\u0641": "F",
    "\u06A9": "K",
    "\u0642": "K",
    "\u06AF": "G",
    "\u0644": "L",
    "\u0645": "M",
    "\u0646": "N",
    "\u06BA": "N",
    "\u0648": "O",
    "\u0649": "Y",
    "\u0626": "Y",
    "\u06CC": "Y",
    "\u06D2": "E",
    "\u06C1": "H",
    "\u064A": "E",
    "\u06C2": "AH",
    "\u06BE": "H",
    "\u0639": "A",
    "\u0643": "K",
    "\u0621": "A",
    "\u0624": "O",
    "\u060C": "",  # separator (Arabic comma) is dropped
}

def transString(string, reverse=0):
    """Transliterate *string* using the module-level ``buck2uni`` table.

    With ``reverse`` falsy (the default) every key of ``buck2uni`` found in
    *string* is replaced by its Roman value. With ``reverse`` truthy the
    Roman values are replaced by table keys instead.

    The reverse direction is lossy because several keys share one Roman
    value; the first key listed in ``buck2uni`` for a value is the one used.
    Entries whose value is the empty string are ignored when reversing, and
    longer Roman tokens are replaced before shorter ones so that e.g. "SH"
    is not consumed as "S" followed by "H".

    Returns the transliterated string.
    """
    if not reverse:
        for source_char, roman in buck2uni.items():
            string = string.replace(source_char, roman)
        return string

    # Build Roman -> first matching key. Empty values are skipped because
    # str.replace("", x) would insert x between every character.
    roman_to_source = {}
    for source_char, roman in buck2uni.items():
        if roman:
            roman_to_source.setdefault(roman, source_char)

    # sorted() is stable, so tokens of equal length keep their table order.
    for roman in sorted(roman_to_source, key=len, reverse=True):
        string = string.replace(roman, roman_to_source[roman])
    return string


def generate_audio(text, speaker_index=22):
    """Synthesize speech for Urdu *text* with the module-level SpeechT5 model.

    Args:
        text: Urdu-script input; it is romanized with ``transString`` before
            being passed to ``processor``.
        speaker_index: Row of ``test_dataset`` whose ``"speaker_embeddings"``
            entry conditions the voice. Defaults to 22, the row that was
            previously hard-coded.

    Returns:
        The result of ``model.generate_speech(...)`` called with ``vocoder``
        (callers in this file invoke ``.numpy()`` on it).
    """
    # Convert the input text to Roman Urdu, then tokenize it.
    roman_urdu = transString(text)
    inputs = processor(text=roman_urdu, return_tensors="pt")

    # Speaker embedding of the chosen reference row, with a leading axis
    # added by unsqueeze(0).
    reference = test_dataset[speaker_index]
    speaker_embeddings = torch.tensor(reference["speaker_embeddings"]).unsqueeze(0)

    return model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

def text_to_speech(text):
    """Gradio callback: synthesize *text* and return it as playable audio.

    Returns a ``(sample_rate, samples)`` tuple, the in-memory format a Gradio
    audio output accepts. An ``IPython.display.Audio`` object (returned
    previously) only renders inside a notebook and is not a value the Gradio
    component can post-process.
    """
    # Same rate that was passed to the audio player before and that the
    # dataset audio is cast to.
    sample_rate = 16000

    waveform = generate_audio(text)

    return sample_rate, waveform.numpy()

# Define the Gradio interface.
# Components are taken from the top-level gradio namespace; the gr.inputs /
# gr.outputs namespaces are deprecated and absent from newer Gradio releases.
inputs = gr.Textbox(label="Enter text in Urdu")
outputs = gr.Audio(label="Audio")

interface = gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs, title="Urdu TTS")
interface.launch()