from typing import  Dict, List, Text, Any
import re
from transformers import SpeechT5ForTextToSpeech
from transformers import SpeechT5Processor
from transformers import SpeechT5HifiGan
import soundfile
import torch
import numpy as np

# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    raise ValueError("need to run on GPU")
# set mixed precision dtype
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16


class EndpointHandler():
    def __init__(self, path=""):
        # Load all required models 
        self.model_id = "Oysiyl/speecht5_tts_common_voice_uk"
        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.model_id, torch_dtype=dtype).to(device)
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
        self.speaker_embeddings = torch.tensor(np.load("embed.npy"), dtype=dtype).to(device)
    
    @staticmethod
    def remove_special_characters_s(text: Text) -> Text:
        chars_to_remove_regex = '[\…\–\"\“\%\‘\”\�\»\«\„\`\'́]'
        # remove special characters
        text = re.sub(chars_to_remove_regex, '', text)
        text = re.sub("՚", "'", text)
        text = re.sub("’", "'", text)
        text = re.sub(r'ы', 'и', text)
        text = text.lower()
        return text

    @staticmethod
    def cyrillic_to_latin(text: Text) -> Text:
        replacements = [
            ('а', 'a'),
            ('б', 'b'),
            ('в', 'v'),
            ('г', 'h'),
            ('д', 'd'),
            ('е', 'e'),
            ('ж', 'zh'),
            ('з', 'z'),
            ('и', 'y'),
            ('й', 'j'),
            ('к', 'k'),
            ('л', 'l'),
            ('м', 'm'),
            ('н', 'n'),
            ('о', 'o'),
            ('п', 'p'),
            ('р', 'r'),
            ('с', 's'),
            ('т', 't'),
            ('у', 'u'),
            ('ф', 'f'),
            ('х', 'h'),
            ('ц', 'ts'),
            ('ч', 'ch'),
            ('ш', 'sh'),
            ('щ', 'sch'),
            ('ь', "'"),
            ('ю', 'ju'),
            ('я', 'ja'),
            ('є', 'je'),
            ('і', 'i'),
            ('ї', 'ji'),
            ('ґ', 'g')
        ]

        for src, dst in replacements:
            text = text.replace(src, dst)
        return text

    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
        """
        :param data: A dictionary contains `inputs`.
        :return: A dictionary with `image` field contains image in base64.
        """
        text = data.pop("inputs", None)
        
        # Check if text is not provided
        if text is None:
            return {"error": "Please provide a text."}        
     
        # run inference pipeline
        text = self.remove_special_characters_s(text)
        text = self.cyrillic_to_latin(text)
        input_ids = self.processor(text=text, return_tensors="pt")['input_ids'].to(device)
        spectrogram = self.model.generate_speech(input_ids, self.speaker_embeddings)
        with torch.no_grad():
            speech = self.vocoder(spectrogram)
        if device.type != 'cuda':
            out = speech.numpy()
        else:
            out = speech.cpu().numpy()

        # return output audio in numpy format
        return out