"""Gradio app: talk to ChatGPT and hear its reply in a cloned voice.

Pipeline: microphone audio -> Whisper (speech-to-text) -> ChatGPT ->
voice-cloning TTS (speaker encoder + synthesizer + HiFi-GAN vocoder) ->
VoiceFixer restoration -> MetricGAN+ enhancement -> enhanced wav in the UI.
"""

from TTS.api import TTS
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)  # YourTTS model (loaded here but not referenced again in this script)

import whisper
model = whisper.load_model("small")  # Whisper ASR model for speech-to-text

import os
os.system('pip install voicefixer --upgrade')  # install/upgrade voicefixer at runtime before importing it
from voicefixer import VoiceFixer
voicefixer = VoiceFixer()

import gradio as gr
import openai
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# MetricGAN+ speech enhancement model (runs on GPU)
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)

import re
import random
import string
import librosa
import numpy as np
from pathlib import Path
from scipy.io.wavfile import write

# Voice-cloning components: speaker encoder, synthesizer, and HiFi-GAN vocoder
from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer

# System prompts for the three assistant personas
mes1 = [
    {"role": "system", "content": "You are a TOEFL examiner. Help me improve my oral English and give me feedback."}
]
mes2 = [
    {"role": "system", "content": "You are a mental health therapist. Your name is Tina."}
]
mes3 = [
    {"role": "system", "content": "You are my personal assistant. Your name is Alice."}
]

res = []  # transcripts of the user's spoken input, one per call


class Mandarin:
    """Voice cloning: embed a reference voice, then synthesize speech in that voice."""

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # Mandarin ("普通话") synthesizer checkpoint

        synthesizers_cache = {}
        if synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        # Embed the reference recording so synthesized speech matches its timbre
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    def say(self, text):
        # Split the text at newlines and punctuation so each clause is synthesized separately
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"
        processed_texts = []
        for line in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', line).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts

        # Synthesize mel spectrograms with the cloned-voice embedding, then vocode to a waveform
        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate


def greet(apikey, upload, audio, choice1, voice=None):
    openai.api_key = apikey

    # Load the recorded audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio to text
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    res.append(result.text)

    # Pick the persona's message history
    if choice1 == "TOEFL":
        messages = mes1
    elif choice1 == "Therapist":
        messages = mes2
    elif choice1 == "Alice":
        messages = mes3

    # Send the latest transcript to ChatGPT
    content = res[-1]
    messages.append({"role": "user", "content": content})
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    chat_response = completion.choices[0].message.content
    messages.append({"role": "assistant", "content": chat_response})

    # On the first call the gr.State value arrives as an empty list, so build the cloned voice now
    if not isinstance(voice, Mandarin):
        voice = Mandarin()
        voice.setVoice(upload)
        voice.say("加载成功")  # warm-up utterance ("loaded successfully")

    # Synthesize the reply in the cloned voice and write it to a randomly named wav file
    wav, sample_rate = voice.say(chat_response)
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    # Restore the synthesized audio with VoiceFixer
    voicefixer.restore(input=output_file,    # input wav file path
                       output="audio1.wav",  # output wav file path
                       cuda=True,            # whether to use GPU acceleration
                       mode=0)               # try mode 0, 1, or 2 to find the best result

    # Enhance the restored audio with MetricGAN+ and save the final 16 kHz wav
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return [result.text, chat_response, "enhanced.wav", voice]


def main():
    gr.Interface(
        fn=greet,
        inputs=[
            gr.Textbox(lines=1, label="请填写您的OpenAI-API-key"),                            # "Please enter your OpenAI API key"
            gr.Audio(source="upload", label="请上传您喜欢的声音(wav文件)", type="filepath"),    # "Please upload a voice you like (wav file)"
            gr.Audio(source="microphone", label="和您的专属AI聊天吧!", type="filepath"),        # "Chat with your personal AI!"
            gr.Radio(["TOEFL", "Therapist", "Alice"], label="TOEFL Examiner, Therapist Tina, or Assistant Alice?"),
            gr.State([]),  # carries the Mandarin voice object between calls
        ],
        outputs=[
            gr.Textbox(label="Speech to Text"),
            gr.Textbox(label="ChatGPT Output"),
            gr.Audio(label="Audio with Custom Voice"),
            gr.State([]),
        ],
    ).launch()


if __name__ == "__main__":
    main()