import os
import sys
import time
import json
import threading
from subprocess import Popen, PIPE

import requests
from huggingface_hub import hf_hub_download
import gradio as gr

hf_model_name = "Pendrokar/xvapitch_nvidia"
hf_cache_models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'
models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'

# Link the cached TorchMoji snapshot into the DeepMoji plugin's model directory
try:
    os.symlink(
        '/home/user/.cache/huggingface/hub/models--Pendrokar--TorchMoji/snapshots/58217568daaf64d3621245dd5c88c94e651a08d6',
        '/home/user/app/resources/app/plugins/deepmoji_plugings/model',
        target_is_directory=True
    )
except OSError:
    print('Failed to create symlink to DeepMoji model; it may already exist.')

voice_models = [
    ("Male #6671", "ccby_nvidia_hifi_6671_M"),
    ("Male #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
    ("Female #92", "ccby_nvidia_hifi_92_F"),
    ("Female #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #12787", "ccby_nvidia_hifi_12787_F"),
    ("Female #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]
current_voice_model = None
base_speaker_emb = ''

# Order ranked by similarity to English, since xVASynth uses ARPAbet instead of IPA
languages = [
    ("🇬🇧 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇹 IT", "it"),
    ("🇳🇱 NL", "nl"),
    ("🇵🇹 PT", "pt"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UK", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("HA", "ha"),
    ("SW", "sw"),
    ("🇳🇬 YO", "yo"),
    ("WO", "wo"),
]

# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
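# Overview of the helpers below (all of them talk to the locally spawned xVASynth HTTP server):
#   run_xvaserver() - launches resources/app/server.py as a subprocess and probes it on port 8008
#   load_model()    - POSTs to /loadModel and returns the voice's base speaker embedding
#   predict()       - POSTs to /synthesize and returns the rendered WAV path plus the server response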
def run_xvaserver():
    # Start the xVASynth server subprocess without waiting for a response
    print('Running xVAServer subprocess...\n')
    xvaserver = Popen(
        ['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'],
        stdout=PIPE,
        stderr=PIPE,
        cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/'
    )

    # Wait for a moment to ensure the server starts up
    time.sleep(10)

    # Check if the server is still running
    if xvaserver.poll() is not None:
        print("Web server failed to start.")
        sys.exit(0)

    # Contact the local xVASynth server
    print('Attempting to connect to xVASynth...')
    try:
        response = requests.get('http://0.0.0.0:8008')
        # If the response contains an HTTP error status code, raise an exception
        response.raise_for_status()
    except requests.exceptions.RequestException:
        print('Failed to connect!')
        return

    print('xVAServer running on port 8008')

    # Load the default model
    load_model("ccby_nvidia_hifi_6671_M")

    # Wait for the process to exit
    xvaserver.wait()


def load_model(voice_model_name):
    global current_voice_model

    model_path = models_path + voice_model_name
    model_type = 'xVAPitch'
    language = 'en'

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': model_type,
        'base_lang': language,
        'pluginsContext': '{}',
    }

    embs = base_speaker_emb
    try:
        response = requests.post('http://0.0.0.0:8008/loadModel', json=data, timeout=60)
        # If the response contains an HTTP error status code, raise an exception
        response.raise_for_status()

        current_voice_model = voice_model_name

        # Read the base speaker embedding from the voice model's metadata JSON
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        embs = voice_model_json['games'][0]['base_speaker_emb']
    except requests.exceptions.RequestException:
        print('Failed to load voice model!')

    return embs


def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    use_deepmoji
):
    global base_speaker_emb

    # Grab only the first 1000 characters
    input_text = input_text[:1000]

    # Load the voice model if it is not the currently loaded one
    if current_voice_model != voice:
        base_speaker_emb = load_model(voice)

    model_type = 'xVAPitch'
    pace = pacing if pacing else 1.0
    save_path = '/tmp/xvapitch_audio_sample.wav'
    language = lang
    use_sr = 0
    use_cleanup = 0

    pluginsContext = {}
    pluginsContext["mantella_settings"] = {
        "emAngry": (anger if anger > 0 else 0),
        "emHappy": (happy if happy > 0 else 0),
        "emSad": (sad if sad > 0 else 0),
        "emSurprise": (surprise if surprise > 0 else 0),
        "run_model": use_deepmoji
    }

    data = {
        'pluginsContext': json.dumps(pluginsContext),
        'modelType': model_type,
        # pad with whitespace as a workaround to avoid cutoffs
        'sequence': input_text.center(len(input_text) + 2, ' '),
        'pace': pace,
        'outfile': save_path,
        'vocoder': 'n/a',
        'base_lang': language,
        'base_emb': base_speaker_emb,
        'useSR': use_sr,
        'useCleanup': use_cleanup,
    }

    try:
        response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
        # If the response contains an HTTP error status code, raise an exception
        response.raise_for_status()
        # response_data = json.loads(response.text)
    except requests.exceptions.RequestException as err:
        print('Failed to synthesize!')
        print('server.log contents:')
        with open('resources/app/server.log', 'r') as f:
            print(f.read())
        return ['', err]

    print('server.log contents:')
    with open('resources/app/server.log', 'r') as f:
        print(f.read())

    return [save_path, response.text]


input_textbox = gr.Textbox(
    label="Input Text",
    value="This is what my voice sounds like.",
    info="Also accepts ARPAbet symbols placed within {} brackets.",
    lines=1,
    max_lines=5,
    autofocus=True
)
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness when either is beyond 0.3")
voice_radio = gr.Radio(
    voice_models,
    value="ccby_nvidia_hifi_6671_M",
    label="Voice",
    info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
)
# Currently unused: the language_radio.change() binding below is commented out
def set_default_text(lang):
    input_textbox = gr.Textbox(
        label="Input Text",
        value=default_text[lang],
        lines=1,
        max_lines=5,
        autofocus=True
    )

language_radio = gr.Radio(
    languages,
    value="en",
    label="Language",
    info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
)
# language_radio.change(set_default_text)

deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values")

gradio_app = gr.Interface(
    predict,
    [
        input_textbox,
        voice_radio,
        language_radio,
        pacing_slider,
        pitch_slider,
        energy_slider,
        anger_slider,
        happy_slider,
        sad_slider,
        surprise_slider,
        deepmoji_checkbox
    ],
    outputs=[
        gr.Audio(label="22kHz audio output", type="filepath"),
        gr.Textbox(label="xVASynth Server Response")
    ],
    title="xVASynth (WIP)",
    clear_btn=gr.Button(visible=False)
    # examples=[
    #     ["Once, I headed in much deeper. But I doubt I'll ever do that again.", 1],
    #     ["You love hurting me, huh?", 1.5],
    #     ["Ah, I see. Well, I'm afraid I can't help with that.", 1],
    #     ["Embrace your demise!", 1],
    #     ["Never come back!", 1]
    # ],
    # cache_examples=None
)

if __name__ == "__main__":
    # Run the web server in a separate thread
    web_server_thread = threading.Thread(target=run_xvaserver)
    print('Starting xVAServer thread')
    web_server_thread.start()

    print('running Gradio interface')
    gradio_app.launch()

    # Wait for the web server thread to finish (shouldn't be reached in normal execution)
    web_server_thread.join()
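# Usage note (assumption: this file is saved as app.py, with the xVASynth
# resources/app directory alongside it as the paths above expect):
#   python app.py
# This starts the xVASynth server thread first, then serves the Gradio UI.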