# -*- coding: utf-8 -*-

"""
@Author     : Rong Ye
@Time       : May 2022
@Contact    : yerong@bytedance
@Description:
"""

import os
import traceback
import shutil
import yaml
import re
from pydub import AudioSegment
import gradio as gr
from huggingface_hub import snapshot_download


LANGUAGE_CODES = {
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Italian": "it",
    "Netherlands": "nl",
    "Portuguese": "pt",
    "Romanian": "ro",
    "Russian": "ru",
}

LANG_GEN_SETUPS = {
    "de": {"beam": 10, "lenpen": 0.7},
    "es": {"beam": 10, "lenpen": 0.1},
    "fr": {"beam": 10, "lenpen": 1.0},
    "it": {"beam": 10, "lenpen": 0.5},
    "nl": {"beam": 10, "lenpen": 0.4},
    "pt": {"beam": 10, "lenpen": 0.9},
    "ro": {"beam": 10, "lenpen": 1.0},
    "ru": {"beam": 10, "lenpen": 0.3},
}

os.system("git clone https://github.com/ReneeYe/ConST")
os.system("mv ConST ConST_git")
os.system('mv -n ConST_git/* ./')
os.system("rm -rf ConST_git")
os.system("pip3 install --editable ./")
os.system("mkdir -p data checkpoint")


huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
print(huggingface_model_dir)


def convert_audio_to_16k_wav(audio_input):
    sound = AudioSegment.from_file(audio_input)
    sample_rate = sound.frame_rate
    num_channels = sound.channels
    num_frames = int(sound.frame_count())
    filename = audio_input.split("/")[-1]
    print("original file is at:", audio_input)
    if (num_channels > 1) or (sample_rate != 16000): # convert to mono-channel 16k wav
        if num_channels > 1:
            sound = sound.set_channels(1)
        if sample_rate != 16000:
            sound = sound.set_frame_rate(16000)
        num_frames = int(sound.frame_count())
        filename = filename.replace(".wav", "") + "_16k.wav"
        sound.export(f"data/{filename}", format="wav")
    else:
        shutil.copy(audio_input, f'data/{filename}')
    return filename, num_frames


def prepare_tsv(file_name, n_frame, language, task="ST"):
    tgt_lang = LANGUAGE_CODES[language]
    with open("data/test_case.tsv", "w") as f:
        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")


def get_vocab_and_yaml(language):
    tgt_lang = LANGUAGE_CODES[language]
    # get: spm_ende.model and spm_ende.txt, and save to data/xxx
    # if exist, no need to download
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")

    # write yaml file
    abs_path = os.popen("pwd").read().strip()
    yaml_dict = LANG_GEN_SETUPS[tgt_lang]
    yaml_dict["input_channels"] = 1
    yaml_dict["use_audio_input"] = True
    yaml_dict["prepend_tgt_lang_tag"] = True
    yaml_dict["prepend_src_lang_tag"] = True
    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
    yaml_dict["bpe_tokenizer"] = {"bpe": "sentencepiece",
                                  "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model")}
    with open("data/config.yaml", "w") as f:
        yaml.dump(yaml_dict, f)


def get_model(language):
    # download models to checkpoint/xxx
    return os.path.join(huggingface_model_dir, f"models/const_en{LANGUAGE_CODES[language]}.pt")


def generate(model_path):
    os.system(f"python3 fairseq_cli/generate.py data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
                --max-tokens 4000000 --max-source-positions 4000000 \
                --config-yaml config.yaml  --path {model_path} | tee temp.txt")
    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
    return output.read().strip()


def post_processing(raw_sentence):
    output_sentence = raw_sentence
    if ":" in raw_sentence:
        splited_sent = raw_sentence.split(":")
        if len(splited_sent) == 2:
            prefix = splited_sent[0].strip()
            if len(prefix) <= 3:
                output_sentence = splited_sent[1].strip()
            elif ("(" in prefix) and (")" in prefix):
                bgm = re.findall(r"\(.*?\)", prefix)[0]
                if len(prefix.replace(bgm, "").strip()) <= 3:
                    output_sentence = splited_sent[1].strip()
                elif len(splited_sent[1].strip()) > 8:
                    output_sentence = splited_sent[1].strip()

    elif ("(" in raw_sentence) and (")" in raw_sentence):
        bgm_list = re.findall(r"\(.*?\)", raw_sentence)
        for bgm in bgm_list:
            if len(raw_sentence.replace(bgm, "").strip()) > 5:
                output_sentence = output_sentence.replace(bgm, "").strip()
        if len(output_sentence) <= 5:
            output_sentence = raw_sentence
    return output_sentence


def remove_temp_files(audio_file):
    os.remove("temp.txt")
    os.remove("data/test_case.tsv")
    os.remove(f"data/{audio_file}")


def run(audio_file, language):
    try:
        converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
        prepare_tsv(converted_audio_file, n_frame, language)
        get_vocab_and_yaml(language)
        model_path = get_model(language)
        generated_output = post_processing(generate(model_path))
        remove_temp_files(converted_audio_file)
        return generated_output
    except:
        traceback.print_exc()
        return error_output(language)


def error_output(language):
    return f"Fail to translate the audio into {language}, you may use the examples I provide."


inputs = [
        gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."),
        gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="German", label="From English to Languages X..."),
    ]

iface = gr.Interface(
    fn=run,
    inputs=inputs,
    outputs=[gr.outputs.Textbox(label="The translation")],
    examples=[['short-case.wav', "German"], ['long-case.wav', "German"]],
    title="ConST: an end-to-end speech translator",
    description='ConST is an end-to-end speech-to-text translation model, whose algorithm corresponds to the '
                'NAACL 2022 paper *"Cross-modal Contrastive Learning for Speech Translation"* (see the paper at https://arxiv.org/abs/2205.02444 for more details). '
                'This is a live demo for ConST, to translate English into eight European languages. \n'
                'p.s. For better experience, we recommend using **Chrome** to record audio.',
    article="- The motivation of the ConST model is to use the contrastive learning method to learn similar representations for semantically similar speech and text, " \
            "thus leveraging MT to help improve ST performance. \n"
            "- The models you are experiencing are trained based on the MuST-C dataset (https://ict.fbk.eu/must-c/), " \
            "which only contains about 250k parallel data at each translation direction. "
            "The translation performance of these language directions varies from 20-30+ BLEU, "
            "so it is normal to find some flaws in the translation, and we are trying to improve the models, "
            "such as training on larger datasets and developing more advanced algorithms.\n"
            "- If you want to know how to train the models, you may refer to https://github.com/ReneeYe/ConST.",
    theme="peach",
)
iface.launch()