'''Librispeech 100h English ASR demo

@ML2 --> @HuggingFace

2022-02-23 jkang first created
2022-02-28 jkang char added (bpe, word, char now all included)
'''
import os
from difflib import Differ
from glob import glob

from loguru import logger
import librosa
import gradio as gr
from espnet2.bin.asr_inference import Speech2Text

# ---------- Settings ----------
GPU_ID = '-1'
os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
# SERVER_PORT = 42208
# SERVER_NAME = "0.0.0.0"

MODEL_DIR = './model'
EXAMPLE_DIR = './examples'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
# examples = [[example] for example in examples]
d = Differ()

# ---------- Logging ----------
logger.add('app.log', mode='a')
logger.info('============================= App restarted =============================')

# ---------- Model ----------
logger.info('downloading models')
model_word = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_word")
model_bpe = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer")
model_char = Speech2Text.from_pretrained("jkang/espnet2_librispeech_100_conformer_char")
logger.info('models loaded')


def predict(wav_file):
    logger.info('wav file loaded')
    # Load audio at 16 kHz (the sampling rate the models expect)
    speech, rate = librosa.load(wav_file, sr=16000)
    logger.info(f'--- speech.shape: {speech.shape}')
    logger.info(f'--- speech.dtype: {speech.dtype}')
    logger.info(f'--- speech rate: {rate}')

    # Run inference; each model returns n-best hypotheses, keep the best one
    W = model_word(speech)[0]
    B = model_bpe(speech)[0]
    C = model_char(speech)[0]
    word_decoded = W[0]
    bpe_decoded = B[0]
    char_decoded = C[0]
    # comparison = ''.join(list(d.compare([word_decoded + '\n'], [bpe_decoded + '\n'])))
    logger.info(f'--- word-based model decoded: {word_decoded}')
    logger.info(f'--- BPE-based model decoded: {bpe_decoded}')
    logger.info(f'--- char-based model decoded: {char_decoded}')
    logger.info('finished')
    # return word_decoded, bpe_decoded, comparison
    return bpe_decoded, char_decoded, word_decoded


iface = gr.Interface(
    predict,
    title='Comparison of word vs. BPE vs. char token units in ESPnet2 ASR models',
    description='Three models were trained on LibriSpeech (clean-100h)',
    inputs=[
        gr.inputs.Audio(label='wav file', source='upload', type='filepath')
    ],
    outputs=[
        gr.outputs.Textbox(label='Decoding result (BPE-based model)'),
        gr.outputs.Textbox(label='Decoding result (char-based model)'),
        gr.outputs.Textbox(label='Decoding result (word-based model)'),
        # gr.outputs.Textbox(label='Comparison'),
    ],
    examples=examples,
    examples_per_page=5,
    css='.examples-gallery {display: grid !important;}',
    # article='Model URL🤗',
)

if __name__ == '__main__':
    try:
        iface.launch(debug=True,
                     # server_name=SERVER_NAME,
                     # server_port=SERVER_PORT,
                     enable_queue=True,
                     )
    except KeyboardInterrupt as e:
        print(e)
    finally:
        iface.close()
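
# Local sanity check (a minimal sketch, not part of the app itself): assumes this
# file is saved as app.py and that at least one example wav exists under ./examples.
#
#   from app import predict, examples
#   bpe_text, char_text, word_text = predict(examples[0])
#   print(bpe_text, char_text, word_text, sep='\n')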