import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text


import soundfile
import librosa.display
import matplotlib.pyplot as plt
import gradio as gr


# Demo configuration.
# `lang` is informational only: nothing in the visible code reads it.
lang = 'multilingual'
# `inference` compares the sample rate of each uploaded file against this value.
fs = 16000
# Model identifier handed to ModelDownloader.download_and_unpack below.
tag = 'ftshijt/open_li52_asr_train_asr_raw_bpe7000_valid.acc.ave_10best'

d = ModelDownloader()

# Decoding options passed to Speech2Text alongside whatever keyword
# arguments download_and_unpack() returns for the chosen model.
_decode_options = dict(
    device="cpu",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1,
)

# Built once at import time and reused by every `inference` call.
speech2text = Speech2Text(**d.download_and_unpack(tag), **_decode_options)

def text_normalizer(text):
    """Return *text* upper-cased with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    every other character (letters, digits, whitespace, non-ASCII symbols)
    is kept as produced by ``str.upper``.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    uppercased = text.upper()
    return uppercased.translate(drop_punctuation)

def inference(audio):
    """Transcribe an uploaded audio file and return the normalized hypothesis.

    The file is read with ``soundfile``. Multi-channel audio is averaged
    down to a single channel and audio whose sample rate differs from the
    module-level ``fs`` is resampled to ``fs`` before decoding, so uploads
    are no longer rejected just because they were not recorded at ``fs``.

    Args:
        audio: Object supplied by the Gradio ``Audio`` input
            (``type="file"``); only its ``name`` attribute, a path to the
            audio file, is used.

    Returns:
        The string ``"ASR hypothesis: <TEXT>"`` where ``<TEXT>`` is the
        first entry of the n-best list after ``text_normalizer``.
    """
    speech, rate = soundfile.read(audio.name)

    # soundfile returns a 2-D (frames, channels) array for multi-channel
    # files; average the channels so a single waveform is decoded.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    # Previously an `assert` guarded the sample rate. Asserts vanish under
    # `python -O` (wrong-rate audio would then be decoded silently) and
    # otherwise crash the demo on ordinary uploads, so resample instead.
    if rate != fs:
        # Local import keeps this block self-contained; librosa is already
        # a dependency of this file.
        import librosa

        speech = librosa.resample(speech, orig_sr=rate, target_sr=fs)

    nbests = speech2text(speech)
    # Each n-best entry starts with the hypothesis text; the remaining
    # fields are not needed here.
    text, *_ = nbests[0]
    return f"ASR hypothesis: {text_normalizer(text)}"
  
# Gradio components: an audio upload delivered to `inference` as a file
# object, and a text box that shows the string `inference` returns.
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")

# Text shown on the demo page.
title = "ESPnet2-ASR"
description = (
    "Gradio demo for Real-time ASR with ESPnet2. "
    "To use it, simply upload your audio, or click one of the examples to load them. "
    "Read more at the links below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://espnet.github.io/espnet/'>ESPnet: end-to-end speech processing toolkit</a>"
    " | "
    "<a href='https://github.com/espnet/espnet'>Github Repo</a>"
    "</p>"
)

# One example entry; "poem.wav" is a relative path, presumably expected to
# sit in the working directory the demo is started from -- TODO confirm.
examples = [["poem.wav"]]

# Build the interface and start serving it immediately.
gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()