import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Config: load the fine-tuned Japanese (hiragana) XLSR model and its processor.
model_name = "vumichien/wav2vec2-large-xlsr-japanese-hỉragana"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


def process_audio_file(file):
    """Load an audio file, resample it to 16 kHz, and wrap it for the model."""
    data, sr = librosa.load(file)
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000).squeeze()
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


def transcribe(file_mic, file_upload):
    warn_output = ""
    if (file_mic is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the "
            "uploaded audio will be discarded.\n"
        )
        file = file_mic
    elif (file_mic is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif file_mic is not None:
        file = file_mic
    else:
        file = file_upload

    inputs = process_audio_file(file)
    # Run inference on CPU; the model was never moved to CUDA, so moving only the
    # attention mask there (as the original did) would raise a device mismatch.
    with torch.no_grad():
        output_logit = model(
            inputs.input_values, attention_mask=inputs.attention_mask
        ).logits
    pred_ids = torch.argmax(output_logit, dim=-1)
    # batch_decode returns a list of strings; take the first (only) transcription.
    return warn_output + processor.batch_decode(pred_ids)[0]


# Note: this uses the classic (pre-3.x) Gradio API; gr.inputs.Audio, optional=,
# layout=, enable_queue= and allow_flagging=False were renamed or removed in
# later Gradio releases.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Transcribe Japanese audio to Hiragana",
    description="A simple interface to transcribe from spoken Japanese to Hiragana.",
    article="Click to learn more about XLS-R-1B-EN-15 | With 🎙️ from Facebook XLS-R",
    enable_queue=True,
    allow_flagging=False,
)

iface.launch()