import gradio as gr
from src.inference import Wav2Vec2Inference
import librosa
import os, sys
import soundfile

# Identifier of the pretrained model handed to the inference wrapper
# (presumably a Hugging Face Hub repo id -- confirm in src.inference).
model_name = "arifagustyawan/wav2vec2-large-xlsr-53-id"
# Built once at import time and shared by both transcription handlers below.
# NOTE(review): construction likely loads model weights, so importing this
# module may be slow -- confirm in src.inference.
asr = Wav2Vec2Inference(model_name)

def convert(inputfile, outfile, target_sr=16000):
    """Resample an audio file and write the result to a new file.

    Args:
        inputfile: Path of the audio file to read (passed to ``librosa.load``).
        outfile: Path the resampled audio is written to with
            ``soundfile.write``.
        target_sr: Sample rate, in Hz, of the written file. Defaults to
            16000, the rate every caller in this module relies on.
    """
    # Ask librosa to decode straight to the target rate. Loading without
    # ``sr`` first resamples to librosa's own default rate and then needs a
    # second resample, which costs time and degrades the signal.
    data, _ = librosa.load(inputfile, sr=target_sr)
    soundfile.write(outfile, data, target_sr)

def parse_transcription_record(wav_file):
    """Transcribe a microphone recording.

    The recording is resampled into a sibling file named
    ``<original stem>16k.wav`` and that file is passed to the shared ``asr``
    object.

    Args:
        wav_file: Filesystem path of the recorded audio, or ``None`` when
            nothing was recorded.

    Returns:
        The ``(transcription, confidence)`` pair produced by
        ``asr.file_to_text``.

    Raises:
        gr.Error: If ``wav_file`` is ``None``.
    """
    if wav_file is None:
        # Surface a readable message instead of an AttributeError on None.
        raise gr.Error("No audio was recorded.")

    # splitext only strips the final extension; splitting on the first '.'
    # would truncate paths such as "./rec.wav" or "/tmp/a.b/rec.wav".
    stem, _ = os.path.splitext(wav_file)
    resampled_path = stem + "16k.wav"

    convert(wav_file, resampled_path)
    transcription, confidence = asr.file_to_text(resampled_path)
    return transcription, confidence

def parse_transcription_file(wav_file):
    """Transcribe an uploaded audio file.

    The upload is resampled into a sibling file named
    ``<original stem>16k.wav`` and that file is passed to the shared ``asr``
    object.

    Args:
        wav_file: The uploaded file. Either an object exposing the path as
            ``.name`` (what this handler originally required) or a plain
            path string; ``None`` when nothing was uploaded.

    Returns:
        The ``(transcription, confidence)`` pair produced by
        ``asr.file_to_text``.

    Raises:
        gr.Error: If ``wav_file`` is ``None``.
    """
    if wav_file is None:
        # Surface a readable message instead of an AttributeError on None.
        raise gr.Error("No file was uploaded.")

    # Fall back to the value itself when it has no ``.name`` attribute, so a
    # bare path string works as well as a file-like wrapper.
    source_path = getattr(wav_file, "name", wav_file)

    # splitext only strips the final extension; splitting on the first '.'
    # would truncate paths that contain dots in a directory name.
    stem, _ = os.path.splitext(source_path)
    resampled_path = stem + "16k.wav"

    convert(source_path, resampled_path)
    transcription, confidence = asr.file_to_text(resampled_path)
    return transcription, confidence

# Sample inputs offered in the upload tab: one row per example, one entry per
# input component.
examples = [[os.path.join("assets", "halo.wav")]]
# Tab 1: record from the microphone and transcribe the recording.
record_audio = gr.Interface(
    fn=parse_transcription_record,
    # type="filepath" hands the handler a path string rather than raw samples.
    inputs=gr.Audio(
        sources="microphone",
        type="filepath",
        label="Click button to record audio",
    ),
    # One textbox per element of the handler's (transcription, confidence) pair.
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Confidence"),
    ],
    analytics_enabled=False,
    allow_flagging="never",
    title="Automatic Speech Recognition",
    description="Click the button below to record audio!",
)

# Tab 2: upload an existing audio file and transcribe it.
upload_file = gr.Interface(
    fn=parse_transcription_file,
    inputs=gr.File(type="filepath", label="Upload file here"),
    # One textbox per element of the handler's (transcription, confidence) pair.
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Confidence"),
    ],
    title="Automatic Speech Recognition",
    description="Upload or drag and drop the audio file here!",
    examples=examples,
    allow_flagging="never",
    analytics_enabled=False,
)


# Present the two interfaces as tabs of a single app; labels pair up with the
# interfaces by position.
demo = gr.TabbedInterface(
    interface_list=[record_audio, upload_file],
    tab_names=["Record Audio", "Upload Audio"],
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()