import nltk
import librosa
import torch
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

nltk.download("punkt")

# Loading the pre-trained model and the processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


# Loading the speech input as a mono waveform at the 16 kHz
# sampling rate the model expects
def load_data(input_file):
    speech, sample_rate = librosa.load(input_file, sr=16000, mono=True)
    return speech


# Correcting the letter casing by capitalizing the first letter
# of each sentence
def correct_casing(input_sentence):
    sentences = nltk.sent_tokenize(input_sentence)
    return " ".join(s[:1].upper() + s[1:] for s in sentences)


# Getting a transcript of the audio input
def asr_transcript(input_file):
    speech = load_data(input_file)
    # Extract input features from the raw waveform
    input_values = processor(
        speech, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Run the model without tracking gradients (inference only)
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decoding: take the most likely token at each time step
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return correct_casing(transcription.lower())


# Creating a UI for the model using gr.Interface
gr.Interface(
    fn=asr_transcript,
    inputs=gr.Audio(type="filepath", label="Speaker"),
    outputs=gr.Textbox(label="Output Text"),
    title="ASR using Wav2Vec 2.0",
    description="This application displays transcribed text for given audio input",
    examples=[["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]],
).launch()
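
# A minimal sketch of exercising the transcription function directly, without
# the web UI. It assumes one of the example files listed above (e.g.
# Test_File1.wav) exists locally; run it in place of the launch() call:
#
#     print(asr_transcript("Test_File1.wav"))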