import gradio as gr
from huggingsound import SpeechRecognitionModel
from transformers import logging
from transformers import pipeline
from transformers import BertTokenizer, BertModel
from pydub import AudioSegment
# Module-level BERT resources, created once at import time.
# NOTE(review): in the code visible here, `unmasker` and `tokenizer` are only
# referenced from the commented-out post-processing inside `everything`, and
# `model` is not referenced at all — confirm whether they are still needed
# before paying their load cost on startup.

# Fill-mask pipeline: proposes candidate tokens for a "[MASK]" placeholder.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
# Tokenizer matching the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Bare BERT encoder loaded from the same checkpoint.
model = BertModel.from_pretrained("bert-base-uncased")
import os


def levenshtein_distance(s, t):
    """Return the Levenshtein edit distance between sequences *s* and *t*.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn *s* into *t*.

    Args:
        s: First sequence (typically a string).
        t: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # Distances from the empty prefix of s to every prefix of t:
    # reaching t[:j] from "" takes exactly j insertions.
    previous = list(range(len(t) + 1))

    for row, s_item in enumerate(s, start=1):
        # Reaching the empty prefix of t from s[:row] takes row deletions.
        current = [row]
        for col, t_item in enumerate(t, start=1):
            if s_item == t_item:
                # Matching elements cost nothing: carry the diagonal value.
                cost = previous[col - 1]
            else:
                cost = 1 + min(
                    previous[col],       # deletion from s
                    current[col - 1],    # insertion into s
                    previous[col - 1],   # substitution
                )
            current.append(cost)
        previous = current

    return previous[-1]

def collate(input):
    """Join word and punctuation tokens into one readable string.

    Words are separated by single spaces, punctuation is attached directly
    to the preceding text, and the first word of the output as well as the
    first word after "." or "(" is capitalized. A word that directly
    follows "-" or "'" is joined without a space (e.g. ``don ' t`` becomes
    ``don't``); only that one word is joined, later words are spaced
    normally again.

    Args:
        input: Iterable of string tokens; each token is either a word or a
            single punctuation mark. (The name shadows the builtin but is
            kept so existing keyword callers keep working.)

    Returns:
        The assembled string, with no leading space. Empty input gives "".
    """
    punctuation = {",", ".", "?", "!", ";", ":", "-", "—", "(", ")",
                   "[", "]", "{", "}", "'", "\"", "`"}
    capitalize_after = {".", "("}
    glue_after = {"-", "'"}

    pieces = []
    capitalize_next = True
    glue_next = False
    for token in input:
        if token in punctuation:
            pieces.append(token)
            if token in capitalize_after:
                capitalize_next = True
            glue_next = token in glue_after
            continue

        # Separate words with a space, except at the very start of the
        # output and directly after a hyphen or apostrophe.
        if pieces and not glue_next:
            pieces.append(" ")
        if capitalize_next:
            pieces.append(token.capitalize())
            capitalize_next = False
        else:
            pieces.append(token)
        # The hyphen/apostrophe only binds the word immediately after it.
        glue_next = False

    return "".join(pieces)

# Lazily-created wav2vec2 recognizer, shared by every call to everything().
_W2V_MODEL = None


def _get_w2v_model():
    """Return the shared wav2vec2 recognizer, creating it on first use."""
    global _W2V_MODEL
    if _W2V_MODEL is None:
        _W2V_MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    return _W2V_MODEL


def _as_path_list(audio_paths):
    """Normalize an upload value into a list of file paths."""
    if audio_paths is None:
        return []
    # A lone path or file object is wrapped so that it is treated as one
    # file instead of being iterated element by element.
    if isinstance(audio_paths, (str, os.PathLike)) or hasattr(audio_paths, "name"):
        audio_paths = [audio_paths]

    paths = []
    for item in audio_paths:
        if isinstance(item, (str, os.PathLike)):
            paths.append(os.fspath(item))
        else:
            # Temp-file style objects carry their on-disk location in
            # `.name`; anything else is passed through untouched.
            paths.append(getattr(item, "name", item))
    return paths


def everything(audio_paths):
    """Transcribe the given audio file(s) with the wav2vec2 English model.

    Args:
        audio_paths: A list of audio file paths. A single path, a single
            file object exposing ``.name``, or a list of such objects is
            also accepted and converted to a list of paths first.

    Returns:
        Whatever ``SpeechRecognitionModel.transcribe`` returns for the
        paths (the commented-out code below this function reads
        ``transcriptions[0]["transcription"]`` from it). An empty list is
        returned, without loading the model, when no file was provided.
    """
    paths = _as_path_list(audio_paths)
    if not paths:
        return []

    # The model is created once and reused; constructing it per request
    # would repeat the expensive load on every call.
    w2vmodel = _get_w2v_model()
    logging.set_verbosity_error()  # change 'error' to 'warning', or remove this line, to see transformers warnings
    # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
    # https://huggingface.co/bert-base-uncased

    transcriptions = w2vmodel.transcribe(paths)

    return transcriptions
    # input = transcriptions[0]["transcription"]
    # input = input.split()

    #     #(1) is a strategy where tokens are used to determine lexicographic distance
    #     #(2) is a strategy where replaced words 
    # for t in range(1):
    #     # output = [] #(2)
    #     for i in range(len(input)):
    #         temp = input[i]
    #         token = tokenizer(temp)['input_ids'][1]
    #         input[i] = "[MASK]"
    # apiint = unmasker(' '.join(input))
    # dist = []
    # for r in range(5):
    #     # if (np.abs((apiint[r]['token'] - token)) < 2): #(1)
    #     dist.append(levenshtein_distance(temp, apiint[r]['token_str']))
    # lindex = 0
    # l = dist[0]
    # for r in range(5):
    #     if dist[r] < l:
    #         lindex = r

    #         l = dist[r]
    # if l <= 2:
    #     input[i] = apiint[lindex]['token_str']
    #     # output.append(apiint[lindex]['token_str']) #(2)
    # else:
    #     input[i] = temp
    #     # output.append(temp) #(2)
    # # input[i] = temp #(2)

    # for t in range(1):
    #     inndex = 1
    #     for i in range(len(input)):
    #         input.insert(inndex, "[MASK]")
    #         # print(' '.join(input))
    #         apiint = unmasker(' '.join(input))
    #         if (apiint[0]['token'] < 1500):
    #             input[inndex] = apiint[0]["token_str"]
    #             inndex += 2
    #         else:
    #             del input[inndex]
    #             inndex += 1

    # st.write(collate(input))

    # # In comparison, a plain autocorrect gives this output:

    # # "The b-movie by Jerry Sinclair, the sound of buzzing 
    # # bees, can be heard according to all known laws of 
    # # aviation that is no way for b to be able to fly its 
    # # wings are too small to get its start little body off 
    # # the ground, the be, of course, flies anyway because ``
    # # bees don't care what humans think is possible. 
    # # Barbuda is guaranteed one member of the House of 
    # # Representatives and two members of the Senate."

    # # - https://huggingface.co/oliverguhr/spelling-correction-english-base?text=lets+do+a+comparsion

# Web UI: one file-upload input wired to `everything`, shown as plain text.
# The upload button is passed as an instance, not the bare class, because
# `gr.Interface` takes component instances or string shortcuts.
demo = gr.Interface(
    fn=everything,
    inputs=[gr.UploadButton()],
    outputs=["text"],
)