import functools
import os

import gradio as gr
from huggingsound import SpeechRecognitionModel
from pydub import AudioSegment
from transformers import BertModel, BertTokenizer
from transformers import logging
from transformers import pipeline

# BERT helpers used by the (currently disabled) correction experiment kept in
# the comments at the end of `everything`.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokens that `collate` appends verbatim instead of treating as words.
_PUNCTUATION_MARKS = frozenset([
    ",", ".", "?", "!", ";", ":", "-", "—", "(", ")",
    "[", "]", "{", "}", "'", "\"", "`",
])
# Marks after which the next word is capitalised.
_CAPITALISE_AFTER = frozenset([".", "("])
# Marks after which the next word is attached without a separating space.
_GLUE_AFTER = frozenset(["-", "'"])


def levenshtein_distance(s, t):
    """Return the edit distance between the sequences `s` and `t`.

    Insertions, deletions and substitutions each cost 1. The full
    (len(s) + 1) x (len(t) + 1) dynamic-programming table is built and the
    bottom-right cell is returned.
    """
    m, n = len(s), len(t)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    # Turning a prefix into the empty sequence costs one edit per element.
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for j in range(1, n + 1):
        for i in range(1, m + 1):
            if s[i - 1] == t[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
    return d[m][n]


def collate(input):
    """Join a sequence of word and punctuation tokens into one string.

    Punctuation tokens are appended as-is. Every word is preceded by a single
    space unless it directly follows a hyphen or an apostrophe, so the result
    starts with a space when the first token is a word. The first word, and
    the first word after "." or "(", is passed through `str.capitalize()`
    (which also lower-cases the rest of that word).

    Args:
        input: sequence of string tokens.

    Returns:
        The assembled text.
    """
    pieces = []
    capitalise_next = True
    glue_next = False
    for token in input:
        if token in _PUNCTUATION_MARKS:
            pieces.append(token)
            if token in _CAPITALISE_AFTER:
                capitalise_next = True
            glue_next = token in _GLUE_AFTER
            continue

        prefix = "" if glue_next else " "
        if capitalise_next:
            word = token.capitalize()
            capitalise_next = False
        else:
            word = token
        pieces.append(prefix + word)
        # Only the word right after "-" or "'" is attached; without this
        # reset every later word would also lose its leading space until the
        # next punctuation mark.
        glue_next = False
    return "".join(pieces)


@functools.lru_cache(maxsize=1)
def _load_speech_model():
    """Build the wav2vec2 recogniser once and reuse it for later calls."""
    return SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")


def everything(audio_paths):
    """Transcribe one or more audio files with the wav2vec2 English model.

    Args:
        audio_paths: a single file path, or a list of file paths. A single
            path is wrapped in a list before being handed to the model.

    Returns:
        Whatever `SpeechRecognitionModel.transcribe` returns for the paths
        (the disabled code below reads it as
        `transcriptions[0]["transcription"]`), or an empty list when no path
        was supplied.
    """
    if not audio_paths:
        return []
    if isinstance(audio_paths, (str, os.PathLike)):
        # A bare string would otherwise be iterated character by character.
        audio_paths = [audio_paths]

    w2vmodel = _load_speech_model()
    # Change 'error' to 'warning' (or remove this call) to see the warnings.
    logging.set_verbosity_error()
    # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
    # https://huggingface.co/bert-base-uncased
    transcriptions = w2vmodel.transcribe(audio_paths)
    return transcriptions

    # NOTE: disabled BERT-based correction experiment, kept for reference.
    # It sat after the `return` above and never ran; its indentation has been
    # reconstructed and may not match the author's original exactly.
    #
    # input = transcriptions[0]["transcription"]
    # input = input.split()
    # #(1) is a strategy where tokens are used to determine lexicographic distance
    # #(2) is a strategy where replaced words
    # for t in range(1):
    #     # output = [] #(2)
    #     for i in range(len(input)):
    #         temp = input[i]
    #         token = tokenizer(temp)['input_ids'][1]
    #         input[i] = "[MASK]"
    #         apiint = unmasker(' '.join(input))
    #         dist = []
    #         for r in range(5):
    #             # if (np.abs((apiint[r]['token'] - token)) < 2): #(1)
    #             dist.append(levenshtein_distance(temp, apiint[r]['token_str']))
    #         lindex = 0
    #         l = dist[0]
    #         for r in range(5):
    #             if dist[r] < l:
    #                 lindex = r
    #                 l = dist[r]
    #         if l <= 2:
    #             input[i] = apiint[lindex]['token_str']
    #             # output.append(apiint[lindex]['token_str']) #(2)
    #         else:
    #             input[i] = temp
    #             # output.append(temp) #(2)
    #     # input[i] = temp #(2)
    # for t in range(1):
    #     inndex = 1
    #     for i in range(len(input)):
    #         input.insert(inndex, "[MASK]")
    #         # print(' '.join(input))
    #         apiint = unmasker(' '.join(input))
    #         if (apiint[0]['token'] < 1500):
    #             input[inndex] = apiint[0]["token_str"]
    #             inndex += 2
    #         else:
    #             del input[inndex]
    #             inndex += 1
    # st.write(collate(input))
    # # In comparison, a plain autocorrect gives this output:
    # # "The b-movie by Jerry Sinclair, the sound of buzzing
    # # bees, can be heard according to all known laws of
    # # aviation that is no way for b to be able to fly its
    # # wings are too small to get its start little body off
    # # the ground, the be, of course, flies anyway because ``
    # # bees don't care what humans think is possible.
    # # Barbuda is guaranteed one member of the House of
    # # Representatives and two members of the Senate."
    # # - https://huggingface.co/oliverguhr/spelling-correction-english-base?text=lets+do+a+comparsion


# `gr.Interface` needs component instances (or string shortcuts), not the
# component class itself. An audio input with type="filepath" passes the
# uploaded file's path to `everything` as a string.
demo = gr.Interface(
    fn=everything,
    inputs=[gr.Audio(type="filepath")],
    outputs=["text"],
)