"""Gradio demo: strip filler words from a transcript and restore punctuation."""

from functools import lru_cache
import re

from deepmultilingualpunctuation import PunctuationModel
import gradio as gr

# Filler words dropped from the transcript before punctuation is restored.
# Matching is case-insensitive and on whole whitespace-separated words only.
_FILLER_WORDS = frozenset({"um", "uh", "hmm", "ha", "er", "ah"})

# Patterns used for the final capitalisation pass.
_PRONOUN_I = re.compile(r"\bi\b")
_AFTER_SENTENCE_END = re.compile(r"(?<=[.?!;])\s*\w")
_TEXT_START = re.compile(r"^\w")


# https://stackoverflow.com/questions/22800401/how-to-capitalize-the-first-letter-of-every-sentence
def cap(match):
    """Return the text matched by *match* with ``str.capitalize`` applied."""
    return match.group().capitalize()


@lru_cache(maxsize=1)
def _get_model():
    """Build the punctuation model on first use and reuse it afterwards."""
    return PunctuationModel()


def _clean_lines(transcript):
    """Split *transcript* into lines of words with the filler words removed.

    Returns a list with one entry per input line that still contains at
    least one word; each entry is the list of that line's remaining words.
    Blank lines and lines made up only of filler words are dropped.
    """
    cleaned = []
    for line in transcript.splitlines():
        words = [w for w in line.split() if w.lower() not in _FILLER_WORDS]
        if words:
            cleaned.append(words)
    return cleaned


def _upper(match):
    """Return the matched text upper-cased."""
    return match.group().upper()


def _capitalize(text):
    """Upper-case the pronoun "i", sentence starts and the first character."""
    text = _PRONOUN_I.sub("I", text)
    text = _AFTER_SENTENCE_END.sub(_upper, text)
    return _TEXT_START.sub(_upper, text)


def predict(brakes, transcript):
    """Remove filler words from *transcript* and restore its punctuation.

    Args:
        brakes: Line-break mode. If it contains ``"timelines"`` the output
            keeps the line structure of the input (one output line per
            non-empty input line). Otherwise, if it contains ``"sentences"``
            a line break is inserted after every ``". "``. Any other value,
            including ``None``, yields a single line of text.
        transcript: The raw transcript text.

    Returns:
        The punctuated and capitalised text as a string. In ``"timelines"``
        mode, if the model returns a different number of words than it was
        given, an ``"AssertError: ..."`` message string is returned instead.
    """
    brakes = brakes or ""  # no radio option selected arrives as None
    words_per_line = _clean_lines(transcript or "")

    # Nothing left to punctuate: skip the model call entirely.
    if not words_per_line:
        return ""

    input_text = " ".join(word for line in words_per_line for word in line)
    output_text = _get_model().restore_punctuation(input_text)

    # If no line-break mode applies, the text is returned as a single line.
    result = output_text

    if "timelines" in brakes:
        # Goal: restore the break points, i.e. the same lines as the input.
        # This matters because each line of an srt file corresponds to a
        # frame from the video.
        punctuated_words = output_text.split()
        expected = sum(len(line) for line in words_per_line)
        if len(punctuated_words) != expected:
            return (
                "AssertError: The length of the transcript and the "
                "punctuated file should be the same: "
                f"{expected} {len(punctuated_words)}"
            )
        # Hand the punctuated words back out line by line, using the word
        # count of each cleaned input line as the line length.
        rebuilt = []
        start = 0
        for line in words_per_line:
            end = start + len(line)
            rebuilt.append(" ".join(punctuated_words[start:end]))
            start = end
        result = "\n".join(rebuilt)
    elif "sentences" in brakes:
        result = ".\n".join(output_text.split(". "))

    return _capitalize(result)


def main():
    """Build the Gradio interface around ``predict`` and launch it."""
    title = "Deep Punkt App"
    description = """ Description:
Model restores punctuation and case i.e. of the following punctuations -- [! ? . , - : ; ' ] and also the upper-casing of words.
"""
    examples = [['sentences', "my name is clara i live in berkeley california"]]

    interface = gr.Interface(
        fn=predict,
        inputs=[gr.Radio(["sentences", "timelines"], label="brakes"), "text"],
        outputs=["text"],
        title=title,
        description=description,
        examples=examples,
        queue=True,
        allow_flagging="never",
    )
    interface.launch()


if __name__ == "__main__":
    main()