import os
import re

import gradio as gr
import docx2txt
import pdfplumber
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

import functions

nltk.download("punkt")

df = pd.read_excel('TESTS.xlsx', sheet_name=1)  # can also index the sheet by name or fetch all sheets
words = df.values.T[0].tolist()


def reading_word(string):
    text = docx2txt.process(string)
    return text


def reading_pdf(string):
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # drop characters with font size >= 10 (e.g. headings / large bold text) before extracting
            bold = pdf_page.filter(lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10))
            single_page_text = bold.extract_text(x_tolerance=2)
            # separate each page's text with a newline
            all_text = all_text + '\n' + single_page_text
    return all_text


def reading_file(file_obj):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the file that we want to analyze.
    Depending on the file type we use the corresponding Python library.
    For the moment we only detect PDF and Word documents.
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        file_obj: file uploaded through the Gradio interface
    """
    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        text = reading_pdf(string)
    elif ext == ".docx":
        text = reading_word(string)
    else:
        print("Unknown file format.")
        text = ""
    return text


def filtering(text):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the string obtained in the reading step and
    filters out undesired characters. Potential things to filter: table of
    contents, titles, formulas, references, tables (?).
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step
    """
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # removing numbers of the table of contents
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # removing numbers of the table of contents
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # filtering the index
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # filtering long page jumps
    clean1 = re.sub("\no |\n\uf0b7", "", clean1)  # filtering bullet points
    # clean1 = re.sub(" \n", " ", clean1)
    return clean1


def splitting(word, text):
    if word in ("line", "lines"):
        tok_text = [line for line in text.splitlines() if line]  # remove empty lines
    elif word == "sentences":
        # tok_text1 = text.split('. \n')
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        tok_text = text.split('\n\n')
    else:
        tok_text = sent_tokenize(text)  # fall back to sentence splitting
    # tok_text = [content.strip() for content in text.splitlines() if content]
    return tok_text


def ctrlf(words: list, text):
    """Return every sentence of the text that contains one of the selected words."""
    matches = []
    for word in words:
        found = re.findall(rf"[^.]* {word} [^.]*\.", text)
        # found = re.findall(rf"(?i)\b{word}\b [^.]*\.", text)  # case-insensitive variant
        matches.extend(found)
    return matches


def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """
    Takes the filtered text and performs the NLP analysis.
    """
    splitted = splitting(split_param, corpus)
    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function == "dot product":
        score_function = util.dot_score
    else:
        raise ValueError("Choose a valid similarity function")
    result = functions.sim(query, corpus=splitted, model_name=model_name,
                           number=number, score_function=score_function)
    return result


demo = gr.Blocks()

with demo:
    gr.Image("logo_credit_agricole_CIB_0.jpg")
    gr.Markdown("## Important Sentences Recognizer")
    gr.Markdown("This project aims to retrieve critical sentences related to some important words in a document.")
    # gr.Interface(fn=reading_file, inputs=gr.File(), outputs="text")

    with gr.Box():
        with gr.Row():
            file = gr.File()
            with gr.Column():
                b1 = gr.Button("Reading file", variant="primary")
                t1 = gr.Textbox(label="Result")
                b2 = gr.Button("Filtering")
                t2 = gr.Textbox(label="Result")

    gr.Markdown("Now we run the ctrl+f method.")
    with gr.Box():
        checkbox1 = gr.CheckboxGroup(words, label="Select desired words")
        b4 = gr.Button("Run analysis")
        t4 = gr.Textbox(label="Result")

    gr.Markdown("But first we need to choose how to parse the text.")
    with gr.Box():
        t = gr.Textbox(label="Write: sentences, paragraphs or lines", value="sentences")
        # radio1 = gr.Radio(["lines", "sentences", "paragraphs"], label="Parse by", value="sentences", interactive=True)
        b3 = gr.Button("Split text")
        t3 = gr.Textbox(label="Result")

    gr.Markdown("Using the previous result, we now run the NLP analysis.")
    with gr.Box():
        gr.Markdown("Now we will proceed with the analysis.")
        dropdown1 = gr.Dropdown(choices=["all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "msmarco-distilbert-base-v4"], label="Model")
        slider1 = gr.Slider(1, 100, 10, label="Top k", interactive=True, step=1)
        dropdown2 = gr.Dropdown(choices=["cosine similarity", "dot product"], label="Similarity function")
        b5 = gr.Button("Run analysis", variant="primary")
        df1 = gr.Dataframe(row_count=(1, "dynamic"), col_count=(2, "fixed"), label="Important sentences",
                           headers=["Expression", "Score"], overflow_row_behaviour="paginate")

    b1.click(reading_file, inputs=file, outputs=t1)
    b2.click(filtering, inputs=t1, outputs=t2)
    b3.click(splitting, inputs=[t, t2], outputs=t3)
    b4.click(ctrlf, [checkbox1, t2], t4)
    b5.click(fn=total, inputs=[t2, t4, t, dropdown1, slider1, dropdown2], outputs=df1)

demo.launch()
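
# ---------------------------------------------------------------------------
# NOTE: `functions.sim`, called from `total` above, lives in functions.py and
# is not shown in this file. The commented-out sketch below is only a guess at
# what such a semantic-search helper could look like, built on
# sentence_transformers' util.semantic_search; the real implementation and its
# exact return format may differ.
#
# def sim(query, corpus, model_name, number, score_function):
#     """Return the top-`number` corpus entries most similar to `query`."""
#     model = SentenceTransformer(model_name)
#     corpus_emb = model.encode(corpus, convert_to_tensor=True)
#     query_emb = model.encode(query, convert_to_tensor=True)
#     hits = util.semantic_search(query_emb, corpus_emb,
#                                 top_k=number, score_function=score_function)[0]
#     rows = [[corpus[hit["corpus_id"]], round(hit["score"], 3)] for hit in hits]
#     return pd.DataFrame(rows, columns=["Expression", "Score"])
# ---------------------------------------------------------------------------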