#*********************************************************************
# This file could be a first building block of the project.
# For now it only contains functions used throughout the other files,
# but in the future it could hold more complex structures.
#*********************************************************************
import pdfplumber
import docx2txt
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models, util
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
nltk.download("punkt")  # tokenizer models required by sent_tokenize
def reading_word(string):
    # Extract the raw text of a .docx file (the path is given by `string`).
    text = docx2txt.process(string)
    return text
def reading_pdf(string):
    # Extract the raw text of a PDF file (the path is given by `string`),
    # dropping characters whose font size is >= 10 (headings, titles, ...).
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            filtered = pdf_page.filter(lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10))
            single_page_text = filtered.extract_text(x_tolerance=2)
            if single_page_text:
                # separate each page's text with a newline
                all_text = all_text + '\n' + single_page_text
    return all_text
def reading_file(string):
    """
    -----------------------------------------------------------------------------
    Reads the file that we want to analyze. Depending on the file type, a
    different Python library is used. For the moment only PDF and Word (.docx)
    files are detected.
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        string: path of the file we want to analyze
    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        text = reading_pdf(string)
    elif ext == ".docx":
        text = reading_word(string)
    else:
        raise ValueError("Unknown file format: " + ext)
    return text
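# Usage sketch ("my_report.pdf" is a placeholder path, not a file shipped with this repo):
#   raw_text = reading_file("my_report.pdf")    # or a .docx file
#   clean_text = filtering(raw_text)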
def splitting(word: str, text):
    # Split `text` into lines, sentences, paragraphs or words, depending on `word`.
    if word == "line":
        tok_text = [line for line in text.split('\n') if line != '']  # remove empty lines
    elif word == "sentences":
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        tok_text = re.split(r'\n{2,}', text)
        tok_text = [p for p in tok_text if len(p) >= 50]  # drop very short paragraphs
    elif word == "words":
        tok_text = wordpunct_tokenize(text)
    else:
        raise ValueError("Unknown splitting mode: " + word)
    return tok_text
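# Usage sketch (assumes `clean_text` comes from filtering(reading_file(...)) as above):
#   sentences  = splitting("sentences", clean_text)
#   paragraphs = splitting("paragraphs", clean_text)
#   words      = splitting("words", clean_text)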
def filtering(text):
    """
    -----------------------------------------------------------------------------
    Takes the string obtained in the reading step and filters out undesired
    characters. Potential things to filter: table of contents, titles, formulas,
    references, tables (?).
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step.
    """
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # removing numbers of the table of contents
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # removing table-of-contents entries
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # removing page numbers and numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # filtering the index
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # filtering long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # filtering bullet-point markers
    return clean1
def ctrlf(words: list, text):
    # Return every sentence of `text` that contains one of the given words
    # (simple regex matching, similar to a Ctrl+F search).
    b = []
    for word in words:
        a = re.findall(rf"[^.]* {word} [^.]*\.", text)
        #a = re.findall(rf"(?i)\b{word}\b [^.]*\.", text)  # case-insensitive alternative
        b = b + a
    return b
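# Usage sketch (the keyword list is only an example):
#   matches = ctrlf(["safety", "risk"], clean_text)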
def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------
    Takes the corpus that we want to compare, the query with respect to which we
    compare it, the number of comparisons we want to show (by default 5), the
    model used, and the metric used to compute the similarity (by default cosine
    similarity).
    Returns: bar plot of the top-scoring sentences.
    -----------------------------------------------------------------------------
    Input:
        query: string
        corpus: string or list of strings (usually the latter for a document --> list of sentences)
        number: int
        model_name: string
        score_function: function
        ax: matplotlib Axes object
    """
    # load the pretrained Sentence-Transformers model
    model = SentenceTransformer(model_name)
    # embed the corpus and the query with the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives a list of lists composed of dictionaries
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])
    # build a dataframe for ease of plotting
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data = data.sort_values(by=['Score'], ascending=False)
    data['Score'] = data['Score'].astype('float')
    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
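# Usage sketch ("all-MiniLM-L6-v2" is just one example of a pretrained
# Sentence-Transformers model; any other model name also works):
#   fig, ax = plt.subplots(figsize=(8, 4))
#   everything_vs_word("machine learning", sentences, model_name="all-MiniLM-L6-v2", number=10, ax=ax)
#   plt.show()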
def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    # Same semantic search as everything_vs_word, but returns the scores as a
    # dataframe instead of plotting them.
    model = SentenceTransformer(model_name)
    # embed the corpus and the query with the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives a list of lists composed of dictionaries
    hits = util.semantic_search(query_embedding, corpus_embedding, top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = []
    corp = []
    for hit in hits:
        scoring.append(hit['score'])
        corp.append(corpus[hit['corpus_id']])
    # build a dataframe with one row per retrieved sentence
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data = data.sort_values(by=['Score'], ascending=False)
    data['Score'] = data['Score'].astype('float')
    return data
def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    # Run sim() for every query in the list and merge the results into a single
    # dataframe, keeping each matched sentence only once.
    # NOTE: `threshold` is currently unused.
    frames = []
    for q in query:
        frames.append(sim(q, corpus, model_name=model_name, number=number, score_function=score_function))
    result = pd.concat(frames)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result
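# Usage sketch (queries and model name are only examples; `threshold` is not used
# by sim_2 yet, so any value can be passed):
#   scores = sim_2(["data protection", "privacy"], sentences,
#                  model_name="all-MiniLM-L6-v2", threshold=0.5, number=10)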
############ EXTRA BALL ################
# TODO: detect the conclusion section and collect all the sentences of that paragraph for future use.
def conclusion():
    return
########## TODO: add a function with the distribution of the results per word
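if __name__ == "__main__":
    # Minimal end-to-end sketch: the file path, query and model name below are
    # placeholders, adapt them to your own document and setup.
    raw_text = reading_file("example.pdf")
    clean_text = filtering(raw_text)
    sentences = splitting("sentences", clean_text)
    everything_vs_word("climate change", sentences, model_name="all-MiniLM-L6-v2", number=10)
    plt.show()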