|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pdfplumber |
|
import docx2txt |
|
import os |
|
import re |
|
import numpy as np |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sentence_transformers import SentenceTransformer, models,util |
|
import nltk |
|
from nltk.tokenize import sent_tokenize, wordpunct_tokenize |
|
nltk.download("punkt") |
|
|
|
|
|
|
|
def reading_word(string):
    """Extract the raw text of a Word (.docx) document.

    Input:
        string: path of the .docx file to read.

    Returns: single string with all of the document's text.
    """
    # Bug fix: the path was previously hard-coded to "var.docx",
    # silently ignoring the `string` argument.
    return docx2txt.process(string)
|
|
|
def reading_pdf(string):
    """Extract text from a PDF, keeping only small-font characters.

    Characters with size >= 10 are filtered out, which drops large
    headings/titles and keeps body text (assumes body font < 10pt —
    TODO confirm against the documents being processed).

    Input:
        string: path of the PDF file to read.

    Returns: single string; each page's text is prefixed with '\n'.
    """
    pages = []
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Keep every object except characters rendered at size >= 10.
            body_only = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = body_only.extract_text(x_tolerance=2)
            # Bug fix: extract_text() returns None for pages with no text,
            # which previously crashed the string concatenation.
            pages.append(single_page_text or "")
    # join() avoids the quadratic `all_text = all_text + ...` pattern while
    # producing the identical "\npage1\npage2..." layout.
    return "".join("\n" + page for page in pages)
|
|
|
|
|
def reading_file(string):
    """Read a document, dispatching on its file extension.

    This function takes the file that we want to analyze and picks the
    appropriate reader by extension. For the moment we detect only PDF
    and Word (.docx) files.

    Input:
        string: path of the file we want to analyze.

    Returns: long string with all the sentences in the document.

    Raises:
        ValueError: for any unsupported extension. (The original printed
        a message and then crashed with UnboundLocalError on `return text`.)
    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    raise ValueError(f"Unknown file format: {ext!r}")
|
|
|
|
|
def splitting(word: str, text):
    """Tokenize `text` at the requested granularity.

    Input:
        word: one of "line", "sentences", "paragraphs", "words".
        text: string to split (for "line", an iterable of already-split
              lines is expected — the branch only drops empty entries).

    Returns: list of tokens.

    Raises:
        ValueError: for an unknown `word` (previously this fell through
        and crashed with UnboundLocalError on `return tok_text`).
    """
    if word == "line":
        # Drop empty entries, same as filter(lambda a: a != '', text).
        return [chunk for chunk in text if chunk != '']
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        paragraphs = re.split(r'\n{2,}', text)
        # Bug fix: the original removed items from the list while
        # iterating it, which skips the element right after each removal
        # (two adjacent short paragraphs left one behind). A comprehension
        # reliably drops every paragraph shorter than 50 characters.
        return [p for p in paragraphs if len(p) >= 50]
    if word == "words":
        return wordpunct_tokenize(text)
    raise ValueError(f"Unknown split mode: {word!r}")
|
|
|
|
|
def filtering(text):
    """Filter undesired spans out of the string produced by the reading step.

    Removes artifacts typical of extracted documents: section numbering,
    table-of-contents rows with dot leaders, stand-alone page numbers,
    broken Word bookmarks, bullet markers, and runs of blank lines.

    Input:
        text: string obtained in the previous reading step.

    Returns: cleaned string.
    """
    # All patterns are raw strings now: "\d", "\w", "\." are invalid
    # escape sequences in plain string literals (SyntaxWarning on modern
    # Python). The pattern bytes are unchanged, so matching is identical.
    # Section-number lines such as "1.2 Heading ...".
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)
    # Table-of-contents rows: "word(s) .... 12".
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)
    # Stand-alone page numbers and "1. Two words" style headings.
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    # Dot leaders and Word's broken-bookmark placeholder.
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)
    # Collapse long runs of blank lines.
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)
    # Bullet markers: "o " lines and the Wingdings bullet U+F0B7.
    # Kept as a plain string on purpose: here "\uf0b7" must be a Python
    # character escape, not a regex escape.
    clean1 = re.sub("\no |\n\uf0b7", "", clean1)
    return clean1
|
|
|
|
|
def ctrlf(words: list, text):
    """Return every sentence of `text` containing one of `words` (Ctrl+F style).

    A "sentence" here is a run of non-period characters ending in a
    period; the word must be surrounded by single spaces.

    Input:
        words: list of literal words/phrases to look for.
        text: string to search in.

    Returns: list of matching sentences, grouped by word, then in text order.
    """
    matches = []
    for word in words:
        # Bug fix: re.escape() makes the search literal, as a Ctrl+F
        # should be. Previously regex metacharacters in `word` (e.g. "+",
        # "(") silently changed the pattern or raised re.error.
        matches.extend(re.findall(rf"[^.]* {re.escape(word)} [^.]*\.", text))
    return matches
|
|
|
|
|
def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """Plot the corpus entries most similar to `query` as a bar chart.

    -----------------------------------------------------------------------------

    This function takes as arguments the text that we want to compare, the
    query with respect to which we want to compare, the number of
    comparisons to show (by default 5), the model used, and the metric used
    to compute the similarity (by default cosine similarity).

    Returns: seaborn bar plot (Axes) of similarity scores.

    -----------------------------------------------------------------------------

    Input:

    query: String
    corpus: String or list of strings (usually the latter for a document --> list of sentences)
    model_name: String
    number: Int
    score_function: Function
    ax: Axis object

    """
    model = SentenceTransformer(model_name)

    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic_search returns one hit list per query; a single query
    # string gives exactly one list.
    hits = util.semantic_search(
        query_embedding, corpus_embedding, top_k=number, score_function=score_function
    )[0]

    # Build numeric scores directly. The original went through
    # np.column_stack, which stringified the scores and forced an
    # explode/astype round-trip to recover floats.
    data = pd.DataFrame({
        'Expression': [corpus[hit['corpus_id']] for hit in hits],
        'Score': [float(hit['score']) for hit in hits],
    })
    # Bug fix: sort_values() returns a new frame; the original discarded
    # the result, so the rows were never actually re-sorted.
    data = data.sort_values(by=['Score'], ascending=False)

    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
|
|
|
|
|
def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    """Rank corpus entries by semantic similarity to `query`.

    Same retrieval pipeline as everything_vs_word(), but returns the
    data instead of plotting it.

    Input:
        query: String.
        corpus: String or list of strings (usually a list of sentences).
        model_name: String, sentence-transformers model id.
        number: Int, how many top hits to return.
        score_function: similarity function (default cosine similarity).

    Returns: DataFrame with 'Expression' and 'Score' (float) columns,
    sorted by descending score.
    """
    model = SentenceTransformer(model_name)

    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # One hit list per query; a single query string gives exactly one.
    hits = util.semantic_search(
        query_embedding, corpus_embedding, top_k=number, score_function=score_function
    )[0]

    # Numeric scores from the start — the original's np.column_stack
    # stringified them and needed an explode/astype round-trip.
    data = pd.DataFrame({
        'Expression': [corpus[hit['corpus_id']] for hit in hits],
        'Score': [float(hit['score']) for hit in hits],
    })
    # Bug fix: sort_values() returns a new frame; the original discarded
    # the result, relying on semantic_search's output order by accident.
    return data.sort_values(by=['Score'], ascending=False)
|
|
|
|
|
def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    """Run sim() for several queries and merge the results.

    Input:
        query: list of query strings.
        corpus: String or list of strings to rank.
        model_name: String, sentence-transformers model id.
        threshold: currently unused; kept for interface compatibility.
        number: Int, top hits per query.
        score_function: similarity function (default cosine similarity).

    Returns: DataFrame of unique expressions sorted by descending score.
    """
    # Bug fixes vs. the original:
    #  * `functions.sim` was an undefined name (NameError) — call sim() directly;
    #  * `for i in query: query[i]` indexed the list with its own elements —
    #    iterate the query strings themselves;
    #  * pd.DataFrame(frames) on a list of DataFrames mangles them —
    #    concatenate with pd.concat;
    #  * the caller's score_function was ignored (util.cos_sim hard-coded).
    frames = [
        sim(q, corpus, model_name=model_name, number=number, score_function=score_function)
        for q in query
    ]
    result = pd.concat(frames, ignore_index=True)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result
|
|
|
|
|
|
|
|
|
def conclusion():
    """Placeholder for a future summary/conclusion step; does nothing yet."""
    return None
|
|
|
|
|
|