# autosumm/extractor/_utils.py
import nmslib
import numpy as np
import streamlit as st
# import inflect
import torch
from os import environ

# p = inflect.engine()


class FewDocumentsError(Exception):
    """Raised when fewer documents than required match the query or keywords."""

    def __init__(self, documents, size, msg):
        self.documents = documents
        self.size = size
        self.msg = msg

    def __str__(self):
        return repr(self.msg)
def document_extraction(dataset, query, keywords, min_document_size, min_just_one_paragraph_size):
    """Filter the dataset down to documents matching the query or its keywords.

    Applies three progressively looser filters (exact query, all keywords, any
    keyword) and returns the first set containing at least 10 documents.
    """
    # TODO: compare inflected forms
    # word_in_text = lambda word, text: any([p.compare(word, w) for w in text.split()])
    word_in_text = lambda word, text: word in set(text.split())

    lower_dataset = [document.lower() for document in dataset]
    lower_query = query.lower()
    lower_keywords = [keyword.lower() for keyword in keywords]

    if environ['PORTUGUESE'] == 'true':
        portuguese = True
    elif environ['PORTUGUESE'] == 'false':
        portuguese = False
    else:
        raise EnvironmentError('PORTUGUESE must be set to "true" or "false"')

    # A document qualifies if it matches textually, is long enough overall,
    # and has at least one sufficiently long paragraph.
    documents = {}
    documents['QUERY'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if word_in_text(lower_query, document)
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]
    documents['AND'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if all(word_in_text(keyword, document) for keyword in lower_keywords)
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]
    documents['OR'] = [
        dataset[lower_dataset.index(document)] for document in lower_dataset
        if any(word_in_text(keyword, document) for keyword in lower_keywords)
        and (len(document.split()) > min_document_size)
        and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
    ]

    empty = {
        'QUERY': len(documents['QUERY']) == 0,
        'AND': len(documents['AND']) == 0,
        'OR': len(documents['OR']) == 0
    }
    sizes = {
        'QUERY': len(documents['QUERY']),
        'AND': len(documents['AND']),
        'OR': len(documents['OR'])
    }

    if all(empty.values()):
        # TODO: throw error
        st.info(empty.values())
        if portuguese:
            st.warning(f'Nenhum documento encontrado para a query "{query}", por favor, tente com outra query')
        else:
            st.warning(f'No document found for the query "{query}", please try with another query')
        st.stop()

    # Prefer the strictest filter that still yields at least 10 documents.
    if sizes['QUERY'] >= 10:
        extracted_documents = documents['QUERY']
    elif sizes['AND'] >= 10:
        extracted_documents = documents['AND']
    elif sizes['OR'] >= 10:
        extracted_documents = documents['OR']
    else:
        number_of_documents = sizes['OR']
        if portuguese:
            raise FewDocumentsError(
                documents['OR'], number_of_documents,
                f'Somente {number_of_documents} documentos encontrados para a query "{query}". '
                f'Por favor selecione "Prosseguir" para prosseguir com {number_of_documents} documentos ou tente novamente com outra query'
            )
        else:
            raise FewDocumentsError(
                documents['OR'], number_of_documents,
                f'Only {number_of_documents} documents found for the query "{query}". '
                f'Please select "Proceed" to proceed with {number_of_documents} documents or try again with another query'
            )

    return extracted_documents, empty, sizes
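

# Hedged usage sketch (illustration only; the corpus, query, and thresholds below
# are assumptions, not values taken from the app):
#
#   try:
#       extracted, empty, sizes = document_extraction(
#           dataset=corpus,                      # list of raw document strings
#           query='machine translation',
#           keywords=['machine', 'translation'],
#           min_document_size=100,
#           min_just_one_paragraph_size=20,
#       )
#   except FewDocumentsError as error:
#       # Fewer than 10 matches: the caller may still proceed with error.documents,
#       # e.g. after the user confirms via the "Proceed" button mentioned in error.msg.
#       extracted = error.documents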
def paragraph_extraction(documents, min_paragraph_size):
    """Split documents into paragraphs and keep those longer than min_paragraph_size words."""
    paragraphs = [
        paragraph
        for document in documents
        for paragraph in document.splitlines()
        if len(paragraph.split()) > min_paragraph_size
    ]
    return paragraphs
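

# Hedged usage sketch (the threshold is an assumption for illustration):
#
#   paragraphs = paragraph_extraction(extracted, min_paragraph_size=20)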
def semantic_search(model, query, files, number_of_similar_files):
    """Return the files most similar to the query, with angular distances in degrees."""
    encoded_query = model.encode(query)
    encoded_files = model.encode(files)

    # Build an HNSW index over the encoded files using angular distance.
    model_index = nmslib.init(method='hnsw', space='angulardist')
    model_index.addDataPointBatch(encoded_files)
    model_index.createIndex({'post': 2})

    ids, distances = model_index.knnQuery(encoded_query, k=number_of_similar_files)
    selected_files = [files[index] for index in ids]
    # nmslib reports angular distance in radians; convert to degrees.
    distances = 180 * distances / np.pi
    return selected_files, distances
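

# Minimal demo sketch (not part of the original module). It assumes a
# sentence-transformers encoder; the model name and toy corpus below are
# illustrative assumptions only.
if __name__ == '__main__':
    from sentence_transformers import SentenceTransformer

    toy_corpus = [
        'Neural networks approximate functions from data.',
        'The recipe calls for two cups of flour.',
        'Transformers are widely used for text summarization.',
    ]
    toy_model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed model name
    matches, angles = semantic_search(toy_model, 'text summarization', toy_corpus, number_of_similar_files=2)
    for match, angle in zip(matches, angles):
        # Smaller angular distance (in degrees) means higher similarity.
        print(f'{angle:.1f} deg -> {match}')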