autosumm / extractor /extract.py
mhsvieira's picture
Add timer
a9e7556
raw
history blame
2.38 kB
from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search
from utils.timing import Timer
from corpora import gen_corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
@Timer.time_it('extração', 'extraction')
def extract(query: str, search_model, n: int = 3, extracted_documents: list = None) -> str:
    """Extract up to n paragraphs from the corpus using the given query.

    The pipeline is: generate a corpus for the query, derive keywords from
    the query, run a keyword-based document extraction (skipped when
    documents are supplied by the caller), then run two semantic searches,
    first over documents and then over the paragraphs of the selected
    documents.

    Parameters:
    query (str): Sentence used to search the corpus for relevant documents
    search_model: Model object forwarded as ``model`` to ``semantic_search``
    n (int): Maximum number of paragraphs to return
    extracted_documents (list): Previously extracted documents. When given
        and non-empty, the keyword-based document extraction is skipped
        and these documents are used instead.

    Returns:
    str: The first n paragraphs returned by the paragraph-level semantic
        search, joined by line breaks
    """
    # Open corpus
    # NOTE(review): the corpus is generated even when extracted_documents is
    # supplied and the corpus ends up unused. Left unchanged because the side
    # effects of gen_corpus and of this Timer are not visible from here.
    with Timer('geração do corpus', 'corpus generation'):
        corpus = gen_corpus(query)

    # Setup query: lowercase, tokenize, then drop stop words and punctuation.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)
    keywords = [
        token
        for token in word_tokenize(query.lower())
        if token not in stop_words
        # Drop tokens made up only of punctuation characters. A plain
        # `token in string.punctuation` is a substring test and lets
        # multi-character tokens such as "...", "--" or the tokenizer's
        # quote tokens through.
        and not punctuation_chars.issuperset(token)
    ]

    # Gross search
    with Timer('busca exaustiva', 'exhaustive search'):
        if not extracted_documents:
            # Only the extracted documents are needed; the two other
            # returned values are unused here.
            extracted_documents, _documents_empty, _documents_sizes = document_extraction(
                dataset=corpus,
                query=query,
                keywords=keywords,
                min_document_size=0,
                min_just_one_paragraph_size=0
            )

    # First semantic search (over documents)
    with Timer('busca semantica nos documentos', 'semantic search over documents'):
        selected_documents, _documents_distances = semantic_search(
            model=search_model,
            query=query,
            files=extracted_documents,
            number_of_similar_files=10
        )

    # Second semantic search (over paragraphs)
    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
        paragraphs = paragraph_extraction(
            documents=selected_documents,
            min_paragraph_size=20,
        )

        # NOTE(review): 10 paragraphs are requested here, so n > 10
        # presumably cannot yield more than 10 paragraphs; confirm against
        # semantic_search before raising this limit.
        selected_paragraphs, _paragraphs_distances = semantic_search(
            model=search_model,
            query=query,
            files=paragraphs,
            number_of_similar_files=10
        )

    return '\n'.join(selected_paragraphs[:n])