import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search
from utils.timing import Timer
from corpora import gen_corpus
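
# The stopword and tokenization steps below rely on the NLTK 'stopwords' and 'punkt'
# data packages. This guarded download is a convenience sketch, not part of the
# original module; drop it if the data is provisioned elsewhere (newer NLTK releases
# may also need 'punkt_tab' for word_tokenize).
import nltk

for _resource, _path in [('stopwords', 'corpora/stopwords'), ('punkt', 'tokenizers/punkt')]:
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource, quiet=True)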
def extract(query: str, search_model, n: int = 3, extracted_documents: list = None) -> str:
    """Extract the n most relevant paragraphs from the corpus for the given query.

    Parameters:
        query (str): Sentence used to search the corpus for relevant documents
        search_model: Model used by the semantic search steps
        n (int): Number of paragraphs to return
        extracted_documents (list, optional): Documents from a previous coarse search;
            when provided, the exhaustive search over the corpus is skipped

    Returns:
        str: The n most relevant paragraphs joined by line breaks
    """
    # Open corpus
    with Timer('geração do corpus', 'corpus generation'):
        corpus = gen_corpus(query)

    # Set up the query: lowercase, tokenize, then drop stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    query_tokens = word_tokenize(query.lower())
    tokens_without_sw = [word for word in query_tokens if word not in stop_words]
    keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
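    # e.g. "What causes auroras in the sky?" -> keywords ['causes', 'auroras', 'sky']
    # after the stopword and punctuation filtering above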
    # Coarse (exhaustive) search over the corpus, unless documents were already given
    with Timer('busca exaustiva', 'exhaustive search'):
        if not extracted_documents:
            extracted_documents, documents_empty, documents_sizes = document_extraction(
                dataset=corpus,
                query=query,
                keywords=keywords,
                min_document_size=0,
                min_just_one_paragraph_size=0
            )
    # First semantic search (over documents)
    with Timer('busca semantica nos documentos', 'semantic search over documents'):
        selected_documents, documents_distances = semantic_search(
            model=search_model,
            query=query,
            files=extracted_documents,
            number_of_similar_files=10
        )
    # Second semantic search (over paragraphs)
    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
        paragraphs = paragraph_extraction(
            documents=selected_documents,
            min_paragraph_size=20,
        )
        selected_paragraphs, paragraphs_distances = semantic_search(
            model=search_model,
            query=query,
            files=paragraphs,
            number_of_similar_files=10
        )

    text = '\n'.join(selected_paragraphs[:n])
    return text
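

# Usage sketch (not part of the original module). It assumes a sentence-transformers
# model is an acceptable `search_model` and that FewDocumentsError is raised when the
# coarse search finds too few documents; both are assumptions about this project's
# internals, so adjust them to the actual model type and error behaviour used here.
# Because this module uses relative imports, the example only runs when the module is
# executed as part of its package.
if __name__ == '__main__':
    from sentence_transformers import SentenceTransformer  # assumed model family

    model = SentenceTransformer('all-MiniLM-L6-v2')  # hypothetical model choice
    try:
        context = extract('What causes auroras?', search_model=model, n=3)
        print(context)
    except FewDocumentsError:
        print('Too few documents matched the query; try a broader one.')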