import string
from typing import Optional

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from corpora import gen_corpus
from utils.timing import Timer
from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search


@Timer.time_it('extração', 'extraction')
def extract(query: str, search_model, n: int = 3,
            extracted_documents: Optional[list] = None) -> str:
    """Extract the n most relevant paragraphs from the corpus for a query.

    Pipeline: generate a corpus for the query, keyword-filter it
    (exhaustive search), then run two rounds of semantic search —
    first over whole documents, then over the paragraphs of the
    best-matching documents.

    Parameters:
        query (str): Sentence used to search the corpus for relevant
            documents.
        search_model: Embedding/search model passed to
            ``semantic_search`` (project type; not inspected here).
        n (int): Number of paragraphs to return. Defaults to 3.
        extracted_documents (Optional[list]): Pre-computed result of the
            exhaustive search. When provided (and non-empty), the
            keyword-based document extraction step is skipped.

    Returns:
        str: The n most relevant paragraphs joined by line breaks.
    """
    # Open corpus
    with Timer('geração do corpus', 'corpus generation'):
        corpus = gen_corpus(query)

    # Setup query: drop stopwords and punctuation from the tokenized query
    stop_words = set(stopwords.words('english'))
    query_tokens = word_tokenize(query.lower())
    keywords = [
        token for token in query_tokens
        if token not in stop_words and token not in string.punctuation
    ]

    # Gross search (skipped when the caller supplies pre-extracted documents)
    with Timer('busca exaustiva', 'exhaustive search'):
        if not extracted_documents:
            extracted_documents, documents_empty, documents_sizes = document_extraction(
                dataset=corpus,
                query=query,
                keywords=keywords,
                min_document_size=0,
                min_just_one_paragraph_size=0
            )

    # First semantic search (over documents)
    with Timer('busca semantica nos documentos', 'semantic search over documents'):
        selected_documents, documents_distances = semantic_search(
            model=search_model,
            query=query,
            files=extracted_documents,
            number_of_similar_files=10
        )

    # Second semantic search (over paragraphs)
    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
        paragraphs = paragraph_extraction(
            documents=selected_documents,
            min_paragraph_size=20,
        )
        selected_paragraphs, paragraphs_distances = semantic_search(
            model=search_model,
            query=query,
            files=paragraphs,
            number_of_similar_files=10
        )

    # Join only the top-n paragraphs into the final answer text
    text = '\n'.join(selected_paragraphs[:n])

    return text