Spaces:

autosummproject
/

autosumm

Runtime error

App Files Files Community

autosumm / extractor /extract.py

mhsvieira

Add timer

a9e7556 almost 3 years ago

raw

history blame

2.38 kB

	from ._utils import FewDocumentsError
	from ._utils import document_extraction, paragraph_extraction, semantic_search
	from utils.timing import Timer
	from corpora import gen_corpus
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	import string

	@Timer.time_it('extração', 'extraction')
	def extract(query: str, search_model, n: int = 3, extracted_documents: list = None) -> str:
	    """Extract n paragraphs from the corpus using the given query.

	    The pipeline is: build the corpus, derive keywords from the query,
	    extract candidate documents (unless some were supplied), run a semantic
	    search over the documents, then a second semantic search over the
	    paragraphs of the selected documents.

	    Parameters:
	    query (str): Sentence used to search the corpus for relevant documents
	    search_model: Model forwarded as ``model`` to both semantic searches
	    n (int): Number of paragraphs to return
	    extracted_documents (list): Documents to search. When None or empty,
	        they are obtained by running ``document_extraction`` on the corpus

	    Returns:
	    str: String containing the n most relevant paragraphs joined by line breaks
	    """
	    # Open corpus
	    # NOTE(review): the corpus is generated even when `extracted_documents`
	    # is supplied, in which case it is never read below.
	    with Timer('geração do corpus', 'corpus generation'):
	        corpus = gen_corpus(query)

	    # Setup query: lower-case, tokenize, drop stop words and punctuation.
	    stop_words = set(stopwords.words('english'))
	    punctuation = set(string.punctuation)
	    query_tokens = word_tokenize(query.lower())
	    # A token is discarded when it consists solely of punctuation characters.
	    # The previous `token not in string.punctuation` was a substring test, so
	    # multi-character punctuation tokens such as '...' or '--' were kept as
	    # keywords.
	    keywords = [
	        token
	        for token in query_tokens
	        if token not in stop_words
	        and not all(char in punctuation for char in token)
	    ]

	    # Gross search
	    with Timer('busca exaustiva', 'exhaustive search'):
	        if not extracted_documents:
	            # Only the documents are needed; the two other returned values
	            # are not used by this function.
	            extracted_documents, _, _ = document_extraction(
	                dataset=corpus,
	                query=query,
	                keywords=keywords,
	                min_document_size=0,
	                min_just_one_paragraph_size=0,
	            )

	    # First semantic search (over documents)
	    with Timer('busca semantica nos documentos', 'semantic search over documents'):
	        selected_documents, _ = semantic_search(
	            model=search_model,
	            query=query,
	            files=extracted_documents,
	            number_of_similar_files=10,
	        )

	    # Second semantic search (over paragraphs)
	    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
	        paragraphs = paragraph_extraction(
	            documents=selected_documents,
	            min_paragraph_size=20,
	        )
	        # NOTE(review): presumably at most 10 paragraphs come back here, so
	        # n > 10 would still yield at most 10 -- confirm against
	        # semantic_search.
	        selected_paragraphs, _ = semantic_search(
	            model=search_model,
	            query=query,
	            files=paragraphs,
	            number_of_similar_files=10,
	        )

	    return '\n'.join(selected_paragraphs[:n])