import string
from typing import Optional

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from corpora import gen_corpus
from utils.timing import Timer

from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search

@Timer.time_it('extração', 'extraction')
def extract(query: str, search_model, n: int = 3, extracted_documents: Optional[list] = None) -> str:
    """Extract the n most relevant paragraphs from the corpus for the given query.

    Parameters:
    query (str): Sentence used to search the corpus for relevant documents
    search_model: Sentence-embedding model passed to semantic_search to rank documents and paragraphs
    n (int): Number of paragraphs to return (default 3)
    extracted_documents (list, optional): Previously extracted documents; when given, the exhaustive search step is skipped

    Returns:
    str: The n most relevant paragraphs joined by line breaks
    """
    # Open corpus
    with Timer('geração do corpus', 'corpus generation'):
        corpus = gen_corpus(query)

    # Set up the query: lowercase, tokenize, drop stop words and punctuation
    stop_words = set(stopwords.words('english'))
    query_tokens = word_tokenize(query.lower())
    tokens_without_sw = [word for word in query_tokens if word not in stop_words]
    keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
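    # For example, 'What is a transformer model?' yields the keywords
    # ['transformer', 'model'] once stop words and punctuation are removed.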

    # Exhaustive keyword search (skipped when pre-extracted documents are supplied)
    with Timer('busca exaustiva', 'exhaustive search'):
        if not extracted_documents:
            extracted_documents, documents_empty, documents_sizes = document_extraction(
                dataset=corpus,
                query=query,
                keywords=keywords,
                min_document_size=0,
                min_just_one_paragraph_size=0
            )

    # First semantic search (over documents)
    with Timer('busca semantica nos documentos', 'semantic search over documents'):
        selected_documents, documents_distances = semantic_search(
            model=search_model,
            query=query,
            files=extracted_documents,
            number_of_similar_files=10
        )

    # Second semantic search (over paragraphs)
    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
        paragraphs = paragraph_extraction(
            documents=selected_documents,
            min_paragraph_size=20,
        )
        selected_paragraphs, paragraphs_distances = semantic_search(
            model=search_model,
            query=query,
            files=paragraphs,
            number_of_similar_files=10
        )

    # Keep only the n best-ranked paragraphs and join them with line breaks
    text = '\n'.join(selected_paragraphs[:n])

    return text
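
# Usage sketch, not part of the original module: it assumes search_model is a
# sentence-embedding model such as one from sentence-transformers, which is a
# guess about what semantic_search expects; the model name and query below are
# illustrative only.
if __name__ == '__main__':
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')
    text = extract('What is a transformer model?', search_model=model, n=3)
    print(text)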