import string
from typing import Optional

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from corpora import gen_corpus
from utils.timing import Timer

from ._utils import FewDocumentsError
from ._utils import document_extraction, paragraph_extraction, semantic_search

@Timer.time_it('extração', 'extraction')
def extract(query: str, search_model, n: int = 3, extracted_documents: Optional[list] = None) -> str:
    """Extract the n most relevant paragraphs from the corpus for the given query.

    Parameters:
    query (str): Sentence used to search the corpus for relevant documents
    search_model: Sentence-embedding model passed to semantic_search to rank documents and paragraphs
    n (int): Number of paragraphs to return (default 3)
    extracted_documents (list, optional): Previously extracted documents; when given, the exhaustive search step is skipped

    Returns:
    str: The n most relevant paragraphs joined by line breaks
    """
    # Open corpus
    with Timer('geração do corpus', 'corpus generation'):
        corpus = gen_corpus(query)

    # Set up the query: lowercase, tokenize, drop stop words and punctuation
    stop_words = set(stopwords.words('english'))
    query_tokens = word_tokenize(query.lower())
    tokens_without_sw = [word for word in query_tokens if word not in stop_words]
    keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
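    # For example, 'What is a transformer model?' yields the keywords
    # ['transformer', 'model'] once stop words and punctuation are removed.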

    # Exhaustive keyword search (skipped when pre-extracted documents are supplied)
    with Timer('busca exaustiva', 'exhaustive search'):
        if not extracted_documents:
            extracted_documents, documents_empty, documents_sizes = document_extraction(
                dataset=corpus,
                query=query,
                keywords=keywords,
                min_document_size=0,
                min_just_one_paragraph_size=0
            )

    # First semantic search (over documents)
    with Timer('busca semantica nos documentos', 'semantic search over documents'):
        selected_documents, documents_distances = semantic_search(
            model=search_model,
            query=query,
            files=extracted_documents,
            number_of_similar_files=10
        )

    # Second semantic search (over paragraphs)
    with Timer('busca semantica nos parágrafos', 'semantic search over paragraphs'):
        paragraphs = paragraph_extraction(
            documents=selected_documents,
            min_paragraph_size=20,
        )
        selected_paragraphs, paragraphs_distances = semantic_search(
            model=search_model,
            query=query,
            files=paragraphs,
            number_of_similar_files=10
        )

    # Keep only the n best-ranked paragraphs and join them with line breaks
    text = '\n'.join(selected_paragraphs[:n])

    return text
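
# Usage sketch, not part of the original module: it assumes search_model is a
# sentence-embedding model such as one from sentence-transformers, which is a
# guess about what semantic_search expects; the model name and query below are
# illustrative only.
if __name__ == '__main__':
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')
    text = extract('What is a transformer model?', search_model=model, n=3)
    print(text)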