import nmslib
import numpy as np
import streamlit as st
# import inflect
import torch
from os import environ

# p = inflect.engine()

class FewDocumentsError(Exception):
  """Raised when fewer documents than expected match the query.

  Carries the matched documents and their count so the caller can offer to
  proceed with the smaller result set.
  """
  def __init__(self, documents, size, msg):
    self.documents = documents
    self.size = size
    self.msg = msg

  def __str__(self):
    return repr(self.msg)

def document_extraction(dataset, query, keywords, min_document_size, min_just_one_paragraph_size):
  """Filter `dataset` down to documents matching the query or its keywords.

  A document is kept only if it is longer than `min_document_size` words and
  contains at least one paragraph longer than `min_just_one_paragraph_size` words.
  """
  # TODO: compare inflected forms
  # word_in_text = lambda word, text: any([p.compare(word, w) for w in text.split()])
  word_in_text = lambda word, text: word in set(text.split())
  lower_dataset = [document.lower() for document in dataset]
  lower_query = query.lower()
  lower_keywords = [keyword.lower() for keyword in keywords]

  if environ['PORTUGUESE'] == 'true':
    portuguese = True
  elif environ['PORTUGUESE'] == 'false':
    portuguese = False
  else:
    raise EnvironmentError('The PORTUGUESE environment variable must be "true" or "false"')

  documents = {}

  # Pair each document with its lowercased form so the original text can be
  # returned directly; this avoids repeated list.index() lookups, which are
  # O(n) per document and return the wrong entry when documents are duplicated.
  documents['QUERY'] = [
    original for original, document in zip(dataset, lower_dataset)
    if word_in_text(lower_query, document)
    and (len(document.split()) > min_document_size)
    and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
  ]

  documents['AND'] = [
    original for original, document in zip(dataset, lower_dataset)
    if all(word_in_text(keyword, document) for keyword in lower_keywords)
    and (len(document.split()) > min_document_size)
    and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
  ]

  documents['OR'] = [
    original for original, document in zip(dataset, lower_dataset)
    if any(word_in_text(keyword, document) for keyword in lower_keywords)
    and (len(document.split()) > min_document_size)
    and any(len(paragraph.split()) > min_just_one_paragraph_size for paragraph in document.splitlines())
  ]

  empty = {
    'QUERY': len(documents['QUERY']) == 0,
    'AND': len(documents['AND']) == 0,
    'OR': len(documents['OR']) == 0
  }

  sizes = {
    'QUERY': len(documents['QUERY']),
    'AND': len(documents['AND']),
    'OR': len(documents['OR'])
  }

  if all(empty.values()):
    # TODO: raise an error here instead of stopping the Streamlit script directly
    if portuguese:
      st.warning(f'Nenhum documento encontrado para a query "{query}", por favor, tente com outra query')
    else:
      st.warning(f'No document found for the query "{query}", please try with another query')
    st.stop()

  # Prefer the strictest filter that still yields at least 10 documents.
  if sizes['QUERY'] >= 10:
    extracted_documents = documents['QUERY']
  elif sizes['AND'] >= 10:
    extracted_documents = documents['AND']
  elif sizes['OR'] >= 10:
    extracted_documents = documents['OR']
  else:
    number_of_documents = sizes['OR']
    if portuguese:
      raise FewDocumentsError(
        documents['OR'], number_of_documents,
        f'Somente {number_of_documents} documentos encontrados para a query "{query}". '
        f'Por favor selecione "Prosseguir" para prosseguir com {number_of_documents} documentos ou tente novamente com outra query'
      )
    else:
      raise FewDocumentsError(
        documents['OR'], number_of_documents,
        f'Only {number_of_documents} documents found for the query "{query}". '
        f'Please select "Proceed" to proceed with {number_of_documents} documents or try again with another query'
      )

  return extracted_documents, empty, sizes
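
# A minimal sketch of how a Streamlit caller might handle FewDocumentsError.
# The variables `dataset`, `query`, `keywords`, and the size thresholds below
# are hypothetical; this is illustrative only and not part of this module's API:
#
#   try:
#     docs, empty, sizes = document_extraction(dataset, query, keywords, 20, 10)
#   except FewDocumentsError as error:
#     st.warning(error.msg)
#     if st.button('Proceed'):
#       docs = error.documents
#     else:
#       st.stop()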

def paragraph_extraction(documents, min_paragraph_size):
  """Split each document into paragraphs and keep those longer than `min_paragraph_size` words."""
  paragraphs = [
    paragraph for document in documents for paragraph in document.splitlines()
    if len(paragraph.split()) > min_paragraph_size
  ]

  return paragraphs

def semantic_search(model, query, files, number_of_similar_files):
  """Return the `number_of_similar_files` texts closest to `query`, plus their angular distances in degrees."""
  encoded_query = model.encode(query)
  encoded_files = model.encode(files)

  # Build an HNSW index over the embeddings using angular distance (returned in radians).
  model_index = nmslib.init(method='hnsw', space='angulardist')
  model_index.addDataPointBatch(encoded_files)
  model_index.createIndex({'post': 2})

  ids, distances = model_index.knnQuery(encoded_query, k=number_of_similar_files)

  selected_files = [files[index] for index in ids]

  # Convert the angular distances from radians to degrees.
  distances = 180*distances/np.pi

  return selected_files, distances
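
# A minimal, hypothetical usage sketch of the full pipeline. The
# sentence-transformers model and the tiny in-memory corpus below are
# assumptions for illustration; neither is provided by this module.
if __name__ == '__main__':
  from sentence_transformers import SentenceTransformer

  environ.setdefault('PORTUGUESE', 'false')

  corpus = [
    'Neural networks learn representations.\nDeep learning uses many layers of neurons to learn features from data.',
    'Gradient descent minimizes a loss function.\nOptimization proceeds by following the negative gradient of the loss at each step.',
  ]
  model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed embedding model

  try:
    docs, empty, sizes = document_extraction(corpus, 'learning', ['neural', 'gradient'], 5, 5)
  except FewDocumentsError as error:
    docs = error.documents  # proceed with however many documents matched

  paragraphs = paragraph_extraction(docs, 5)
  best, distances = semantic_search(model, 'how do neural networks learn?', paragraphs, 2)
  print(best, distances)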