import math
import re
import string
from collections import Counter
from itertools import combinations

from nltk.stem import PorterStemmer
from tqdm import tqdm


# The 25 most common words in English, plus "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
              'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
              'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'}

# Matches any single punctuation character, used to strip punctuation from tokens
punct = re.compile(f'[{re.escape(string.punctuation)}]')

def tokenize(text):
    # Split on whitespace
    return text.split()

def lowercase_filter(tokens):
    # Lowercase every token
    return [token.lower() for token in tokens]

def punctuation_filter(tokens):
    # Strip punctuation characters from each token
    return [punct.sub('', token) for token in tokens]

def stopword_filter(tokens):
    # Drop common words that carry little meaning for search
    return [token for token in tokens if token not in stop_words]

# Create the stemmer once rather than on every call
stemmer = PorterStemmer()

def stem_filter(tokens):
    # Reduce each token to its Porter stem (e.g. 'articles' -> 'articl')
    return [stemmer.stem(token) for token in tokens]

def analyze(text):
    # Full text-processing pipeline; drop any tokens left empty by the filters
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)

    return [token for token in tokens if token]
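
# A small illustration of the pipeline above (the input string is hypothetical):
#   analyze("The Wikipedia, Articles!")
#   -> tokenize:           ['The', 'Wikipedia,', 'Articles!']
#   -> lowercase_filter:   ['the', 'wikipedia,', 'articles!']
#   -> punctuation_filter: ['the', 'wikipedia', 'articles']
#   -> stopword_filter:    ['articles']
#   -> stem_filter:        ['articl']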


# Build an inverted index (token -> set of document IDs), a document store
# (ID -> row), and corpus-wide term frequencies to reference later
def index_documents(df):
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        doc_id = df['ID'].iloc[i]
        if doc_id in doc:
            # Skip documents that have already been indexed
            continue
        doc[doc_id] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(doc_id)
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
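
# Shape of the structures returned by index_documents() (the values are made up):
#   ind:  {'articl': {'ID1', 'ID7'}, 'search': {'ID2', 'ID7'}, ...}
#   doc:  {'ID1': <DataFrame row with 'ID', 'title', 'abstract', ...>, ...}
#   term_frequencies: Counter({'articl': 42, 'search': 17, ...})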


def rank(termfreq, doc, ind, analyzed_query, documents):
    """
    Rank documents with a TF-IDF style score: how often each query token
    appears in a document, weighted by how rare the token is across the
    corpus. termfreq holds the corpus-wide counts from index_documents().
    """
    results = []
    if not documents:
        return results
    for document in documents:
        # Term frequency of each token within this document (title + abstract)
        doc_tf = Counter(analyze(' '.join([document['title'], document['abstract']])))
        score = 0.0
        for token in analyzed_query:
            postings = ind.get(token, set())
            if len(postings) == 0:
                continue
            # Inverse document frequency: log of (total docs / docs containing token)
            idf = math.log10(len(doc) / len(postings))
            score += doc_tf.get(token, 0) * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
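
# A worked example of the scoring with made-up numbers: with 1,000 indexed
# documents and a query token that appears in 10 of them,
# idf = log10(1000 / 10) = 2.0, so a document containing that token twice
# contributes 2 * 2.0 = 4.0 to its score.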



def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search the index; this returns documents that contain words from the
    query, and ranks them if requested (sets are fast, but unordered).

    Query syntax: prefix a word with '-' to exclude documents that contain it,
    and wrap words in double quotes to require them in every result.

    Parameters:
        - tf: the term frequencies. Taken from indexing documents
        - doc: documents. Taken from indexing documents
        - ind: index. Taken from indexing documents
        - query: the query string
        - search_type: ('AND', 'OR') do all query terms have to match, or just one
        - ranking: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []

    analyzed_query = analyze(query)

    # Terms prefixed with '-' exclude any document that contains them
    minus_query = [x[1:] for x in query.split() if x.startswith('-')]
    minus_query = [q for mq in minus_query for q in analyze(mq)]

    # Terms wrapped in double quotes must appear in every result
    specific_query = ' '.join(re.findall('"([^"]*)"', query))
    specific_query = [q for sq in specific_query.split() for q in analyze(sq)]

    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]

    # Remove excluded documents from every candidate set, then drop empty sets
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
    results = [r for r in results if len(r) > 0]

    if len(results) > 0:
        if search_type == 'AND':
            # all tokens must be in the document
            documents = [doc[doc_id] for doc_id in set.intersection(*results)]
            if len(documents) == 0:
                # No document contains every token: iterate from the length of
                # the query backwards over all combinations of token sets until
                # some documents are returned. Quoted ("specific") terms, if
                # any, must still appear in every fallback match.
                combo_len_list = []
                all_combos = []
                for x in range(len(results), 1, -1):
                    combo_len_list = []
                    all_combos = list(combinations(results, x))
                    for combo in all_combos:
                        required = list(combo) + specific_results
                        combo_len_list.append(len(set.intersection(*required)))
                    if combo_len_list and max(combo_len_list) > 0:
                        break
                if combo_len_list and max(combo_len_list) > 0:
                    max_index = combo_len_list.index(max(combo_len_list))
                    required = list(all_combos[max_index]) + specific_results
                    documents = [doc[doc_id] for doc_id in set.intersection(*required)]
                elif not specific_results:
                    # Last resort when nothing was quoted: any token may match
                    documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]

        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
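

if __name__ == '__main__':
    # A minimal usage sketch: the DataFrame columns 'ID', 'title', and
    # 'abstract' and the sample rows below are illustrative assumptions
    # about the data index_documents() expects.
    import pandas as pd

    sample = pd.DataFrame({
        'ID': ['d1', 'd2', 'd3'],
        'title': ['Deep learning for ranked search',
                  'Classical information retrieval',
                  'Gardening tips for beginners'],
        'abstract': ['Neural networks applied to ranking search results.',
                     'TF-IDF scoring over an inverted index.',
                     'How to grow tomatoes at home.'],
    })

    ind_ex, doc_ex, tf_ex = index_documents(sample)

    # OR search ranked by TF-IDF: returns (document, score) tuples
    for row, score in search(tf_ex, doc_ex, ind_ex, 'search ranking',
                             search_type='OR', ranking=True):
        print(row['ID'], round(score, 3))

    # AND search excluding a term with '-' and requiring a quoted term
    for row in search(tf_ex, doc_ex, ind_ex, '"retrieval" scoring -gardening'):
        print(row['ID'])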