import math
import re
import string
from collections import Counter
from itertools import combinations

from nltk.stem import PorterStemmer
from tqdm import tqdm

# top 25 most common words in English, plus "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])

# Matches any single punctuation character
punct = re.compile(f'[{re.escape(string.punctuation)}]')
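# For example, punct.sub('', 'covid-19;') yields 'covid19'.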
def tokenize(text):
    # Split text on whitespace
    return text.split()


def lowercase_filter(tokens):
    # Make tokens lowercase
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    # Remove punctuation from each token
    return [punct.sub('', token) for token in tokens]


def stopword_filter(tokens):
    # Remove stopwords
    return [token for token in tokens if token not in stop_words]


def stem_filter(tokens):
    # Stem words
    ps = PorterStemmer()
    return [ps.stem(token) for token in tokens]


def analyze(text):
    # Run the full analysis pipeline, dropping tokens left empty by the filters
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    return [token for token in tokens if token]
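
# A quick sanity check of the analysis pipeline (illustrative; the exact stems
# come from NLTK's PorterStemmer):
#   analyze('The "Wikipedia" articles, ranked by relevance!')
#   -> ['articl', 'rank', 'relev']
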
# Set up an inverted index and document store to reference later
def index_documents(df):
    ind = {}   # token -> set of IDs of documents containing that token
    doc = {}   # document ID -> that document's DataFrame row
    for i in tqdm(range(0, df.shape[0])):
        if df['ID'].iloc[i] not in doc:
            doc[df['ID'].iloc[i]] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(df['ID'].iloc[i])
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    # Corpus-wide term frequencies over the combined title + abstract text
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
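
# Example usage (a sketch; assumes a pandas DataFrame with 'ID', 'title' and
# 'abstract' string columns -- the file name below is hypothetical):
#   import pandas as pd
#   df = pd.read_csv('metadata.csv').fillna('')
#   ind, doc, tf = index_documents(df)
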
def rank(termfreq, doc, ind, analyzed_query, documents):
    # Rank matching documents by TF-IDF: a query token's frequency within the
    # document, weighted by how rare that token is across the whole collection
    results = []
    if not documents:
        return results
    for document in documents:
        score = 0.0
        # Term counts for this document's title + abstract
        doc_counts = Counter(analyze(' '.join([document['title'], document['abstract']])))
        for token in analyzed_query:
            doc_freq = len(ind.get(token, set()))
            if doc_freq == 0:
                continue
            tf = doc_counts.get(token, 0)
            idf = math.log10(len(doc) / doc_freq)
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
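
# For instance, with 1,000 indexed documents, a query token appearing in 10 of
# them gets idf = log10(1000 / 10) = 2, multiplied by its count in the document.
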
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search the index; returns documents that contain words from the query,
    and ranks them if requested (sets are fast, but unordered).
    Parameters:
      - tf: the term frequencies, taken from indexing documents
      - doc: the documents, taken from indexing documents
      - ind: the index, taken from indexing documents
      - query: the query string
      - search_type: ('AND', 'OR') do all query terms have to match, or just one
      - ranking: (True, False) if True, rank results by TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []
    analyzed_query = analyze(query)
    # Terms prefixed with '-' must not appear in the results
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]
    # Terms wrapped in double quotes must always appear in the results
    specific_query = ' '.join(re.findall('"([^"]*)"', query))
    specific_query = [q for sq in specific_query.split() for q in analyze(sq)]
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]
    # Drop documents that contain any of the minus terms
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
    results = [r for r in results if len(r) > 0]
    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the query: try smaller combinations of terms,
                    # always keeping the quoted terms
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index], *specific_results)]
            else:
                # All tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from the length of the search query backwards until
                    # some documents are returned, looking at all combinations
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    max_index = combo_len_list.index(max(combo_len_list))
                    documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        # Fall back to any document that matches at least one term
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # Only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]
        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
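
# Example queries (a sketch; ind, doc and tf are assumed to come from
# index_documents above):
#   search(tf, doc, ind, 'coronavirus transmission', search_type='AND', ranking=True)
#   search(tf, doc, ind, 'coronavirus -influenza', search_type='OR')
#   search(tf, doc, ind, '"spike protein" binding', search_type='AND')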