import math
import re
import string
from collections import Counter
from itertools import combinations

from nltk.stem import PorterStemmer
from tqdm import tqdm
# Top 25 most common words in English, plus "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])
punct = re.compile(f'[{re.escape(string.punctuation)}]')
# Create the stemmer once; analyze() is called for every document
stemmer = PorterStemmer()
def tokenize(text):
    # Split text on whitespace
    return text.split()


def lowercase_filter(tokens):
    # Make tokens lowercase
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    # Strip punctuation from each token
    return [punct.sub('', token) for token in tokens]


def stopword_filter(tokens):
    # Remove stopwords
    return [token for token in tokens if token not in stop_words]


def stem_filter(tokens):
    # Stem words
    return [stemmer.stem(token) for token in tokens]


def analyze(text):
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    # Drop tokens that became empty after punctuation stripping
    return [token for token in tokens if token]
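
# Illustrative example of the pipeline (stems checked against NLTK's
# PorterStemmer; exact output may vary with the stemmer version):
#   analyze('The Wikipedia article, and Searching articles!')
#   -> ['articl', 'search', 'articl']
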
# Build an inverted index and a document store to reference later
def index_documents(df):
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        doc_id = df['ID'].iloc[i]
        if doc_id not in doc:
            doc[doc_id] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            # Each token maps to the set of document IDs that contain it
            if token not in ind:
                ind[token] = set()
            ind[token].add(doc_id)
    # Corpus-wide term frequencies over titles and abstracts
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
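
# Illustrative usage; index_documents() expects a pandas DataFrame with
# 'ID', 'title' and 'abstract' columns (this toy frame is an assumption,
# not part of the real data):
#   import pandas as pd
#   df = pd.DataFrame({'ID': [1, 2],
#                      'title': ['Searching text', 'Indexing text'],
#                      'abstract': ['How search works.', 'How indexes work.']})
#   ind, doc, term_frequencies = index_documents(df)
#   ind['search']  # -> {1}: IDs of documents containing the stem 'search'
#   (note: index_documents also adds a 'title_abs' column to df)
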
def rank(termfreq, doc, ind, analyzed_query, documents):
    # TF-IDF ranking: tf is the token count within the document itself,
    # idf down-weights tokens that appear in many documents.
    # termfreq holds the corpus-wide counts from index_documents (not used
    # for the per-document score).
    results = []
    if not documents:
        return results
    for document in documents:
        doc_counts = Counter(analyze(' '.join([document['title'], document['abstract']])))
        score = 0.0
        for token in analyzed_query:
            doc_ids = ind.get(token, set())
            if len(doc_ids) == 0:
                continue
            tf = doc_counts.get(token, 0)
            idf = math.log10(len(doc) / len(doc_ids))
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
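
# Worked example of the score: with 10,000 documents indexed and a query
# token that appears in 10 of them, idf = log10(10000 / 10) = 3; a document
# containing that token twice contributes tf * idf = 2 * 3 = 6 to its score.
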
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this will return documents that contain words from the query,
    and rank them if requested (sets are fast, but unordered).
    Parameters:
    - tf: the term frequencies, taken from indexing documents
    - doc: the document store, taken from indexing documents
    - ind: the inverted index, taken from indexing documents
    - query: the query string
    - search_type: ('AND', 'OR') do all query terms have to match, or just one
    - ranking: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []
    analyzed_query = analyze(query)
    # Terms prefixed with '-' must not appear in any returned document
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]
    # Terms inside double quotes are treated as required
    specific_query = ' '.join(re.findall('"([^"]*)"', query))
    specific_query = [q for sq in specific_query.split() for q in analyze(sq)]
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]
    # Drop every document that matches an excluded ('-') term
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
    results = [r for r in results if len(r) > 0]
    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the query: try successively smaller combinations
                    # of query terms, always keeping the quoted terms required
                    combo_len_list = []
                    all_combos = []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if len(combo_len_list) > 0 and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id]
                                     for doc_id in set.intersection(*all_combos[max_index], *specific_results)]
            else:
                # All tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from the length of the search query backwards
                    # until some documents are returned, trying all combinations
                    combo_len_list = []
                    all_combos = []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if len(combo_len_list) > 0 and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # Only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]
        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
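
# Illustrative search calls (the query strings are assumptions for the example):
#   ind, doc, tf = index_documents(df)
#   search(tf, doc, ind, 'searching indexes', search_type='AND', ranking=True)
#   search(tf, doc, ind, 'searching -indexes')   # '-' excludes a term
#   search(tf, doc, ind, '"searching" indexes')  # quoted terms stay required
#                                                # even when AND is relaxed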