bradley6597's picture
Move from dev environment
d76b5ba
raw
history blame
6.67 kB
import re
import string
from collections import Counter
import math
from tqdm import tqdm
from itertools import combinations
from nltk.stem import PorterStemmer
# Stop words: the 25 most common English words, plus "wikipedia".
# Source: https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = {
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
    'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
    'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia',
}

# Character class matching any single ASCII punctuation character.
punct = re.compile('[' + re.escape(string.punctuation) + ']')
def tokenize(text):
    """Break ``text`` into a list of whitespace-separated tokens."""
    pieces = text.split()
    return pieces
def lowercase_filter(tokens):
    """Return a new list holding the lowercase form of every token."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered
def punctuation_filter(tokens):
    """Return a new list with punctuation characters removed from each token.

    A token made up only of punctuation becomes an empty string; it is not
    dropped from the list here.
    """
    cleaned = []
    for word in tokens:
        cleaned.append(punct.sub('', word))
    return cleaned
def stopword_filter(tokens):
    """Return the tokens that are not listed in ``stop_words``, in order."""
    kept = []
    for word in tokens:
        if word in stop_words:
            continue
        kept.append(word)
    return kept
def stem_filter(tokens):
    """Return a new list with each token reduced to its Porter stem."""
    stemmer = PorterStemmer()
    stems = []
    for word in tokens:
        stems.append(stemmer.stem(word))
    return stems
def analyze(text):
    """Turn raw text into a list of normalized, non-empty index tokens.

    The text is split on whitespace and then passed, in order, through the
    lowercase, punctuation, stop-word and stemming filters. Tokens that end
    up empty (for example, punctuation-only words) are discarded.
    """
    pipeline = (
        lowercase_filter,
        punctuation_filter,
        stopword_filter,
        stem_filter,
    )
    words = tokenize(text)
    for step in pipeline:
        words = step(words)
    return [word for word in words if word]
# Setup an index and document structure to reference later
def index_documents(df):
    """
    Build the inverted index, document store and term frequencies for df.

    Parameters:
    - df: table of documents, accessed through ``df['ID']``, ``df['title']``,
      ``df['abstract']`` and ``df.iloc``. Titles and abstracts must be
      strings, because they are joined with a space before analysis.

    Returns a tuple ``(ind, doc, term_frequencies)``:
    - ind: dict mapping each analyzed token to the set of document IDs
      containing it
    - doc: dict mapping each document ID to its row (``df.iloc[i]``); when an
      ID appears more than once only the first row is stored and indexed
    - term_frequencies: Counter of analyzed tokens over every row of df

    Side effect: adds a ``title_abs`` column (title + ' ' + abstract) to df.
    """
    ind = {}
    doc = {}
    term_frequencies = Counter()

    # Resolve the columns once instead of on every row.
    ids = df['ID']
    titles = df['title']
    abstracts = df['abstract']

    for i in tqdm(range(len(df))):
        # Analyze each row exactly once; the same tokens feed both the
        # corpus-wide counts and the inverted index.
        tokens = analyze(' '.join([titles.iloc[i], abstracts.iloc[i]]))
        term_frequencies.update(tokens)

        doc_id = ids.iloc[i]
        if doc_id not in doc:
            doc[doc_id] = df.iloc[i]
            for token in tokens:
                ind.setdefault(token, set()).add(doc_id)

        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')

    # Kept for callers that read the combined column from df afterwards.
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    return ind, doc, term_frequencies
def rank(termfreq, doc, ind, analyzed_query, documents):
    """
    Score documents against a query with TF-IDF, highest score first.

    Parameters:
    - termfreq: mapping of token -> frequency, looked up with ``.get``
    - doc: mapping of all indexed documents; only its size is used
    - ind: mapping of token -> set of IDs of the documents containing it
    - analyzed_query: list of already-analyzed query tokens
    - documents: the documents to score

    Returns a list of ``(document, score)`` tuples sorted by descending
    score, or an empty list when ``documents`` is empty. Query tokens that
    are missing from ``ind`` contribute nothing to the score.

    NOTE: the score is built only from per-token values (``termfreq`` and
    ``ind``), never from the individual document, so every document receives
    the same score and the input order is preserved.
    """
    if not documents:
        return []

    # The score does not depend on the document, so compute it once.
    total_docs = len(doc)
    score = 0.0
    for token in analyzed_query:
        matching_docs = len(ind.get(token, ()))
        if matching_docs == 0:
            continue
        idf = math.log10(total_docs / matching_docs)
        score += termfreq.get(token, 0) * idf

    results = [(document, score) for document in documents]
    return sorted(results, key=lambda pair: pair[1], reverse=True)
def _largest_overlapping_combo(results, required=()):
    """
    Find the biggest group of result sets that still share a document.

    Tries combinations of ``results`` from size ``len(results)`` down to 2.
    At the first size where some combination, intersected with every set in
    ``required``, is non-empty, returns the first combination with the
    largest such intersection. Returns None when no combination overlaps.
    """
    for size in range(len(results), 1, -1):
        best_combo = None
        best_len = 0
        for combo in combinations(results, size):
            overlap = len(set.intersection(*combo, *required))
            # Strict '>' keeps the first combination on ties.
            if overlap > best_len:
                best_combo = combo
                best_len = overlap
        if best_combo is not None:
            return best_combo
    return None


def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this will return documents that contain words from the query,
    and rank them if requested (sets are fast, but unordered).

    Query syntax:
    - a word prefixed with '-' excludes documents containing that term
    - words inside double quotes are treated as required terms when an
      'AND' search has to fall back to a partial match

    Parameters:
    - tf: the term frequencies. Taken from indexing documents
    - doc: documents. Taken from indexing documents
    - ind: index. Taken from indexing documents
    - query: the query string
    - search_type: ('AND', 'OR') do all query terms have to match, or just one
    - ranking: (True, False) if True, rank results based on TF-IDF score

    Returns a list of documents (values of ``doc``), or the output of
    ``rank`` when ``ranking`` is True. Returns an empty list for an
    unsupported ``search_type`` or when no query term matches.

    'AND' fallback: when no document contains every term, the largest subset
    of terms that still shares a document is used instead. If even no pair
    of terms overlaps, an unquoted query falls back to the union of all
    matches, while a query with quoted terms returns nothing.
    """
    if search_type not in ('AND', 'OR'):
        return []

    analyzed_query = analyze(query)

    # Terms to exclude: words written as "-term".
    minus_words = [word[1:] for word in query.split() if word.startswith('-')]
    minus_query = [tok for word in minus_words for tok in analyze(word)]

    # Required terms: every word found between double quotes.
    quoted_words = ' '.join(re.findall('"([^"]*)"', query)).split()
    specific_query = [
        tok
        for word in quoted_words
        for tok in analyze(word.replace('"', ''))
    ]

    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]

    if minus_results:
        # Builds new sets, so the sets stored in the index are not modified.
        results = [hits.difference(*minus_results) for hits in results]

    # Drop terms that matched nothing (or were emptied by an exclusion).
    results = [hits for hits in results if hits]
    if not results:
        return []

    if search_type == 'OR':
        # only one token has to be in the document
        matched_ids = set.union(*results)
    else:
        # all tokens must be in the document
        matched_ids = set.intersection(*results)
        if not matched_ids:
            combo = _largest_overlapping_combo(results, specific_results)
            if combo is not None:
                matched_ids = set.intersection(*combo)
            elif not specific_results:
                matched_ids = set.union(*results)

    documents = [doc[doc_id] for doc_id in matched_ids]
    if ranking:
        return rank(tf, doc, ind, analyzed_query, documents)
    return documents