import re
import string
import math
from collections import Counter
from itertools import combinations

from tqdm import tqdm
from nltk.stem import PorterStemmer

# Words that are too common to be useful as query terms.
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])

# Matches any single punctuation character, so it can be stripped from tokens.
punct = re.compile(f'[{re.escape(string.punctuation)}]')
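
# Quick sanity check of the punctuation regex (safe to run at import time;
# it has no side effects beyond the assert):
assert punct.sub('', 'state-of-the-art!') == 'stateoftheart'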


def tokenize(text):
    """Split the text on whitespace."""
    return text.split()


def lowercase_filter(tokens):
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    return [punct.sub('', token) for token in tokens]


def stopword_filter(tokens):
    return [token for token in tokens if token not in stop_words]


def stem_filter(tokens):
    ps = PorterStemmer()
    return [ps.stem(token) for token in tokens]


def analyze(text):
    """Run the full analysis pipeline and drop any tokens that end up empty."""
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    return [token for token in tokens if token]
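
# Example of what the pipeline produces (a rough sketch; the exact stems
# depend on NLTK's PorterStemmer):
#
#   analyze('The Effectiveness of Masks Against Transmission')
#   # -> something like ['effect', 'mask', 'against', 'transmiss']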


def index_documents(df):
    """Build an inverted index, a doc-id -> row lookup, and collection-level
    term frequencies from a DataFrame with 'ID', 'title' and 'abstract' columns."""
    ind = {}
    doc = {}
    for i in tqdm(range(df.shape[0])):
        doc_id = df['ID'].iloc[i]
        if doc_id not in doc:
            doc[doc_id] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(doc_id)
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    # Collection-level term frequencies over all titles and abstracts; the
    # prints simply bracket the potentially slow join/analyze step.
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    print('Before all text')
    all_text = ' '.join(df['title_abs'])
    print('After all text')
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
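
# Usage sketch (assumes a pandas DataFrame with 'ID', 'title' and 'abstract'
# columns; the file name below is only an illustration):
#
#   import pandas as pd
#   df = pd.read_csv('abstracts.csv')
#   ind, doc, tf = index_documents(df)
#   len(ind)    # number of distinct index terms
#   len(doc)    # number of indexed documents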


def rank(termfreq, doc, ind, analyzed_query, documents):
    """Score the matched documents with TF-IDF and return them best-first.

    Note that `termfreq` holds collection-level counts, so the tf part of
    the score comes from the whole corpus rather than from each document."""
    results = []
    if not documents:
        return results
    for document in documents:
        score = 0.0
        for token in analyzed_query:
            doc_freq = len(ind.get(token, set()))
            if doc_freq == 0:
                # Token never occurs in the index; it cannot contribute.
                continue
            tf = termfreq.get(token, 0)
            idf = math.log10(len(doc) / doc_freq)
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda pair: pair[1], reverse=True)
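
# Because rank() uses the same corpus-wide tf for every candidate, documents
# that match the same query terms receive identical scores. A per-document
# variant (a sketch, not what rank() above does) would count each query term
# inside the document's own title and abstract instead:

def rank_per_document(doc, ind, analyzed_query, documents):
    """TF-IDF where tf is counted inside each document's title + abstract."""
    results = []
    for document in documents:
        doc_tokens = Counter(analyze(' '.join([document['title'], document['abstract']])))
        score = 0.0
        for token in analyzed_query:
            doc_freq = len(ind.get(token, set()))
            if doc_freq == 0:
                continue
            # e.g. with 1,000 documents and a term appearing in 10 of them,
            # idf = log10(1000 / 10) = 2.0
            idf = math.log10(len(doc) / doc_freq)
            score += doc_tokens.get(token, 0) * idf
        results.append((document, score))
    return sorted(results, key=lambda pair: pair[1], reverse=True)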


def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search the index and return the documents that contain words from the
    query, ranking them if requested (sets are fast, but unordered).

    Query syntax: terms prefixed with '-' are excluded, and terms inside
    double quotes stay required if the AND search has to relax other terms.

    Parameters:
    - tf: the term frequencies. Taken from indexing documents
    - doc: documents. Taken from indexing documents
    - ind: index. Taken from indexing documents
    - query: the query string
    - search_type: ('AND', 'OR') do all query terms have to match, or just one
    - ranking: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []

    analyzed_query = analyze(query)

    # Terms prefixed with '-' must not appear in the results.
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]

    # Terms inside double quotes must appear in the results, even when the
    # AND search falls back to matching only a subset of the query terms.
    specific_query = re.findall('"([^"]*)"', query)
    specific_query = ' '.join(specific_query).split()
    specific_query = [q for sq in specific_query for q in analyze(sq)]

    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]

    # Remove excluded documents from every per-term result set, then drop the
    # sets that end up empty.
    if len(minus_results) > 0:
        for j in range(len(results)):
            for minus_result in minus_results:
                results[j] = results[j] - minus_result
        results = [r for r in results if len(r) > 0]

    if len(results) > 0:
        if search_type == 'AND':
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # No document matches every term: look for the largest
                    # subset of terms that still co-occurs with the quoted terms.
                    combo_len_list, all_combos = [], []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for combo in all_combos:
                            combo_len_list.append(len(set.intersection(*combo, *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if combo_len_list and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
            else:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Same fallback without quoted terms: relax the query to
                    # the largest subset of terms that still matches something.
                    combo_len_list, all_combos = [], []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for combo in all_combos:
                            combo_len_list.append(len(set.intersection(*combo)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if combo_len_list and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            documents = [doc[doc_id] for doc_id in set.union(*results)]

        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
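
# Usage sketch (the query terms below are only illustrations):
#
#   ind, doc, tf = index_documents(df)
#   search(tf, doc, ind, 'solar energy storage', search_type='AND')
#   search(tf, doc, ind, 'solar energy storage', search_type='OR', ranking=True)
#   search(tf, doc, ind, 'solar "energy storage" -nuclear')
#
# The last call drops documents containing the excluded term and, if no
# document matches every term, keeps the quoted terms as required while the
# rest of the query is relaxed.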