import re
import string
from collections import Counter
import math
from tqdm import tqdm
from itertools import combinations
from nltk.stem import PorterStemmer
# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])
punct = re.compile(f'[{re.escape(string.punctuation)}]')
def tokenize(text):
    # Split text
    return text.split()

def lowercase_filter(tokens):
    # Make text lowercase
    return [token.lower() for token in tokens]

def punctuation_filter(tokens):
    # Remove punctuation
    return [punct.sub('', token) for token in tokens]

def stopword_filter(tokens):
    # Remove stopwords
    return [token for token in tokens if token not in stop_words]

def stem_filter(tokens):
    # Stem words
    ps = PorterStemmer()
    return [ps.stem(token) for token in tokens]

def analyze(text):
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    return [token for token in tokens if token]
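# Illustrative example (not part of the original pipeline): passing a raw string
# through analyze() lowercases it, strips punctuation, drops stop words such as
# "the", "on", and "wikipedia", and stems what remains, e.g.
#
#   analyze('The Wikipedia article on vaccines, revisited.')
#
# returns only the stemmed forms of "article", "vaccines", and "revisited".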
# Set up an index and document structure to reference later
def index_documents(df):
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        if df['ID'].iloc[i] not in doc:
            doc[df['ID'].iloc[i]] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(df['ID'].iloc[i])
        if i % 5000 == 0:
            print(f'Indexed {i} documents', end='\r')
    # Corpus-wide term frequencies over the combined title and abstract text
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
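# Illustrative sketch of the structures returned above (placeholder names,
# assuming a DataFrame `papers_df` with 'ID', 'title', and 'abstract' columns):
#
#   ind, doc, term_frequencies = index_documents(papers_df)
#   ind['vaccin']                # set of document IDs whose title/abstract contain the stem
#   doc[some_id]                 # the DataFrame row stored for that document ID
#   term_frequencies['vaccin']   # corpus-wide count of the stem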
def rank(termfreq, doc, ind, analyzed_query, documents):
    # Score documents with a TF-IDF style weight. Note that `termfreq` holds
    # corpus-wide token counts, so the weight reflects how important each query
    # term is overall rather than its count within an individual document.
    results = []
    if not documents:
        return results
    for document in documents:
        score = 0.0
        for token in analyzed_query:
            postings = ind.get(token, set())
            if len(postings) == 0:
                continue
            tf = termfreq.get(token, 0)
            idf = math.log10(len(doc) / len(postings))
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
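# For reference (a description of the scoring above, with N = len(doc) documents
# in total and df(t) = the number of documents containing token t): each query
# token t contributes
#
#   termfreq[t] * log10(N / df(t))
#
# to a document's score, and documents are returned sorted by that score.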
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this returns documents that contain words from the query,
    and ranks them if requested (sets are fast, but unordered).
    Parameters:
    - tf: the term frequencies, taken from indexing the documents
    - doc: the documents, taken from indexing the documents
    - ind: the index, taken from indexing the documents
    - query: the query string
    - search_type: ('AND', 'OR') whether all query terms have to match, or just one
    - ranking: (True, False) if True, rank results by their TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []
    analyzed_query = analyze(query)
    # Terms prefixed with '-' must not appear in the results
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]
    # Terms wrapped in double quotes must appear in the results
    specific_query = re.findall('"([^"]*)"', query)
    specific_query = ' '.join(specific_query)
    specific_query = [x.replace('"', '') for x in specific_query.split()]
    specific_query = [q for sq in specific_query for q in analyze(sq)]
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]
    # Remove documents that match any excluded term
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
    results = [r for r in results if len(r) > 0]
    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the query: try successively smaller combinations of
                    # terms, always keeping the quoted terms, until something matches
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
            else:
                # All tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from the length of the search query backwards until
                    # some documents are returned, looking at all combinations
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    max_index = combo_len_list.index(max(combo_len_list))
                    documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # Only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]
        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
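# Minimal end-to-end sketch (illustrative; the CSV path below is an assumption,
# the DataFrame just needs the 'ID', 'title', and 'abstract' columns used above):
if __name__ == '__main__':
    import pandas as pd

    papers = pd.read_csv('metadata.csv')  # hypothetical input file
    ind, doc, term_frequencies = index_documents(papers)

    # Quoted phrases must appear, '-' terms are excluded, ranking sorts by score
    hits = search(term_frequencies, doc, ind,
                  '"spike protein" vaccine -influenza',
                  search_type='AND', ranking=True)
    for document, score in hits[:10]:
        print(score, document['title'])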