import math
import re
import string
from collections import Counter
from itertools import combinations

from nltk.stem import PorterStemmer
from tqdm import tqdm
# Top 25 most common words in English, plus "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
                  'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
                  'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'])
punct = re.compile(f'[{re.escape(string.punctuation)}]')
# Create the stemmer once; analyze() is called for every document
stemmer = PorterStemmer()
def tokenize(text):
    # Split text on whitespace
    return text.split()


def lowercase_filter(tokens):
    # Make tokens lowercase
    return [token.lower() for token in tokens]


def punctuation_filter(tokens):
    # Strip punctuation from each token
    return [punct.sub('', token) for token in tokens]


def stopword_filter(tokens):
    # Remove stopwords
    return [token for token in tokens if token not in stop_words]


def stem_filter(tokens):
    # Stem words
    return [stemmer.stem(token) for token in tokens]


def analyze(text):
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    # Drop tokens that became empty after punctuation stripping
    return [token for token in tokens if token]
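
# Illustrative example of the pipeline (stems checked against NLTK's
# PorterStemmer; exact output may vary with the stemmer version):
#   analyze('The Wikipedia article, and Searching articles!')
#   -> ['articl', 'search', 'articl']
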
# Build an inverted index and a document store to reference later
def index_documents(df):
    ind = {}
    doc = {}
    for i in tqdm(range(0, df.shape[0])):
        doc_id = df['ID'].iloc[i]
        if doc_id not in doc:
            doc[doc_id] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            # Each token maps to the set of document IDs that contain it
            if token not in ind:
                ind[token] = set()
            ind[token].add(doc_id)
    # Corpus-wide term frequencies over titles and abstracts
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
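
# Illustrative usage; index_documents() expects a pandas DataFrame with
# 'ID', 'title' and 'abstract' columns (this toy frame is an assumption,
# not part of the real data):
#   import pandas as pd
#   df = pd.DataFrame({'ID': [1, 2],
#                      'title': ['Searching text', 'Indexing text'],
#                      'abstract': ['How search works.', 'How indexes work.']})
#   ind, doc, term_frequencies = index_documents(df)
#   ind['search']  # -> {1}: IDs of documents containing the stem 'search'
#   (note: index_documents also adds a 'title_abs' column to df)
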
def rank(termfreq, doc, ind, analyzed_query, documents):
    # TF-IDF ranking: tf is the token count within the document itself,
    # idf down-weights tokens that appear in many documents.
    # termfreq holds the corpus-wide counts from index_documents (not used
    # for the per-document score).
    results = []
    if not documents:
        return results
    for document in documents:
        doc_counts = Counter(analyze(' '.join([document['title'], document['abstract']])))
        score = 0.0
        for token in analyzed_query:
            doc_ids = ind.get(token, set())
            if len(doc_ids) == 0:
                continue
            tf = doc_counts.get(token, 0)
            idf = math.log10(len(doc) / len(doc_ids))
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
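
# Worked example of the score: with 10,000 documents indexed and a query
# token that appears in 10 of them, idf = log10(10000 / 10) = 3; a document
# containing that token twice contributes tf * idf = 2 * 3 = 6 to its score.
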
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search; this will return documents that contain words from the query,
    and rank them if requested (sets are fast, but unordered).
    Parameters:
    - tf: the term frequencies, taken from indexing documents
    - doc: the document store, taken from indexing documents
    - ind: the inverted index, taken from indexing documents
    - query: the query string
    - search_type: ('AND', 'OR') do all query terms have to match, or just one
    - ranking: (True, False) if True, rank results based on TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []
    analyzed_query = analyze(query)
    # Terms prefixed with '-' must not appear in any returned document
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]
    # Terms inside double quotes are treated as required
    specific_query = ' '.join(re.findall('"([^"]*)"', query))
    specific_query = [q for sq in specific_query.split() for q in analyze(sq)]
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]
    # Drop every document that matches an excluded ('-') term
    if len(minus_results) > 0:
        for j in range(0, len(results)):
            for i in range(0, len(minus_results)):
                results[j] = results[j] - minus_results[i]
    results = [r for r in results if len(r) > 0]
    if len(results) > 0:
        if search_type == 'AND':
            # Deal with users who use "" to get specific results
            if len(specific_results) > 0:
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the query: try successively smaller combinations
                    # of query terms, always keeping the quoted terms required
                    combo_len_list = []
                    all_combos = []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c], *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if len(combo_len_list) > 0 and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id]
                                     for doc_id in set.intersection(*all_combos[max_index], *specific_results)]
            else:
                # All tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from the length of the search query backwards
                    # until some documents are returned, trying all combinations
                    combo_len_list = []
                    all_combos = []
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for c in range(0, len(all_combos)):
                            combo_len_list.append(len(set.intersection(*all_combos[c])))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if len(combo_len_list) > 0 and max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        if search_type == 'OR':
            # Only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]
        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
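
# Illustrative search calls (the query strings are assumptions for the example):
#   ind, doc, tf = index_documents(df)
#   search(tf, doc, ind, 'searching indexes', search_type='AND', ranking=True)
#   search(tf, doc, ind, 'searching -indexes')   # '-' excludes a term
#   search(tf, doc, ind, '"searching" indexes')  # quoted terms stay required
#                                                # even when AND is relaxed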