import math
import re
import string
from collections import Counter
from itertools import combinations

from nltk.stem import PorterStemmer
from tqdm import tqdm
# top 25 most common words in English and "wikipedia":
# https://en.wikipedia.org/wiki/Most_common_words_in_English
stop_words = {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
              'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
              'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia'}
punct = re.compile(f'[{re.escape(string.punctuation)}]')
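# e.g. punct.sub('', "don't!") -> 'dont' (every character in
# string.punctuation is stripped)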
# Create the stemmer once at module level so it is not re-instantiated
# on every call to stem_filter
stemmer = PorterStemmer()

def tokenize(text):
    # Split text on whitespace
    return text.split()

def lowercase_filter(tokens):
    # Lowercase every token
    return [token.lower() for token in tokens]

def punctuation_filter(tokens):
    # Strip punctuation characters from each token
    return [punct.sub('', token) for token in tokens]

def stopword_filter(tokens):
    # Drop common stopwords
    return [token for token in tokens if token not in stop_words]

def stem_filter(tokens):
    # Reduce each token to its Porter stem
    return [stemmer.stem(token) for token in tokens]

def analyze(text):
    # Full analysis pipeline: tokenize, lowercase, strip punctuation,
    # remove stopwords, stem, then drop any tokens emptied along the way
    tokens = tokenize(text)
    tokens = lowercase_filter(tokens)
    tokens = punctuation_filter(tokens)
    tokens = stopword_filter(tokens)
    tokens = stem_filter(tokens)
    return [token for token in tokens if token]
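# Illustrative example of the full pipeline (hypothetical input; the
# exact stems come from NLTK's PorterStemmer):
#   analyze('The cats of Wikipedia have running shoes!')
#   -> ['cat', 'run', 'shoe']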
# Set up an inverted index and a document store to reference later
def index_documents(df):
    ind = {}   # inverted index: token -> set of document IDs
    doc = {}   # document store: ID -> full row
    for i in tqdm(range(df.shape[0])):
        doc_id = df['ID'].iloc[i]
        if doc_id not in doc:
            doc[doc_id] = df.iloc[i]
        full_text = ' '.join([df['title'].iloc[i], df['abstract'].iloc[i]])
        for token in analyze(full_text):
            if token not in ind:
                ind[token] = set()
            ind[token].add(doc_id)
    # Corpus-wide term frequencies over all titles and abstracts
    df['title_abs'] = df['title'] + ' ' + df['abstract']
    all_text = ' '.join(df['title_abs'])
    term_frequencies = Counter(analyze(all_text))
    return ind, doc, term_frequencies
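# Sketch of the expected shapes (hypothetical two-row DataFrame; assumes
# pandas is available to the caller and the 'ID'/'title'/'abstract'
# columns used above):
#   df = pd.DataFrame({'ID': [1, 2],
#                      'title': ['Cat pictures', 'Dog pictures'],
#                      'abstract': ['Cats sleeping.', 'Dogs running.']})
#   ind, doc, tf = index_documents(df)
#   ind['cat'] -> {1}         (inverted index: stem -> set of document IDs)
#   doc[1]     -> row 0 of df (document store keyed by ID)
#   tf['cat']  -> 2           (corpus-wide count of the stem 'cat')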
def rank(termfreq, doc, ind, analyzed_query, documents):
    # TF-IDF ranking: term frequency is counted within each candidate
    # document (termfreq holds corpus-wide counts) and weighted by the
    # inverse document frequency across the whole corpus
    results = []
    if not documents:
        return results
    for document in documents:
        doc_counts = Counter(analyze(' '.join([document['title'],
                                               document['abstract']])))
        score = 0.0
        for token in analyzed_query:
            doc_freq = len(ind.get(token, set()))
            if doc_freq == 0:
                continue
            tf = doc_counts.get(token, 0)
            idf = math.log10(len(doc) / doc_freq)
            score += tf * idf
        results.append((document, score))
    return sorted(results, key=lambda result: result[1], reverse=True)
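# Worked example of the weighting (illustrative numbers): with 1,000
# documents indexed and a stem appearing in 10 of them,
# idf = log10(1000 / 10) = 2; a document containing that stem 3 times
# adds 3 * 2 = 6 to its score.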
def search(tf, doc, ind, query, search_type='AND', ranking=False):
    """
    Search the index; returns documents that contain words from the query,
    ranked if requested (sets are fast, but unordered).
    Parameters:
    - tf: term frequencies, as returned by index_documents
    - doc: document store, as returned by index_documents
    - ind: inverted index, as returned by index_documents
    - query: the query string
    - search_type: ('AND', 'OR') whether all query terms have to match, or just one
    - ranking: (True, False) if True, rank results by TF-IDF score
    """
    if search_type not in ('AND', 'OR'):
        return []
    analyzed_query = analyze(query)
    # Terms prefixed with '-' must NOT appear in the results
    minus_query = [x[1:] for x in query.split() if x[0] == '-']
    minus_query = [q for mq in minus_query for q in analyze(mq)]
    # Terms wrapped in double quotes must appear in every result
    specific_query = re.findall('"([^"]*)"', query)
    specific_query = [q for sq in specific_query for q in analyze(sq)]
    # Look up the candidate document-ID sets for each group of terms
    results = [ind.get(token, set()) for token in analyzed_query]
    minus_results = [ind.get(token, set()) for token in minus_query]
    specific_results = [ind.get(token, set()) for token in specific_query]
    # Remove documents that match any excluded term
    if len(minus_results) > 0:
        for j in range(len(results)):
            for minus_result in minus_results:
                results[j] = results[j] - minus_result
    # Drop terms that matched nothing so they cannot empty an AND query
    results = [r for r in results if len(r) > 0]
    if len(results) > 0:
        if search_type == 'AND':
            if len(specific_results) > 0:
                # Deal with users who use "" to demand specific terms
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Relax the query: try ever smaller combinations of the
                    # query terms, always intersected with the quoted terms
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for combo in all_combos:
                            combo_len_list.append(len(set.intersection(*combo, *specific_results)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    if max(combo_len_list) > 0:
                        max_index = combo_len_list.index(max(combo_len_list))
                        documents = [doc[doc_id] for doc_id in
                                     set.intersection(*all_combos[max_index], *specific_results)]
            else:
                # All tokens must be in the document
                documents = [doc[doc_id] for doc_id in set.intersection(*results)]
                if len(documents) == 0:
                    # Iterate from the full query length backwards, trying all
                    # combinations, until some documents are returned
                    for x in range(len(results), 1, -1):
                        combo_len_list = []
                        all_combos = list(combinations(results, x))
                        for combo in all_combos:
                            combo_len_list.append(len(set.intersection(*combo)))
                        if len(combo_len_list) == 0:
                            continue
                        if max(combo_len_list) > 0:
                            break
                    max_index = combo_len_list.index(max(combo_len_list))
                    documents = [doc[doc_id] for doc_id in set.intersection(*all_combos[max_index])]
                    if len(documents) == 0:
                        # Last resort: match any query term
                        documents = [doc[doc_id] for doc_id in set.union(*results)]
        elif search_type == 'OR':
            # Only one token has to be in the document
            documents = [doc[doc_id] for doc_id in set.union(*results)]
        if ranking:
            return rank(tf, doc, ind, analyzed_query, documents)
    else:
        documents = []
    return documents
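
if __name__ == '__main__':
    # Minimal end-to-end sketch on synthetic rows (hypothetical data;
    # assumes pandas is installed and the column names match what
    # index_documents expects)
    import pandas as pd

    df = pd.DataFrame({
        'ID': [1, 2, 3],
        'title': ['Cat pictures', 'Dog pictures', 'Bird songs'],
        'abstract': ['Cats sleeping on sofas.',
                     'Dogs running in parks.',
                     'Birds singing at dawn.'],
    })
    ind, doc, tf = index_documents(df)

    # AND search: every analysed query term must appear in the document
    for row in search(tf, doc, ind, 'cat pictures', search_type='AND'):
        print(row['title'])

    # OR search with ranking: any term may match, best TF-IDF score first
    for row, score in search(tf, doc, ind, 'cats dogs',
                             search_type='OR', ranking=True):
        print(row['title'], round(score, 3))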