|
|
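# Builds a simple inverted index over publication titles scraped into
# scraper_results.json; the record fields ("name", "pub_url", "cu_author",
# "date") are the keys this script reads below.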
|
import ujson

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
|
# Load the raw scraper output (a JSON array of publication records).
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()
|
# Per-field lists pulled out of each publication record.
pubName = []
pubURL = []
pubCUAuthor = []
pubDate = []
|
# Parse the JSON string into a list of dictionaries.
data_dict = ujson.loads(scraper_results)
|
array_length = len(data_dict)
print(array_length)
|
# Collect each field into its own list.
for item in data_dict:
    pubName.append(item["name"])
    pubURL.append(item["pub_url"])
    pubCUAuthor.append(item["cu_author"])
    pubDate.append(item["date"])

# Persist each field list to its own JSON file.
with open('pub_name.json', 'w') as f:
    ujson.dump(pubName, f)
with open('pub_url.json', 'w') as f:
    ujson.dump(pubURL, f)
with open('pub_cu_author.json', 'w') as f:
    ujson.dump(pubCUAuthor, f)
with open('pub_date.json', 'w') as f:
    ujson.dump(pubDate, f)
|
# Reload the publication names from disk.
with open('pub_name.json', 'r') as f:
    publication = f.read()

pubName = ujson.loads(publication)
|
# Fetch the NLTK stop-word list and tokenizer models if they are not already present.
nltk.download('stopwords')
nltk.download('punkt')
|
stop_words = set(stopwords.words('english'))  # set for fast membership tests
stemmer = PorterStemmer()

pub_list_first_stem = []  # stemmed titles (first pass, stop words removed)
pub_list = []             # original titles, in processing order
pub_list_wo_sc = []       # titles with special characters stripped
print(len(pubName))
|
# First pass: tokenize each title, drop stop words, and stem what remains.
for file in pubName:
    words = word_tokenize(file)
    stem_word = ""
    for i in words:
        if i.lower() not in stop_words:
            stem_word += stemmer.stem(i) + " "
    pub_list_first_stem.append(stem_word)
    pub_list.append(file)
|
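# For example, a title like "Deep Learning for Networks" comes out of the pass
# above as roughly "deep learn network " (the sample title is illustrative only).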
# Characters and digits to replace with spaces in the titles.
special_characters = '''!()-–[]{};:'"\\, <>./?@#$%^&*_~0123456789+=“”'''
|
# Second pass: replace special characters with spaces (single-word titles pass through unchanged).
for file in pub_list:
    word_wo_sc = ""
    if len(file.split()) == 1:
        pub_list_wo_sc.append(file)
    else:
        for a in file:
            if a in special_characters:
                word_wo_sc += ' '
            else:
                word_wo_sc += a
        pub_list_wo_sc.append(word_wo_sc)
|
# Third pass: tokenize the cleaned titles, drop stop words, stem, and lowercase.
pub_list_stem_wo_sw = []
for name in pub_list_wo_sc:
    words = word_tokenize(name)
    stem_word = ""
    for a in words:
        if a.lower() not in stop_words:
            stem_word += stemmer.stem(a) + ' '
    pub_list_stem_wo_sw.append(stem_word.lower())
|
# Inverted index: stemmed term -> indices of the titles that contain it.
data_dict = {}
|
for a in range(len(pub_list_stem_wo_sw)):
    for b in pub_list_stem_wo_sw[a].split():
        if b not in data_dict:
            data_dict[b] = [a]
        else:
            data_dict[b].append(a)
|
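# The index now maps each stemmed term to the positions of the titles containing it,
# e.g. something like {"network": [0, 4], "learn": [0, 2]} (values illustrative).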
# Sanity check: all four lists should line up one-to-one with pubName.
print(len(pub_list_wo_sc))
print(len(pub_list_stem_wo_sw))
print(len(pub_list_first_stem))
print(len(pub_list))
|
# Write the stemmed title list and the inverted index to disk.
with open('publication_list_stemmed.json', 'w') as f:
    ujson.dump(pub_list_first_stem, f)

with open('publication_indexed_dictionary.json', 'w') as f:
    ujson.dump(data_dict, f)
|
|
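# A minimal sketch of how the index could be queried once the script has run;
# the query term "network" is a hypothetical example, not part of the pipeline.
query_term = stemmer.stem("network").lower()
matching_titles = [pub_list[i] for i in data_dict.get(query_term, [])]
print(matching_titles[:5])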