# "Spaces: Sleeping" — Hugging Face Spaces status banner captured when this
# file was scraped from the web; it is not part of the program.
import PyPDF2
import yake


def _collapse_pages(page_numbers):
    """Collapse an ascending list of ints into strings like '3' or '3-5'.

    Consecutive numbers (and duplicates) merge into one 'start-end' range;
    returns [] for empty input.
    """
    if not page_numbers:
        return []
    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # Gap found: flush the finished run before starting a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a keyword index ('keyword: page listings') for a PDF.

    Parameters:
        input: file-like object whose .name is the path to the PDF
            (e.g. a Gradio upload).
        attempted_items: how many keywords yake should auto-extract;
            0 or less disables auto-extraction.
        offset: integer added to every 1-based page number in the output.
        custom: ';'-separated groups, each a ','-separated list whose first
            item is the destination label the whole group indexes under.

    Returns:
        A newline-joined report, one 'keyword: pages' line per keyword found,
        followed by a footer counting keywords never found on any page.
    """
    # Step 1: extract every page's text exactly once and keep it in memory
    # (the original re-extracted each page a second time in Step 3).
    # Context managers close the handles the original version leaked.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        pages = [page.extract_text() for page in reader.pages]

    # Preserve the original side effect of dumping the raw text to t.txt.
    with open("t.txt", "w") as dump:
        dump.writelines(pages)

    # writelines() inserts no separators, so the file content equals this
    # join — no need to reopen and re-read the file we just wrote.
    full_text = ''.join(pages)

    # Step 2: build keyword -> destination-label mapping from yake plus
    # any custom groups (custom entries override yake's self-mapping).
    referral_dictionary = {}
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        for kw, _score in kw_extractor.extract_keywords(full_text):
            referral_dictionary[kw] = kw
    if custom:
        groups = custom.split(';')
        if groups[0]:
            for group in groups:
                items = group.split(',')
                if not items:
                    continue
                # First item names the destination; every item in the group
                # (including the destination itself) refers to it.
                destination = items[0]
                for kw in items:
                    referral_dictionary[kw] = destination

    # Step 3: record the (offset-shifted, 1-based) pages mentioning each
    # keyword, grouped under the keyword's destination label.
    output = {}
    for page_number, page_text in enumerate(pages, start=1):
        for kw, destination_key in referral_dictionary.items():
            if kw in page_text:
                output.setdefault(destination_key, []).append(page_number + offset)

    # Step 4: collapse page runs and emit case-insensitively sorted lines.
    clean_output = {kw: _collapse_pages(nums) for kw, nums in output.items()}
    final = []
    removed_count = 0
    for item in sorted(clean_output, key=str.casefold):
        if not clean_output[item]:
            # Defensive: a destination detected but with no pages (should not
            # happen, since entries are only created on a page hit).
            removed_count += 1
            continue
        page_listings = ', '.join(clean_output[item])
        final.append(f'{item}: {page_listings}')
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'