# IndexingAlpha / generate.py
# ewgewgewg: add custom insert syntax (073a71c)
import PyPDF2
import yake
def _extract_page_texts(pdf_path):
    """Return the extracted text of every page of the PDF at *pdf_path*, in page order."""
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open; the reader pulls page data
        # from the stream on demand.  `or ''` keeps later substring checks
        # safe should a page yield no text at all.
        return [page.extract_text() or '' for page in reader.pages]


def _build_referrals(keywords, custom):
    """Map every search term to the index entry its page hits are filed under.

    Extracted keywords map to themselves.  Each ';'-separated group in
    *custom* is a ','-separated list whose first item is the entry name and
    whose items (the first included) are all search terms for that entry.
    """
    referrals = {kw[0]: kw[0] for kw in keywords}
    for group in (custom or '').split(';'):
        terms = [term.strip() for term in group.split(',')]
        # Drop empty items: an empty search term is a substring of every
        # page and would produce a bogus nameless entry covering the whole
        # document (e.g. after a trailing ';' or ',').
        terms = [term for term in terms if term]
        if not terms:
            continue
        destination = terms[0]
        for term in terms:
            referrals[term] = destination
    return referrals


def _collapse_pages(page_numbers):
    """Collapse a non-empty ascending list of page numbers into range strings.

    Runs of consecutive pages become 'start-end'; isolated pages become
    'n'.  Repeated numbers are absorbed into the current run.
    """
    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # Gap found: close the current run and start a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a back-of-book style index for a PDF.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to read.
        attempted_items: Number of keywords to request from YAKE; no
            automatic keywords are extracted when this is not positive.
        offset: Value added to each 1-based page position to obtain the
            page number that is printed.
        custom: Custom entries as ``"Entry,alias,alias;Entry2,alias"``.
            Page hits for any item of a group are listed under the group's
            first item.  Surrounding whitespace and empty items are
            ignored.  May be empty or ``None``.

    Returns:
        One ``"entry: pages"`` line per entry found on at least one page,
        sorted case-insensitively, followed by a line reporting how many
        entries were not found on any page.
    """
    # Step 1: Import
    # Page text is kept in memory and reused below.  The previous scratch
    # file was read back before it had been flushed, so the keyword
    # extractor could see truncated text, and no file handle was closed.
    page_texts = _extract_page_texts(input.name)

    # Step 2: Process for Keywords
    keywords = []
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        # Join with a newline so the last word of one page is not fused
        # with the first word of the next.
        keywords = kw_extractor.extract_keywords('\n'.join(page_texts))
    referral_dictionary = _build_referrals(keywords, custom)

    # Step 3: Process for Assignment
    output = {}
    for position, page_text in enumerate(page_texts):
        page_number = position + 1 + offset
        for term, destination_key in referral_dictionary.items():
            if term in page_text:
                hits = output.setdefault(destination_key, [])
                # Several aliases of one entry may hit the same page.
                if not hits or hits[-1] != page_number:
                    hits.append(page_number)

    # Step 4: Output List
    final = []
    for item in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_pages(output[item]))
        final.append(f'{item}: {page_listings}')

    # Entries that never matched a page are absent from `output`; count
    # them here so the summary line reports a real number.
    removed_count = sum(
        1 for destination in set(referral_dictionary.values())
        if destination not in output
    )
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'