# "Spaces: Sleeping" — Hugging Face Spaces status banner captured when this
# file was scraped from the web; it is not part of the program.
import PyPDF2
import yake


def _collapse_pages(page_numbers):
    """Collapse an ascending list of ints into strings like '3' or '3-5'.

    Consecutive numbers (and duplicates) merge into one 'start-end' range;
    returns [] for empty input.
    """
    if not page_numbers:
        return []
    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # Gap found: flush the finished run before starting a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a keyword index ('keyword: page listings') for a PDF.

    Parameters:
        input: file-like object whose .name is the path to the PDF
            (e.g. a Gradio upload).
        attempted_items: how many keywords yake should auto-extract;
            0 or less disables auto-extraction.
        offset: integer added to every 1-based page number in the output.
        custom: ';'-separated groups, each a ','-separated list whose first
            item is the destination label the whole group indexes under.

    Returns:
        A newline-joined report, one 'keyword: pages' line per keyword found,
        followed by a footer counting keywords never found on any page.
    """
    # Step 1: extract every page's text exactly once and keep it in memory
    # (the original re-extracted each page a second time in Step 3).
    # Context managers close the handles the original version leaked.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        pages = [page.extract_text() for page in reader.pages]

    # Preserve the original side effect of dumping the raw text to t.txt.
    with open("t.txt", "w") as dump:
        dump.writelines(pages)

    # writelines() inserts no separators, so the file content equals this
    # join — no need to reopen and re-read the file we just wrote.
    full_text = ''.join(pages)

    # Step 2: build keyword -> destination-label mapping from yake plus
    # any custom groups (custom entries override yake's self-mapping).
    referral_dictionary = {}
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        for kw, _score in kw_extractor.extract_keywords(full_text):
            referral_dictionary[kw] = kw
    if custom:
        groups = custom.split(';')
        if groups[0]:
            for group in groups:
                items = group.split(',')
                if not items:
                    continue
                # First item names the destination; every item in the group
                # (including the destination itself) refers to it.
                destination = items[0]
                for kw in items:
                    referral_dictionary[kw] = destination

    # Step 3: record the (offset-shifted, 1-based) pages mentioning each
    # keyword, grouped under the keyword's destination label.
    output = {}
    for page_number, page_text in enumerate(pages, start=1):
        for kw, destination_key in referral_dictionary.items():
            if kw in page_text:
                output.setdefault(destination_key, []).append(page_number + offset)

    # Step 4: collapse page runs and emit case-insensitively sorted lines.
    clean_output = {kw: _collapse_pages(nums) for kw, nums in output.items()}
    final = []
    removed_count = 0
    for item in sorted(clean_output, key=str.casefold):
        if not clean_output[item]:
            # Defensive: a destination detected but with no pages (should not
            # happen, since entries are only created on a page hit).
            removed_count += 1
            continue
        page_listings = ', '.join(clean_output[item])
        final.append(f'{item}: {page_listings}')
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'