Spaces:
Sleeping
Sleeping
File size: 2,841 Bytes
073a71c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import PyPDF2
import yake
def _parse_custom_groups(custom):
    """Turn a ``label,alias,...;label,alias,...`` spec into {keyword: label}.

    Groups are separated by ``;`` and items within a group by ``,``. The
    first item of a group is the label its pages are listed under, and it is
    also searched for as a keyword itself. Surrounding whitespace is stripped
    and empty items are dropped: an empty keyword is a substring of every
    page, so keeping it would produce a bogus entry covering the whole
    document (e.g. after a trailing ``;``).
    """
    mapping = {}
    if not custom:
        return mapping
    for group in custom.split(';'):
        items = [item.strip() for item in group.split(',')]
        items = [item for item in items if item]
        if not items:
            continue
        label = items[0]
        for keyword in items:
            mapping[keyword] = label
    return mapping


def _collapse_page_runs(page_numbers):
    """Render ascending page numbers as strings, folding runs into 'a-b'.

    ``[1, 2, 3, 7]`` becomes ``['1-3', '7']``. Repeated numbers are absorbed
    into the current run. An empty input yields an empty list.
    """
    if not page_numbers:
        return []
    runs = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # A gap closes the current run and starts a new one.
            runs.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    runs.append(str(start) if start == end else f'{start}-{end}')
    return runs


def generate(input, attempted_items, offset, custom):
    """Build a text index (keyword -> page ranges) for a PDF.

    The text of every page is extracted, keywords are collected (YAKE
    suggestions plus user-defined groups), and each keyword is located on the
    pages with a case-sensitive substring match.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF file.
        attempted_items: Number of keywords to request from YAKE. Automatic
            extraction is skipped when this is not greater than 0.
        offset: Value added to every 1-based page position, e.g. to account
            for front matter that is not numbered.
        custom: User-defined keyword groups, ``;`` between groups and ``,``
            between items. The first item of a group is the label under
            which the pages of every item in that group are listed. A custom
            keyword that equals a YAKE keyword redirects it to the group's
            label. May be empty.

    Returns:
        One ``label: pages`` line per index entry, sorted case-insensitively,
        followed by a summary line giving the number of labels that were not
        found on any page.
    """
    # Step 1: Import
    # Extract each page once and keep the text in memory. The context manager
    # guarantees the PDF handle is released even if extraction fails.
    with open(input.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in pdf_reader.pages]

    # Step 2: Process for Keywords
    referral_dictionary = {}
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        # Pages are concatenated without a separator, matching what the
        # extractor has always been fed.
        for kw in kw_extractor.extract_keywords(''.join(page_texts)):
            referral_dictionary[kw[0]] = kw[0]
    # Custom groups are applied last so they can override YAKE's own labels.
    referral_dictionary.update(_parse_custom_groups(custom))

    # Step 3: Process for Assignment
    output = {}
    for page_index, page_text in enumerate(page_texts):
        page_number = page_index + 1 + offset
        for keyword, label in referral_dictionary.items():
            if keyword in page_text:
                found_on = output.setdefault(label, [])
                # Several aliases of one label may hit the same page.
                if not found_on or found_on[-1] != page_number:
                    found_on.append(page_number)

    # Step 4: Output List
    final = []
    for label in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_page_runs(output[label]))
        final.append(f'{label}: {page_listings}')
    # Labels that never matched a page are left out of the index; report how
    # many were dropped.
    removed_count = len(set(referral_dictionary.values()) - set(output))
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'
|