import PyPDF2
import yake


def _parse_custom_groups(custom):
    """Map every term of each custom group to that group's first term.

    ``custom`` is a string of ``;``-separated groups, each group a
    ``,``-separated list of terms. Surrounding whitespace is stripped and
    empty terms/groups are skipped: an empty search term would otherwise
    match every page.
    """
    mapping = {}
    if not custom:
        return mapping
    for group in custom.split(';'):
        terms = [term.strip() for term in group.split(',')]
        terms = [term for term in terms if term]
        if not terms:
            continue
        destination = terms[0]
        for term in terms:
            mapping[term] = destination
    return mapping


def _collapse_page_ranges(page_numbers):
    """Collapse non-decreasing page numbers into ``'a'`` / ``'a-b'`` strings.

    Repeated numbers are tolerated (several terms of one group may hit the
    same page) and simply extend the current run.
    """
    ranges = []
    start = end = None
    for num in page_numbers:
        if start is None:
            start = end = num
        elif num > end + 1:
            # Gap found: close the current run and open a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = end = num
        else:
            end = num
    if start is not None:
        ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a text index of terms and the PDF pages they appear on.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to
            read (the parameter name shadows the builtin but is kept so
            existing keyword callers continue to work).
        attempted_items: Number of keywords to request from YAKE; automatic
            keyword extraction is skipped unless this is greater than 0.
        offset: Value added to each 1-based page index before it is listed.
        custom: ``;``-separated groups of ``,``-separated terms. Every term
            in a group is indexed under the group's first term. May be
            empty or ``None``.

    Returns:
        One ``"<entry>: <pages>"`` line per entry that was found, sorted
        case-insensitively, followed by a summary line counting the
        entries that were not found on any page.
    """
    # Step 1: Import
    # Extract each page once, inside the ``with`` so the reader can still
    # access the file and the handle is always closed afterwards.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Step 2: Process for Keywords
    # Keep the text in memory rather than round-tripping through a scratch
    # file, which risks reading back data that has not been flushed yet.
    # Pages are joined with a newline so words on either side of a page
    # break are not fused together.
    referral_dictionary = {}
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        for kw in kw_extractor.extract_keywords('\n'.join(page_texts)):
            referral_dictionary[kw[0]] = kw[0]
    # Custom groups are applied last so they override extracted keywords.
    referral_dictionary.update(_parse_custom_groups(custom))

    # Step 3: Process for Assignment
    output = {}
    for index, page_text in enumerate(page_texts):
        page_number = index + 1 + offset
        for term, destination_key in referral_dictionary.items():
            if term in page_text:
                output.setdefault(destination_key, []).append(page_number)

    # Step 4: Output List
    final = []
    for entry in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_page_ranges(output[entry]))
        final.append(f'{entry}: {page_listings}')

    # Entries that were requested (extracted or custom) but never matched.
    removed_count = sum(
        1
        for destination_key in set(referral_dictionary.values())
        if destination_key not in output
    )
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'