Spaces:

ewgewgewg
/

IndexingAlpha

Sleeping

File size: 2,841 Bytes

073a71c

import PyPDF2
import yake

def _parse_custom_groups(custom):
    """Map each user-supplied keyword to the heading it is listed under.

    ``custom`` holds groups separated by ``;``. Each group is a
    comma-separated list of keywords, and the first keyword of a group is the
    heading used for every keyword in that group (``"Cat,Feline;Dog"`` lists
    pages mentioning "Cat" or "Feline" under "Cat").
    """
    mapping = {}
    if not custom:
        return mapping

    for group in custom.split(';'):
        # Blank entries (from "a,,b", a trailing ";" or stray spaces) are
        # dropped: an empty string is a substring of every page, so it would
        # otherwise be indexed on all of them.
        items = [item.strip() for item in group.split(',')]
        items = [item for item in items if item]
        if not items:
            continue
        destination = items[0]
        for item in items:
            mapping[item] = destination
    return mapping


def _format_page_ranges(page_numbers):
    """Collapse ascending page numbers into strings like ``'3'`` or ``'5-7'``.

    ``page_numbers`` must be non-empty and in ascending order; repeated
    numbers are tolerated and folded into the surrounding range.
    """
    def label(first, last):
        return str(first) if first == last else f'{first}-{last}'

    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # A gap closes the current run and starts a new one.
            ranges.append(label(start, end))
            start = num
        end = num
    ranges.append(label(start, end))
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a keyword index (keyword -> page ranges) for a PDF.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to read.
        attempted_items: Number of keywords to request from YAKE. Automatic
            extraction is skipped unless this is greater than zero.
        offset: Amount added to each 1-based page position before listing it.
        custom: ``;``-separated groups of ``,``-separated keywords. The first
            keyword of each group is the heading for the whole group. May be
            empty.

    Returns:
        A string with one ``heading: pages`` line per heading found on at
        least one page, sorted case-insensitively, followed by a line
        reporting how many headings were not found on any page.
    """
    # Step 1: Import
    # The context manager closes the PDF handle even if extraction fails.
    with open(input.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract once and reuse for both keyword discovery and page lookup.
        # `or ''` guards against an extractor returning None for a page.
        pages = [page.extract_text() or '' for page in pdf_reader.pages]

    # Step 2: Process for Keywords
    # Join in memory instead of round-tripping through a scratch file: reading
    # the file back before the write handle was flushed or closed could hand
    # the extractor truncated or empty text.
    full_text = ''.join(pages)

    keywords = []
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        keywords = kw_extractor.extract_keywords(full_text)

    # Extracted keywords are their own heading; custom groups may override.
    referral_dictionary = {kw[0]: kw[0] for kw in keywords}
    referral_dictionary.update(_parse_custom_groups(custom))

    # Step 3: Process for Assignment
    output = {}
    for index, page_text in enumerate(pages):
        page_number = index + 1 + offset
        for kw, destination_key in referral_dictionary.items():
            if kw not in page_text:
                continue
            found_on = output.setdefault(destination_key, [])
            # Several aliases of one heading can hit the same page.
            if not found_on or found_on[-1] != page_number:
                found_on.append(page_number)

    # Step 4: Output List
    final = []
    for heading in sorted(output, key=str.casefold):
        page_listings = ', '.join(_format_page_ranges(output[heading]))
        final.append(f'{heading}: {page_listings}')

    # Headings that never matched a page (for example a keyword that only
    # exists across a page boundary in the joined text) are counted here.
    # `output` only ever contains headings with at least one page, so they
    # cannot be detected by looking for empty page lists.
    missing = {dest for dest in referral_dictionary.values() if dest not in output}
    removed_count = len(missing)

    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'