File size: 2,841 Bytes
073a71c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import PyPDF2
import yake

def _parse_custom_groups(custom):
    """Map every alias in a custom-group string to its group's heading.

    ``custom`` is a ';'-separated list of groups; each group is a
    ','-separated list of terms whose first term is the heading that all
    terms of the group are indexed under. Whitespace around terms is
    ignored and empty terms/groups are skipped, so a trailing ';' or ','
    cannot create an empty keyword (which would match every page).
    """
    mapping = {}
    for group in (custom or '').split(';'):
        terms = [term.strip() for term in group.split(',')]
        terms = [term for term in terms if term]
        if not terms:
            continue
        heading = terms[0]
        for term in terms:
            mapping[term] = heading
    return mapping


def _collapse_pages(page_numbers):
    """Collapse a non-empty ascending list of page numbers into range strings.

    Consecutive runs become 'start-end'; isolated pages become 'N'.
    """
    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # Gap found: close the current run and start a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a back-of-book style index for a PDF.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to read.
        attempted_items: Number of keywords to request from YAKE; when it is
            not greater than 0, no automatic keywords are extracted.
        offset: Value added to each 1-based page position to produce the
            page number shown in the index.
        custom: ';'-separated groups of ','-separated terms. Every term in a
            group is listed under the group's first term.

    Returns:
        One ``heading: pages`` line per heading found on at least one page,
        sorted case-insensitively, followed by a summary line reporting how
        many index headings were never found on any page.
    """
    # Step 1: Import. Extract each page's text once and keep it in memory;
    # the context manager guarantees the PDF handle is closed.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Step 2: Process for Keywords.
    # The text is handed to YAKE directly rather than round-tripping through
    # a scratch file: reading a file back through a second handle before the
    # writer is flushed can yield empty or truncated content.
    referral_dictionary = {}
    if attempted_items > 0:
        # Pages are joined with a newline so the last word of one page does
        # not fuse with the first word of the next.
        full_text = '\n'.join(page_texts)
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        for kw in kw_extractor.extract_keywords(full_text):
            referral_dictionary[kw[0]] = kw[0]

    # Custom groups are applied last so they override automatic keywords.
    referral_dictionary.update(_parse_custom_groups(custom))

    # Step 3: Process for Assignment.
    output = {}
    for index, page_text in enumerate(page_texts):
        page_number = index + 1 + offset
        for kw, destination_key in referral_dictionary.items():
            if kw not in page_text:
                continue
            hits = output.setdefault(destination_key, [])
            # Several aliases of one heading may occur on the same page;
            # record that page only once.
            if not hits or hits[-1] != page_number:
                hits.append(page_number)

    # Step 4: Output List.
    final = []
    for heading in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_pages(output[heading]))
        final.append(f'{heading}: {page_listings}')

    # Count headings that never matched any page (previously always 0,
    # because only headings that already had pages were inspected).
    removed_count = len(set(referral_dictionary.values()) - set(output))

    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'