Spaces:
Sleeping
Sleeping
add custom insert syntax
Browse files
- .gitignore +2 -1
- app.py +3 -97
- generate.py +105 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
.env
|
|
|
|
1 |
+
.env
|
2 |
+
__pycache__
|
app.py
CHANGED
@@ -1,110 +1,16 @@
|
|
1 |
# GNU
|
2 |
import gradio as gr
|
3 |
-
import PyPDF2
|
4 |
-
import yake
|
5 |
|
6 |
demo = gr.Blocks()
|
7 |
|
8 |
-
def generate(input, attempted_items, offset, custom):
|
9 |
-
|
10 |
-
# Step 1: Import
|
11 |
-
|
12 |
-
pdfFileObj = open(input.name, 'rb')
|
13 |
-
|
14 |
-
pdfReader = PyPDF2.PdfReader(pdfFileObj)
|
15 |
-
|
16 |
-
length = len(pdfReader.pages)
|
17 |
-
|
18 |
-
f = open("t.txt", "w")
|
19 |
-
|
20 |
-
pages = []
|
21 |
-
|
22 |
-
for x in range(length):
|
23 |
-
pages.append(pdfReader.pages[x].extract_text())
|
24 |
-
|
25 |
-
f.writelines(pages)
|
26 |
-
|
27 |
-
# Step 2: Process for Keywords
|
28 |
-
r = open("t.txt", "r")
|
29 |
-
read = r.read()
|
30 |
-
|
31 |
-
kw_extractor = yake.KeywordExtractor(top = attempted_items)
|
32 |
-
keywords = kw_extractor.extract_keywords(read)
|
33 |
-
|
34 |
-
kw_list = []
|
35 |
-
for kw in keywords:
|
36 |
-
kw_list.append(kw[0])
|
37 |
-
|
38 |
-
if(len(custom)):
|
39 |
-
split_custom = custom.split(';')
|
40 |
-
for kw in split_custom:
|
41 |
-
kw_list.append(kw)
|
42 |
-
|
43 |
-
# Step 3: Process for Assignment
|
44 |
-
output = {}
|
45 |
-
for kw in kw_list:
|
46 |
-
output[kw] = []
|
47 |
-
|
48 |
-
for x in range(length):
|
49 |
-
pageText = pdfReader.pages[x].extract_text()
|
50 |
-
|
51 |
-
for kw in kw_list:
|
52 |
-
if pageText.find(kw) != -1:
|
53 |
-
output[kw].append(x+1+offset)
|
54 |
-
|
55 |
-
# Step 4: Output List
|
56 |
-
|
57 |
-
clean_output = {}
|
58 |
-
sortable = []
|
59 |
-
for kw in kw_list:
|
60 |
-
clean_output[kw] = []
|
61 |
-
sortable.append(kw)
|
62 |
-
|
63 |
-
for kw in output:
|
64 |
-
clean_pages = []
|
65 |
-
if (len(output[kw]) == 0):
|
66 |
-
print('detected but no pages!:', kw, output[kw])
|
67 |
-
continue
|
68 |
-
start = output[kw][0]
|
69 |
-
end = output[kw][0]
|
70 |
-
|
71 |
-
for num in output[kw]:
|
72 |
-
if num > end + 1:
|
73 |
-
if start == end:
|
74 |
-
clean_pages.append(str(start))
|
75 |
-
else:
|
76 |
-
clean_pages.append(f'{start}-{end}')
|
77 |
-
start = num
|
78 |
-
end = num
|
79 |
-
|
80 |
-
if start == end:
|
81 |
-
clean_pages.append(str(start))
|
82 |
-
else:
|
83 |
-
clean_pages.append(f'{start}-{end}')
|
84 |
-
|
85 |
-
clean_output[kw] = clean_pages
|
86 |
-
|
87 |
-
sortable.sort(key=str.casefold)
|
88 |
-
final = []
|
89 |
-
removed_count = 0
|
90 |
-
|
91 |
-
for item in sortable:
|
92 |
-
if (not clean_output[item]):
|
93 |
-
removed_count += 1
|
94 |
-
continue
|
95 |
-
|
96 |
-
page_listings = ', '.join(clean_output[item])
|
97 |
-
final.append(f'{item}: {page_listings}')
|
98 |
-
|
99 |
-
return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'
|
100 |
-
|
101 |
def attempted_items_changer(attempted_items_input):
|
102 |
if (not attempted_items_input.isdigit()):
|
103 |
return {
|
104 |
attempted_items: 50
|
105 |
}
|
106 |
return {
|
107 |
-
attempted_items: int(attempted_items_input)
|
108 |
}
|
109 |
|
110 |
def offset_changer(offset_input):
|
@@ -151,7 +57,7 @@ with demo:
|
|
151 |
custom_input = gr.Textbox(value="", show_label=True, label="Custom")
|
152 |
custom_input.change(custom_changer, [custom_input], [custom])
|
153 |
|
154 |
-
gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index.
|
155 |
|
156 |
|
157 |
gr.Markdown("---")
|
|
|
1 |
# GNU
|
2 |
import gradio as gr
|
3 |
+
from generate import generate
|
|
|
4 |
|
5 |
demo = gr.Blocks()
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
def attempted_items_changer(attempted_items_input):
    """Translate the raw "attempted items" text into an update for `attempted_items`.

    Args:
        attempted_items_input: Text typed by the user (a ``str``).

    Returns:
        A single-entry dict keyed by the module-level ``attempted_items``
        object (defined outside this function -- presumably a Gradio state or
        component; confirm against the ``with demo:`` block). The value is the
        parsed non-negative integer, or ``50`` when the text is not a plain
        decimal number.
    """
    # str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
    # characters such as superscript digits ("²") that int() cannot parse,
    # which would raise ValueError instead of falling back to the default.
    if not attempted_items_input.isdecimal():
        return {
            attempted_items: 50
        }
    return {
        # max(..., 0) keeps the value non-negative, mirroring the guard in
        # generate() that skips keyword extraction when the count is 0.
        attempted_items: max(int(attempted_items_input), 0)
    }
|
15 |
|
16 |
def offset_changer(offset_input):
|
|
|
57 |
custom_input = gr.Textbox(value="", show_label=True, label="Custom")
|
58 |
custom_input.change(custom_changer, [custom_input], [custom])
|
59 |
|
60 |
+
gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index. Optionally, you can comma-separate terms between semicolons if you want multiple terms to contribute to a single index entry -- the first term will be the label for the index entry. If Custom does not produce expected values, you may be clicking too quickly -- please adjust the field, wait, and try again.</em></p>")
|
61 |
|
62 |
|
63 |
gr.Markdown("---")
|
generate.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
|
2 |
+
import yake
|
3 |
+
|
4 |
+
def generate(input, attempted_items, offset, custom):
    """Build a plain-text index (term -> page ranges) for a PDF.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to read.
        attempted_items: Number of keywords to request from YAKE (passed as
            ``top``). Extraction is skipped entirely when this is not > 0.
        offset: Integer added to every 1-based page number in the output.
        custom: User-supplied terms. Groups are separated by ``;``. Inside a
            group, terms are separated by ``,`` and the first term is the label
            under which every term of the group is indexed.

    Returns:
        One ``label: pages`` line per index entry, sorted case-insensitively,
        followed by a line reporting how many candidate labels were not found
        on any page.
    """
    # Step 1: Import
    # Extract every page's text exactly once, inside a context manager so the
    # PDF handle is always closed.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or '' for page in reader.pages]

    # Step 2: Process for Keywords
    # The text is concatenated in memory rather than round-tripped through a
    # scratch file: reading that file back before the writer was flushed could
    # return truncated text, and a shared file name is unsafe when several
    # requests run at once. Pages are joined without a separator, matching
    # what the writelines()/read() round trip produced.
    full_text = ''.join(page_texts)

    keywords = []
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        keywords = kw_extractor.extract_keywords(full_text)

    # Maps each search term to the label it is reported under. The first
    # element of each YAKE result is used as the keyword, and extracted
    # keywords refer to themselves. Custom groups are applied afterwards so
    # they override an identical extracted keyword.
    referral_dictionary = {kw[0]: kw[0] for kw in keywords}
    referral_dictionary.update(_parse_custom_terms(custom))

    # Step 3: Process for Assignment
    output = {}
    for page_index, page_text in enumerate(page_texts):
        page_number = page_index + 1 + offset
        for term, label in referral_dictionary.items():
            if term in page_text:
                pages = output.setdefault(label, [])
                # Several terms of one group may hit the same page; pages are
                # visited in ascending order, so checking the tail is enough
                # to avoid duplicates.
                if not pages or pages[-1] != page_number:
                    pages.append(page_number)

    # Step 4: Output List
    # Count candidate labels that never matched a page. Counting only the
    # labels already present in `output` would always report 0.
    candidate_labels = set(referral_dictionary.values())
    removed_count = sum(1 for label in candidate_labels if label not in output)

    final = []
    for label in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_page_ranges(output[label]))
        final.append(f'{label}: {page_listings}')

    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'


def _parse_custom_terms(custom):
    """Parse ``"label,alias;other"`` into a ``{term: label}`` mapping."""
    referrals = {}
    if not custom:
        return referrals
    for group in custom.split(';'):
        # Strip whitespace and drop empty terms: an empty string is a
        # substring of every page and would create a bogus entry spanning the
        # whole document (e.g. after a trailing ';').
        terms = [term.strip() for term in group.split(',')]
        terms = [term for term in terms if term]
        if not terms:
            continue
        label = terms[0]
        for term in terms:
            referrals[term] = label
    return referrals


def _collapse_page_ranges(pages):
    """Turn ascending page numbers into strings such as ``['1-3', '7']``."""
    if not pages:
        return []
    ranges = []
    start = end = pages[0]
    for num in pages[1:]:
        if num > end + 1:
            # A gap closes the current run and starts a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges
|