Spaces:
Sleeping
Sleeping
Add application
Browse files- .gitignore +1 -0
- README.md +2 -0
- app.py +114 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
README.md
CHANGED
@@ -10,4 +10,6 @@ pinned: false
|
|
10 |
license: gpl
|
11 |
---
|
12 |
|
|
|
|
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
10 |
license: gpl
|
11 |
---
|
12 |
|
13 |
+
A basic tool for creating back-of-the-book indexes from PDF files, using yake for keyword extraction.
|
14 |
+
|
15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GNU
|
2 |
+
import gradio as gr
|
3 |
+
import PyPDF2
|
4 |
+
import yake
|
5 |
+
|
6 |
+
# Top-level Gradio Blocks container; the UI components and the click handler
# are registered on it inside the `with demo:` section further down.
demo = gr.Blocks()
|
7 |
+
|
8 |
+
def generate(input):
    """Build a raw back-of-the-book index for an uploaded PDF.

    The PDF is read page by page, the 50 top-ranked YAKE keywords are
    extracted from the combined text, and each keyword is mapped to the
    1-based page numbers whose text contains it (case-sensitive substring
    match). Runs of consecutive pages are collapsed into ``'start-end'``.

    Args:
        input: Uploaded-file object; only its ``name`` attribute is read and
            it is used as the path of the PDF to open. (The parameter name
            shadows the builtin but is kept so existing callers still work.)

    Returns:
        ``str()`` of a dict mapping each keyword to a list of page-range
        strings, e.g. ``"{'index': ['1-3', '7']}"``. A keyword that was
        extracted but not found on any single page maps to an empty list.
    """
    # Step 1: Import
    # The context manager closes the PDF handle even if extraction fails.
    with open(input.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract every page exactly once and reuse the text below.
        # Defensive: treat a missing extraction result as empty text.
        pages = [page.extract_text() or "" for page in pdf_reader.pages]

    # Step 2: Process for Keywords
    # Keep the text in memory instead of round-tripping through a shared
    # "t.txt": the old write handle was never flushed before the file was
    # re-read, and a fixed file name is unsafe with concurrent requests.
    # Pages are joined with a newline so the last word of one page is not
    # fused with the first word of the next.
    full_text = "\n".join(pages)

    kw_extractor = yake.KeywordExtractor(top=50)
    keywords = kw_extractor.extract_keywords(full_text)

    # Each result's first element is the keyword; drop repeats while keeping
    # the extractor's order so no page is recorded twice for one keyword.
    kw_list = list(dict.fromkeys(kw[0] for kw in keywords))

    # Step 3: Process for Assignment
    output = {kw: [] for kw in kw_list}
    for page_number, page_text in enumerate(pages, start=1):
        for kw in kw_list:
            if kw in page_text:
                output[kw].append(page_number)

    # Step 4: Output List
    clean_output = {}
    for kw, page_numbers in output.items():
        if not page_numbers:
            print('detected but no pages!:', kw, page_numbers)
        clean_output[kw] = _collapse_page_ranges(page_numbers)

    return str(clean_output)


def _collapse_page_ranges(page_numbers):
    """Collapse ascending page numbers into 'n' / 'start-end' strings."""
    ranges = []
    if not page_numbers:
        return ranges

    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # A gap closes the current run; emit it and start a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num

    # Emit the final run, which the loop never closes on its own.
    ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges
|
81 |
+
|
82 |
+
|
83 |
+
# Build the page: an upload column, a separator, and a results column whose
# button feeds the uploaded file to `generate` and shows the returned string.
with demo:
    gr.Markdown("# PDF to Index")

    # --- Input side -------------------------------------------------------
    with gr.Column():
        gr.Markdown("## Load Inputs")
        uploaded_file = gr.File(
            label="Upload a PDF file",
            file_count="single",
            type="file",
        )

    gr.Markdown("---")

    # --- Output side ------------------------------------------------------
    with gr.Column():
        gr.Markdown("## Index From PDF")
        convert_button = gr.Button("Index From PDF!", variant="primary")
        out_placeholder = gr.HTML(
            "<p><em>Output will appear below, with PyPDF2 for preprocessing and yake for processing:</em></p>"
        )
        gr.Markdown("### Raw Index")
        index = gr.Textbox(
            label="Raw Index",
            placeholder="The index will appear here",
        )

    # Event wiring: clicking the button runs `generate` on the uploaded file
    # and writes its return value into the "Raw Index" textbox.
    convert_button.click(fn=generate, inputs=[uploaded_file], outputs=[index])

# Start the Gradio app.
demo.launch()
|