ewgewgewg committed on
Commit
d380e2b
1 Parent(s): 57821f1

Add application

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. README.md +2 -0
  3. app.py +114 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md CHANGED
@@ -10,4 +10,6 @@ pinned: false
10
  license: gpl
11
  ---
12
 
 
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
10
  license: gpl
11
  ---
12
 
13
+ A basic tool made to create back-of-the-book indexes, using yake
14
+
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GNU
2
+ import gradio as gr
3
+ import PyPDF2
4
+ import yake
5
+
6
# Top-level Gradio Blocks container: the UI components are declared inside the
# `with demo:` section further down, and the app is started via demo.launch().
+ demo = gr.Blocks()
7
+
8
def generate(input):
    """Build a back-of-the-book style index for an uploaded PDF.

    The text of every page is extracted with PyPDF2, the 50 top-ranked
    keywords of the whole document are selected with yake, and each keyword
    is mapped to the 1-based pages whose text contains it (case-sensitive
    substring match). Runs of consecutive pages are collapsed into
    ``"start-end"`` ranges.

    Args:
        input: The uploaded file object passed in by the Gradio ``File``
            component. Only its ``name`` attribute (used as the path to
            open) is read. ``None`` means nothing was uploaded.

    Returns:
        ``str()`` of a dict mapping each keyword to a list of page strings
        such as ``['3', '7-9']``. Keywords that match no single page map to
        an empty list. When no file was uploaded, a short prompt is returned
        instead.
    """
    if input is None:
        # Clicking the button without an upload must not crash on `.name`.
        return "Please upload a PDF file first."

    # Step 1: Import
    # Extract every page exactly once; `with` guarantees the PDF handle is
    # closed even if extraction raises.
    with open(input.name, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Step 2: Process for Keywords
    # The text is kept in memory rather than round-tripped through a scratch
    # file: a fixed filename would be shared by concurrent requests, and
    # reading it back before the writer is closed can miss buffered text.
    full_text = "".join(page_texts)
    kw_extractor = yake.KeywordExtractor(top=50)
    keywords = [entry[0] for entry in kw_extractor.extract_keywords(full_text)]

    # Step 3: Process for Assignment
    pages_by_keyword = {kw: [] for kw in keywords}
    for page_number, text in enumerate(page_texts, start=1):
        for kw, hits in pages_by_keyword.items():
            if kw in text:
                hits.append(page_number)

    # Step 4: Output List
    clean_output = {}
    for kw, hits in pages_by_keyword.items():
        if not hits:
            # A keyword can span a page break in the joined text and so
            # appear on no individual page.
            print('detected but no pages!:', kw, hits)
        clean_output[kw] = _collapse_page_runs(hits)

    return str(clean_output)


def _collapse_page_runs(page_numbers):
    """Collapse ascending page numbers into '3' / '7-9' style strings."""
    runs = []
    start = end = None
    for num in page_numbers:
        if start is None:
            start = end = num
        elif num > end + 1:
            # Gap found: close the current run and open a new one.
            runs.append(str(start) if start == end else f'{start}-{end}')
            start = end = num
        else:
            end = num
    if start is not None:
        runs.append(str(start) if start == end else f'{start}-{end}')
    return runs
81
+
82
+
83
# Declare the page layout and wire the button to generate().
# NOTE(review): nesting below is reconstructed from a flattened diff view;
# confirm the column boundaries against the original app.py.
with demo:
    gr.Markdown("# PDF to Index")

    # Input section: a single PDF upload.
    with gr.Column():
        gr.Markdown("## Load Inputs")
        uploaded_file = gr.File(label="Upload a PDF file", file_count="single", type="file")
        gr.Markdown("---")

    # Output section: trigger button, explanatory note and the raw index text.
    with gr.Column():
        gr.Markdown("## Index From PDF")
        convert_button = gr.Button(
            "Index From PDF!",
            variant="primary",
        )
        out_placeholder = gr.HTML(
            "<p><em>Output will appear below, with PyPDF2 for preprocessing and yake for processing:</em></p>"
        )
        gr.Markdown("### Raw Index")
        index = gr.Textbox(label="Raw Index", placeholder="The index will appear here")

    # Clicking the button runs generate() on the upload and shows the result
    # in the "Raw Index" textbox.
    convert_button.click(fn=generate, inputs=[uploaded_file], outputs=[index])

demo.launch()