ewgewgewg committed on
Commit
073a71c
1 Parent(s): c6fbe5d

add custom insert syntax

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. app.py +3 -97
  3. generate.py +105 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
- .env
 
 
1
+ .env
2
+ __pycache__
app.py CHANGED
@@ -1,110 +1,16 @@
1
  # GNU
2
  import gradio as gr
3
- import PyPDF2
4
- import yake
5
 
6
  demo = gr.Blocks()
7
 
8
- def generate(input, attempted_items, offset, custom):
9
-
10
- # Step 1: Import
11
-
12
- pdfFileObj = open(input.name, 'rb')
13
-
14
- pdfReader = PyPDF2.PdfReader(pdfFileObj)
15
-
16
- length = len(pdfReader.pages)
17
-
18
- f = open("t.txt", "w")
19
-
20
- pages = []
21
-
22
- for x in range(length):
23
- pages.append(pdfReader.pages[x].extract_text())
24
-
25
- f.writelines(pages)
26
-
27
- # Step 2: Process for Keywords
28
- r = open("t.txt", "r")
29
- read = r.read()
30
-
31
- kw_extractor = yake.KeywordExtractor(top = attempted_items)
32
- keywords = kw_extractor.extract_keywords(read)
33
-
34
- kw_list = []
35
- for kw in keywords:
36
- kw_list.append(kw[0])
37
-
38
- if(len(custom)):
39
- split_custom = custom.split(';')
40
- for kw in split_custom:
41
- kw_list.append(kw)
42
-
43
- # Step 3: Process for Assignment
44
- output = {}
45
- for kw in kw_list:
46
- output[kw] = []
47
-
48
- for x in range(length):
49
- pageText = pdfReader.pages[x].extract_text()
50
-
51
- for kw in kw_list:
52
- if pageText.find(kw) != -1:
53
- output[kw].append(x+1+offset)
54
-
55
- # Step 4: Output List
56
-
57
- clean_output = {}
58
- sortable = []
59
- for kw in kw_list:
60
- clean_output[kw] = []
61
- sortable.append(kw)
62
-
63
- for kw in output:
64
- clean_pages = []
65
- if (len(output[kw]) == 0):
66
- print('detected but no pages!:', kw, output[kw])
67
- continue
68
- start = output[kw][0]
69
- end = output[kw][0]
70
-
71
- for num in output[kw]:
72
- if num > end + 1:
73
- if start == end:
74
- clean_pages.append(str(start))
75
- else:
76
- clean_pages.append(f'{start}-{end}')
77
- start = num
78
- end = num
79
-
80
- if start == end:
81
- clean_pages.append(str(start))
82
- else:
83
- clean_pages.append(f'{start}-{end}')
84
-
85
- clean_output[kw] = clean_pages
86
-
87
- sortable.sort(key=str.casefold)
88
- final = []
89
- removed_count = 0
90
-
91
- for item in sortable:
92
- if (not clean_output[item]):
93
- removed_count += 1
94
- continue
95
-
96
- page_listings = ', '.join(clean_output[item])
97
- final.append(f'{item}: {page_listings}')
98
-
99
- return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'
100
-
101
  def attempted_items_changer(attempted_items_input):
102
  if (not attempted_items_input.isdigit()):
103
  return {
104
  attempted_items: 50
105
  }
106
  return {
107
- attempted_items: int(attempted_items_input)
108
  }
109
 
110
  def offset_changer(offset_input):
@@ -151,7 +57,7 @@ with demo:
151
  custom_input = gr.Textbox(value="", show_label=True, label="Custom")
152
  custom_input.change(custom_changer, [custom_input], [custom])
153
 
154
- gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index. If the fields do not produce expected values, you may be clicking too quickly -- please adjust the field, wait, and try again.</em></p>")
155
 
156
 
157
  gr.Markdown("---")
 
1
  # GNU
2
  import gradio as gr
3
+ from generate import generate
 
4
 
5
  demo = gr.Blocks()
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def attempted_items_changer(attempted_items_input):
8
  if (not attempted_items_input.isdigit()):
9
  return {
10
  attempted_items: 50
11
  }
12
  return {
13
+ attempted_items: max(int(attempted_items_input), 0)
14
  }
15
 
16
  def offset_changer(offset_input):
 
57
  custom_input = gr.Textbox(value="", show_label=True, label="Custom")
58
  custom_input.change(custom_changer, [custom_input], [custom])
59
 
60
+ gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index. Optionally, you can comma-separate terms between semicolons if you want multiple terms to contribute to a single index entry -- the first term will be the label for the index entry. If Custom does not produce expected values, you may be clicking too quickly -- please adjust the field, wait, and try again.</em></p>")
61
 
62
 
63
  gr.Markdown("---")
generate.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import yake
3
+
4
def _parse_custom_terms(custom):
    """Map each user-supplied search term to the index label it feeds.

    ``custom`` is a semicolon-separated list of groups. Each group is a
    comma-separated list of terms, and the first term of a group is the
    label under which every term of that group is indexed.

    Surrounding whitespace is ignored and empty terms are dropped: an empty
    string is "found" on every page, so a stray ``;`` or ``,`` would
    otherwise create a blank entry listing the whole document.
    """
    referrals = {}
    for group in (custom or '').split(';'):
        terms = [term.strip() for term in group.split(',')]
        terms = [term for term in terms if term]
        if not terms:
            continue
        label = terms[0]
        for term in terms:
            referrals[term] = label
    return referrals


def _collapse_page_ranges(page_numbers):
    """Collapse ascending page numbers into strings such as '3' or '5-7'."""
    ranges = []
    start = end = None
    for num in page_numbers:
        if start is None:
            start = end = num
        elif num > end + 1:
            # A gap closes the current run; emit it and start a new one.
            ranges.append(str(start) if start == end else f'{start}-{end}')
            start = end = num
        else:
            end = num
    if start is not None:
        ranges.append(str(start) if start == end else f'{start}-{end}')
    return ranges


def generate(input, attempted_items, offset, custom):
    """Build a back-of-book style index for a PDF.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to
            index (presumably the Gradio upload wrapper -- confirm against
            the caller).
        attempted_items: Number of keywords to request from YAKE. Values
            of 0 or less skip automatic keyword extraction.
        offset: Number added to every 1-based page position before it is
            printed.
        custom: Semicolon-separated custom entries; comma-separated terms
            inside one entry all contribute to the entry's first term.

    Returns:
        One ``label: pages`` line per index entry, sorted case-insensitively,
        followed by a line reporting how many entries were not found on any
        page.
    """
    # Step 1: Import
    # Extract every page once, while the file is still open, and keep the
    # text in memory. This replaces the round-trip through "t.txt", which
    # read the file back before the writer was flushed or closed and left
    # all three file handles open.
    with open(input.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = [page.extract_text() or '' for page in pdf_reader.pages]

    # Step 2: Process for Keywords
    referral_dictionary = {}

    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top = attempted_items)
        # Pages are joined with a newline so the last word of one page is
        # not fused with the first word of the next.
        for kw in kw_extractor.extract_keywords('\n'.join(pages)):
            referral_dictionary[kw[0]] = kw[0]

    # Custom terms are applied last so they can redirect a generated keyword
    # to a user-chosen label.
    referral_dictionary.update(_parse_custom_terms(custom))

    # Step 3: Process for Assignment
    output = {}
    for page_index, page_text in enumerate(pages):
        page_number = page_index + 1 + offset
        for term, label in referral_dictionary.items():
            if term in page_text:
                hits = output.setdefault(label, [])
                # Several terms of one entry may match the same page.
                if not hits or hits[-1] != page_number:
                    hits.append(page_number)

    # Step 4: Output List
    # Count labels that never matched a page; `output` only holds labels
    # with at least one hit, so the count must come from the full label set.
    all_labels = set(referral_dictionary.values())
    removed_count = sum(1 for label in all_labels if label not in output)

    final = []
    for label in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_page_ranges(output[label]))
        final.append(f'{label}: {page_listings}')

    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'