Spaces:

ewgewgewg
/

IndexingAlpha

Sleeping

App Files Files Community

ewgewgewg committed on Feb 28, 2023

Commit

073a71c

•

1 Parent(s): c6fbe5d

add custom insert syntax

Browse files

Files changed (3) hide show

.gitignore +2 -1
app.py +3 -97
generate.py +105 -0

.gitignore CHANGED Viewed

	@@ -1 +1,2 @@
1	- .env


1	+ .env
2	+ __pycache__

app.py CHANGED Viewed

@@ -1,110 +1,16 @@
 # GNU
 import gradio as gr
-import PyPDF2
-import yake
 demo = gr.Blocks()
-def generate(input, attempted_items, offset, custom):
-    # Step 1: Import
-    pdfFileObj = open(input.name, 'rb')
-    pdfReader = PyPDF2.PdfReader(pdfFileObj)
-    length = len(pdfReader.pages)
-    f = open("t.txt", "w")
-    pages = []
-    for x in range(length):
-        pages.append(pdfReader.pages[x].extract_text())
-    f.writelines(pages)
-    # Step 2: Process for Keywords
-    r = open("t.txt", "r")
-    read = r.read()
-    kw_extractor = yake.KeywordExtractor(top = attempted_items)
-    keywords = kw_extractor.extract_keywords(read)
-    kw_list = []
-    for kw in keywords:
-        kw_list.append(kw[0])
-    if(len(custom)):
-        split_custom = custom.split(';')
-        for kw in split_custom:
-            kw_list.append(kw)
-    # Step 3: Process for Assignment
-    output = {}
-    for kw in kw_list:
-        output[kw] = []
-    for x in range(length):
-        pageText = pdfReader.pages[x].extract_text()
-        for kw in kw_list:
-            if pageText.find(kw) != -1:
-                output[kw].append(x+1+offset)
-    # Step 4: Output List
-    clean_output = {}
-    sortable = []
-    for kw in kw_list:
-        clean_output[kw] = []
-        sortable.append(kw)
-    for kw in output:
-        clean_pages = []
-        if (len(output[kw]) == 0):
-            print('detected but no pages!:', kw, output[kw])
-            continue
-        start = output[kw][0]
-        end = output[kw][0]
-        for num in output[kw]:
-            if num > end + 1:
-                if start == end:
-                    clean_pages.append(str(start))
-                else:
-                    clean_pages.append(f'{start}-{end}')
-                start = num
-            end = num
-        if start == end:
-            clean_pages.append(str(start))
-        else:
-            clean_pages.append(f'{start}-{end}')
-        clean_output[kw] = clean_pages
-    sortable.sort(key=str.casefold)
-    final = []
-    removed_count = 0
-    for item in sortable:
-        if (not clean_output[item]):
-            removed_count += 1
-            continue
-        page_listings = ', '.join(clean_output[item])
-        final.append(f'{item}: {page_listings}')
-    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'
 def attempted_items_changer(attempted_items_input):
     if (not attempted_items_input.isdigit()):
         return {
             attempted_items: 50
         }
     return {
-        attempted_items: int(attempted_items_input)
     }
 def offset_changer(offset_input):
@@ -151,7 +57,7 @@ with demo:
             custom_input = gr.Textbox(value="", show_label=True, label="Custom")
             custom_input.change(custom_changer, [custom_input], [custom])
-        gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index. If the fields do not produce expected values, you may be clicking too quickly -- please adjust the field, wait, and try again.</em></p>")
         gr.Markdown("---")

 # GNU
 import gradio as gr
+from generate import generate
 demo = gr.Blocks()
 def attempted_items_changer(attempted_items_input):
     if (not attempted_items_input.isdigit()):
         return {
             attempted_items: 50
         }
     return {
+        attempted_items: max(int(attempted_items_input), 0)
     }
 def offset_changer(offset_input):
             custom_input = gr.Textbox(value="", show_label=True, label="Custom")
             custom_input.change(custom_changer, [custom_input], [custom])
+        gr.HTML("<p><em>You can add semicolon-separated values in Custom to add custom fields to index. Optionally, you can comma-separate terms between semicolons if you want multiple terms to contribute to a single index entry -- the first term will be the label for the index entry. If Custom does not produce expected values, you may be clicking too quickly -- please adjust the field, wait, and try again.</em></p>")
         gr.Markdown("---")

generate.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import PyPDF2
+import yake
def _parse_custom_terms(custom):
    """Map each custom search term to the index label it contributes to.

    ``custom`` is a semicolon-separated list of groups; each group is a
    comma-separated list of terms. The first non-empty term of a group is
    the label for every term in that group. Surrounding whitespace is
    stripped and empty terms are skipped, because an empty string is a
    substring of every page and would otherwise match everywhere.
    """
    referrals = {}
    if not custom:
        return referrals
    for group in custom.split(';'):
        terms = [term.strip() for term in group.split(',')]
        terms = [term for term in terms if term]
        if not terms:
            continue
        label = terms[0]
        for term in terms:
            referrals[term] = label
    return referrals


def _collapse_page_ranges(page_numbers):
    """Turn ascending page numbers into strings such as ``'3'`` or ``'5-7'``."""
    collapsed = []
    start = end = page_numbers[0]
    for num in page_numbers:
        if num > end + 1:
            # A gap closes the current run and opens a new one.
            collapsed.append(str(start) if start == end else f'{start}-{end}')
            start = num
        end = num
    collapsed.append(str(start) if start == end else f'{start}-{end}')
    return collapsed


def generate(input, attempted_items, offset, custom):
    """Build a text index of keywords and the PDF pages they appear on.

    Args:
        input: Object whose ``name`` attribute is the path of the PDF to read.
        attempted_items: Number of keywords to request from YAKE; values
            of zero or less skip automatic keyword extraction.
        offset: Integer added to every 1-based page number in the output.
        custom: Semicolon-separated custom entries. Terms separated by
            commas inside one entry all contribute to a single index line,
            labelled with the first term.

    Returns:
        One ``label: pages`` line per entry found on at least one page,
        sorted case-insensitively, followed by a summary line with the
        number of entries that were not found on any page.
    """
    # Step 1: Import
    # Extract every page once and keep the text in memory. The context
    # manager closes the PDF handle, and no scratch file is needed, so there
    # is no unflushed read-back and no file shared between concurrent calls.
    with open(input.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    full_text = ''.join(page_texts)

    # Step 2: Process for Keywords
    # referral_dictionary maps a search term to the label it is filed under.
    referral_dictionary = {}
    if attempted_items > 0:
        kw_extractor = yake.KeywordExtractor(top=attempted_items)
        for kw in kw_extractor.extract_keywords(full_text):
            referral_dictionary[kw[0]] = kw[0]
    # Custom terms are applied last so they override extracted keywords.
    referral_dictionary.update(_parse_custom_terms(custom))

    # Step 3: Process for Assignment
    output = {}
    for page_index, page_text in enumerate(page_texts):
        page_number = page_index + 1 + offset
        for term, label in referral_dictionary.items():
            if term in page_text:
                pages = output.setdefault(label, [])
                # Several terms of one group may hit the same page.
                if not pages or pages[-1] != page_number:
                    pages.append(page_number)

    # Step 4: Output List
    final = []
    for label in sorted(output, key=str.casefold):
        page_listings = ', '.join(_collapse_page_ranges(output[label]))
        final.append(f'{label}: {page_listings}')
    # Count labels that never matched a page; output only holds matches.
    removed_count = sum(
        1 for label in set(referral_dictionary.values()) if label not in output
    )
    return '\n'.join(final) + f'\nThere were {removed_count} items generated by algorithm not found on a page.'