Spaces:

ewgewgewg
/

IndexingAlpha

Sleeping

App Files Files Community

ewgewgewg committed on Feb 26, 2023

Commit

d380e2b

•

1 Parent(s): 57821f1

Add application

Browse files

Files changed (3) hide show

.gitignore +1 -0
README.md +2 -0
app.py +114 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

README.md CHANGED Viewed

@@ -10,4 +10,6 @@ pinned: false
 license: gpl
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 license: gpl
 ---
+A basic tool for creating back-of-the-book indexes from PDF files, using YAKE for keyword extraction.
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# GNU
+import gradio as gr
+import PyPDF2
+import yake
+demo = gr.Blocks()  # Blocks container; the UI is declared later inside "with demo:" and started with demo.launch()
+def generate(input):
+    # Step 1: Import
+    pdfFileObj = open(input.name, 'rb')
+    pdfReader = PyPDF2.PdfReader(pdfFileObj)
+    length = len(pdfReader.pages)
+    f = open("t.txt", "w")
+    pages = []
+    for x in range(length):
+        pages.append(pdfReader.pages[x].extract_text())
+    f.writelines(pages)
+    # Step 2: Process for Keywords
+    r = open("t.txt", "r")
+    read = r.read()
+    kw_extractor = yake.KeywordExtractor(top = 50)
+    keywords = kw_extractor.extract_keywords(read)
+    kw_list = []
+    for kw in keywords:
+        kw_list.append(kw[0])
+    # Step 3: Process for Assignment
+    output = {}
+    for kw in kw_list:
+        output[kw] = []
+    for x in range(length):
+        pageText = pdfReader.pages[x].extract_text()
+        for kw in kw_list:
+            if pageText.find(kw) != -1:
+                output[kw].append(x+1)
+    # Step 4: Output List
+    clean_output = {}
+    for kw in kw_list:
+        clean_output[kw] = []
+    for kw in output:
+        clean_pages = []
+        if (len(output[kw]) == 0):
+            print('detected but no pages!:', kw, output[kw])
+            continue
+        start = output[kw][0]
+        end = output[kw][0]
+        for num in output[kw]:
+            if num > end + 1:
+                if start == end:
+                    clean_pages.append(str(start))
+                else:
+                    clean_pages.append(f'{start}-{end}')
+                start = num
+            end = num
+        if start == end:
+            clean_pages.append(str(start))
+        else:
+            clean_pages.append(f'{start}-{end}')
+        clean_output[kw] = clean_pages
+    return str(clean_output)
+with demo:
+    gr.Markdown("# PDF to Index")
+    with gr.Column():
+        gr.Markdown("## Load Inputs")
+        uploaded_file = gr.File(
+            label="Upload a PDF file",
+            file_count="single",
+            type="file"
+        )
+        gr.Markdown("---")
+    with gr.Column():
+            gr.Markdown("## Index From PDF")
+            convert_button = gr.Button("Index From PDF!", variant="primary")
+            out_placeholder = gr.HTML("<p><em>Output will appear below, with PyPDF2 for preprocessing and yake for processing:</em></p>")
+            gr.Markdown("### Raw Index")
+            index = gr.Textbox(
+                label="Raw Index", placeholder="The index will appear here"
+            )
+    convert_button.click(
+        fn=generate,
+        inputs=[uploaded_file],
+        outputs=[index],
+    )
+demo.launch()