IndexingAlpha / app.py
ewgewgewg's picture
Add application
d380e2b
raw
history blame
No virus
2.6 kB
# GNU
import gradio as gr
import PyPDF2
import yake
# Root Blocks container for the app: the components are declared inside the
# `with demo:` section further down, and `demo.launch()` serves it.
demo = gr.Blocks()
def generate(input):
    """Build a keyword-to-pages index for an uploaded PDF.

    The text of every page is extracted with PyPDF2, up to 50 keywords are
    extracted from the combined text with yake, and each keyword is mapped to
    the 1-based numbers of the pages whose text contains it (case-sensitive
    substring match). Runs of consecutive pages are collapsed, so pages
    3, 4, 5 and 9 become ``['3-5', '9']``.

    Args:
        input: The uploaded file: either an object whose ``name`` attribute
            is the path of the PDF on disk, or that path itself as a ``str``.
            ``None`` (nothing uploaded) is accepted. The name shadows the
            builtin but is kept because it is part of the signature.

    Returns:
        ``str()`` of a dict mapping each keyword to its list of page-range
        strings. A keyword that was extracted but is not found on any single
        page maps to an empty list. ``"{}"`` when ``input`` is ``None``.
    """
    # Clicking the button without an upload passes None; return an empty
    # index instead of failing on the attribute access below.
    if input is None:
        return str({})

    pdf_path = input if isinstance(input, str) else input.name

    # Step 1: Import
    # Extract each page once and keep the text in memory. The previous
    # round-trip through a scratch file re-read it before it had been flushed
    # or closed, and the fixed file name was shared by concurrent requests.
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Step 2: Process for Keywords
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next.
    full_text = "\n".join(page_texts)
    extractor = yake.KeywordExtractor(top=50)
    # The first element of each result is the keyword text; dict.fromkeys
    # drops duplicates while preserving the extractor's ordering.
    keywords = list(
        dict.fromkeys(kw[0] for kw in extractor.extract_keywords(full_text))
    )

    # Step 3: Process for Assignment
    pages_by_keyword = {kw: [] for kw in keywords}
    for page_number, text in enumerate(page_texts, start=1):
        for kw in keywords:
            if kw in text:
                pages_by_keyword[kw].append(page_number)

    # Step 4: Output List
    clean_output = {}
    for kw, page_numbers in pages_by_keyword.items():
        if not page_numbers:
            # Typically a phrase that only exists across a page boundary.
            print('detected but no pages!:', kw, page_numbers)
        clean_output[kw] = _collapse_page_ranges(page_numbers)
    return str(clean_output)


def _format_page_range(start, end):
    """Render one run of pages as ``"7"`` or ``"7-9"``."""
    return str(start) if start == end else f'{start}-{end}'


def _collapse_page_ranges(page_numbers):
    """Collapse ascending page numbers into a list of range strings.

    ``[1, 2, 3, 7]`` becomes ``['1-3', '7']``; an empty list stays empty.
    """
    if not page_numbers:
        return []

    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # Gap found: close the current run and start a new one.
            ranges.append(_format_page_range(start, end))
            start = num
        # Always advance the run's end, including when num == end + 1.
        end = num
    ranges.append(_format_page_range(start, end))
    return ranges
# UI definition: an upload section, a trigger button and a textbox that shows
# the generated index.
# NOTE(review): the nesting of the two columns was reconstructed from a
# source whose indentation was lost -- confirm against the deployed layout.
with demo:
    gr.Markdown("# PDF to Index")

    # Input section.
    with gr.Column():
        gr.Markdown("## Load Inputs")
        uploaded_file = gr.File(label="Upload a PDF file", file_count="single", type="file")

    gr.Markdown("---")

    # Output section.
    with gr.Column():
        gr.Markdown("## Index From PDF")
        convert_button = gr.Button("Index From PDF!", variant="primary")
        out_placeholder = gr.HTML("<p><em>Output will appear below, with PyPDF2 for preprocessing and yake for processing:</em></p>")
        gr.Markdown("### Raw Index")
        index = gr.Textbox(label="Raw Index", placeholder="The index will appear here")

    # Clicking the button runs `generate` on the uploaded file and writes the
    # returned string into the textbox.
    convert_button.click(generate, inputs=[uploaded_file], outputs=[index])

demo.launch()