# GNU
"""Gradio app that builds a back-of-book style index from an uploaded PDF.

Text is pulled from every page with PyPDF2, candidate index terms are
chosen with yake, and each term is mapped to the (1-based) pages on which
it literally occurs, with consecutive pages collapsed into ranges.
"""
import gradio as gr
import PyPDF2
import yake

demo = gr.Blocks()


def _collapse_page_ranges(page_numbers):
    """Collapse ascending page numbers into range strings.

    Consecutive pages are merged into ``"start-end"``; isolated pages are
    emitted as a single number, e.g. ``[1, 2, 3, 7]`` -> ``["1-3", "7"]``.

    Args:
        page_numbers: Page numbers in ascending order.

    Returns:
        A list of strings, empty when ``page_numbers`` is empty.
    """
    if not page_numbers:
        return []

    def _label(first, last):
        return str(first) if first == last else f"{first}-{last}"

    ranges = []
    start = end = page_numbers[0]
    for num in page_numbers[1:]:
        if num > end + 1:
            # A gap closes the current run and starts a new one.
            ranges.append(_label(start, end))
            start = num
        end = num
    ranges.append(_label(start, end))
    return ranges


def generate(input):
    """Build a keyword -> page-range index for an uploaded PDF.

    Args:
        input: The value delivered by the ``gr.File`` component: an object
            exposing the file path as ``.name``. A plain path string is
            accepted as well. ``None`` means nothing was uploaded.

    Returns:
        ``str()`` of a dict mapping each extracted keyword to a list of
        page-range strings (an empty list when the keyword was not found
        verbatim on any page), or a short prompt when no file was given.
    """
    if input is None:
        return "Please upload a PDF file first."

    # Step 1: Import. Extract each page once and keep the text in memory;
    # the previous round-trip through an unflushed "t.txt" could hand the
    # keyword extractor empty or truncated text and was shared between
    # concurrent requests.
    pdf_path = getattr(input, "name", input)
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Step 2: Process for Keywords. Pages are joined with a newline so the
    # last word of one page is not fused with the first word of the next.
    extractor = yake.KeywordExtractor(top=50)
    extracted = extractor.extract_keywords("\n".join(page_texts))
    keywords = [entry[0] for entry in extracted]

    # Step 3: Process for Assignment (1-based page numbers per keyword).
    pages_by_keyword = {kw: [] for kw in keywords}
    for page_number, text in enumerate(page_texts, start=1):
        for kw, hits in pages_by_keyword.items():
            if kw in text:
                hits.append(page_number)

    # Step 4: Output List.
    clean_output = {}
    for kw, hits in pages_by_keyword.items():
        if not hits:
            # The keyword came from the whole-document text but does not
            # occur verbatim on any single page.
            print('detected but no pages!:', kw, hits)
        clean_output[kw] = _collapse_page_ranges(hits)
    return str(clean_output)


with demo:
    gr.Markdown("# PDF to Index")
    with gr.Column():
        gr.Markdown("## Load Inputs")
        uploaded_file = gr.File(
            label="Upload a PDF file",
            file_count="single",
            type="file"
        )
    gr.Markdown("---")
    with gr.Column():
        gr.Markdown("## Index From PDF")
        convert_button = gr.Button("Index From PDF!", variant="primary")
        out_placeholder = gr.HTML(
            "<p>Output will appear below, with PyPDF2 for preprocessing "
            "and yake for processing:</p>"
        )
        gr.Markdown("### Raw Index")
        index = gr.Textbox(
            label="Raw Index",
            placeholder="The index will appear here"
        )
    convert_button.click(
        fn=generate,
        inputs=[uploaded_file],
        outputs=[index],
    )

demo.launch()