Spaces:
Sleeping
Sleeping
File size: 2,036 Bytes
717b6b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import os
# Function to perform OCR and search for a keyword
def ocr_pdf_and_search_keyword(pdf_file, keyword, lang='fas'):  # 'fas': Persian language (Farsi)
    """OCR every page of a PDF and report whether a keyword occurs in it.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) to the
            PDF, or an object whose ``name`` attribute holds that path.
            ``None`` is treated as "nothing was uploaded".
        keyword: Text to look for; the comparison ignores case.
        lang: Language code forwarded to ``pytesseract.image_to_string``.

    Returns:
        A ``(extracted_text, result_message)`` tuple: the OCR text of all
        pages concatenated in page order, and a human-readable status line.
    """
    # Without a file there is nothing to convert; report it instead of
    # failing with an AttributeError on ``None.name``.
    if pdf_file is None:
        return "", "No PDF file was uploaded."

    # A path-like value is used as-is. ``pathlib.Path`` also has a ``.name``
    # attribute (the bare filename), so paths must be checked for first.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Convert PDF to images (one per page) and OCR each of them.
    images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(image, lang=lang) for image in images]
    extracted_text = "".join(page_texts)

    # An empty keyword is a substring of every string, which would always
    # be reported as "found"; treat it as missing input instead.
    if not keyword:
        return extracted_text, "No keyword was provided."

    # Fold the keyword once, then search page by page (stops at first hit).
    needle = keyword.casefold()
    keyword_found = any(needle in page_text.casefold() for page_text in page_texts)

    if keyword_found:
        result_message = f"Keyword '{keyword}' found in the document."
    else:
        result_message = f"Keyword '{keyword}' not found in the document."
    return extracted_text, result_message
# Create Gradio interface
def gradio_interface():
    """Assemble the OCR keyword-search Gradio UI and launch it."""

    def process(pdf_file, keyword):
        # Hand the uploaded file and keyword to the OCR routine and pass its
        # (extracted text, status message) pair straight back to the UI.
        return ocr_pdf_and_search_keyword(pdf_file, keyword)

    # Inputs: the PDF upload and the keyword box (pre-filled with 'فلسفه').
    input_components = [
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Enter Keyword", value="فلسفه"),
    ]

    # Outputs: two read-only text boxes, one per value returned by process().
    output_components = [
        gr.Textbox(label="Extracted Text", interactive=False),
        gr.Textbox(label="Keyword Search Result", interactive=False),
    ]

    app = gr.Interface(
        fn=process,
        inputs=input_components,
        outputs=output_components,
        title="OCR PDF Keyword Search",
        description="Upload a PDF, enter a keyword, and see the OCR results along with a search for the keyword.",
    )
    app.launch()
# Build the Gradio interface and launch it as soon as this file is executed.
gradio_interface()
|