# Spaces: Sleeping
# (Hugging Face Spaces page-status text captured along with the file; not part of the program.)
import gradio as gr | |
import pytesseract | |
from pdf2image import convert_from_path | |
import os | |
# Function to perform OCR and search for a keyword
def ocr_pdf_and_search_keyword(pdf_file, keyword, lang='fas'):  # 'fas': Persian language (Farsi)
    """Run OCR over every page of a PDF and report whether a keyword occurs.

    Args:
        pdf_file: The PDF to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the path as ``.name``
            (the shape the original code expected from the upload widget).
            ``None`` is treated as "nothing was uploaded".
        keyword: Text to look for. Matching is case-insensitive and done
            page by page, so a keyword split across a page break is not
            detected. An empty keyword is never reported as found.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``.

    Returns:
        A ``(extracted_text, result_message)`` tuple: the OCR text of all
        pages concatenated in page order, and a human-readable message
        saying whether the keyword was found.
    """
    # Nothing uploaded: answer with a message instead of failing on `.name`.
    if pdf_file is None:
        return "", "No PDF file was provided."

    # Accept a plain path as well as a file-like wrapper carrying `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Convert PDF to images (one image per page)
    images = convert_from_path(pdf_path)

    # OCR every page; keep per-page text so the search stays page-local.
    page_texts = [pytesseract.image_to_string(image, lang=lang) for image in images]
    extracted_text = "".join(page_texts)

    # Lower-case the keyword once. An empty keyword would match any text
    # ("" in s is always True), so it is explicitly treated as not found.
    needle = keyword.lower()
    keyword_found = bool(needle) and any(needle in text.lower() for text in page_texts)

    if not keyword_found:
        result_message = f"Keyword '{keyword}' not found in the document."
    else:
        result_message = f"Keyword '{keyword}' found in the document."
    return extracted_text, result_message
# Create Gradio interface
def gradio_interface():
    """Assemble the OCR keyword-search UI and launch it.

    Connects a PDF upload and a keyword textbox to
    ``ocr_pdf_and_search_keyword`` and shows the extracted text and the
    search result in two non-interactive textboxes. Calls ``launch()`` on
    the resulting interface and returns nothing.
    """

    def process(pdf_file, keyword):
        # Thin adapter: the UI supplies only the file and the keyword, so
        # OCR always runs with the function's default language.
        return ocr_pdf_and_search_keyword(pdf_file, keyword)

    # NOTE(review): the default keyword below looks like mis-decoded UTF-8
    # (possibly the Persian word for "philosophy") - confirm the intended
    # text against the original file before changing it.
    input_components = [
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Enter Keyword", value="ููุณูู"),
    ]
    result_components = [
        gr.Textbox(label="Extracted Text", interactive=False),
        gr.Textbox(label="Keyword Search Result", interactive=False),
    ]

    # Build the interface first, then start serving it.
    app = gr.Interface(
        fn=process,
        inputs=input_components,
        outputs=result_components,
        title="OCR PDF Keyword Search",
        description="Upload a PDF, enter a keyword, and see the OCR results along with a search for the keyword.",
    )
    app.launch()
# Build the interface and start serving it as soon as this script runs
gradio_interface()