Persian-OCR

Sleeping

File size: 2,036 Bytes

717b6b1

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
import os

# Function to perform OCR and search for a keyword
def ocr_pdf_and_search_keyword(pdf_file, keyword, lang='fas'):
    """Run OCR on every page of a PDF and report whether a keyword occurs.

    Args:
        pdf_file: The PDF to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means no file was supplied.
        keyword: Text to look for. The comparison is case-insensitive and is
            done page by page, so a keyword split across two pages is not
            matched.
        lang: Language code handed to ``pytesseract.image_to_string``.
            Defaults to ``'fas'`` (Persian / Farsi).

    Returns:
        A ``(extracted_text, result_message)`` tuple: the OCR text of all
        pages concatenated in page order, and a human-readable message
        describing the outcome of the keyword search.
    """
    # Nothing uploaded: report it instead of failing on ``None.name``.
    if pdf_file is None:
        return "", "No PDF file was provided."

    # Accept a plain path as well as an upload object carrying ``.name``.
    # Path-like values are checked first because ``pathlib.Path.name`` is only
    # the base name, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Convert PDF to images, then OCR each page
    images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(image, lang=lang) for image in images]
    extracted_text = "".join(page_texts)

    # An empty keyword is a substring of every string, which would always be
    # reported as "found"; treat a blank keyword as "nothing to search for".
    if keyword is None or not keyword.strip():
        return extracted_text, "No keyword was provided to search for."

    # Lower-case the keyword once rather than once per page.
    needle = keyword.lower()
    keyword_found = any(needle in text.lower() for text in page_texts)

    if keyword_found:
        result_message = f"Keyword '{keyword}' found in the document."
    else:
        result_message = f"Keyword '{keyword}' not found in the document."

    return extracted_text, result_message

# Create Gradio interface
def gradio_interface():
    """Assemble the OCR keyword-search web UI and launch it.

    The form takes a PDF upload and a keyword (pre-filled with 'فلسفه') and
    shows two read-only text boxes: the extracted text and the search result.
    """
    def process(pdf_file, keyword):
        # Hand the form values to the OCR helper, using its default language.
        return ocr_pdf_and_search_keyword(pdf_file, keyword)

    # Input widgets, in the order `process` receives them
    form_inputs = [
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Enter Keyword", value="فلسفه"),
    ]

    # Output widgets, in the order `process` returns its values
    form_outputs = [
        gr.Textbox(label="Extracted Text", interactive=False),
        gr.Textbox(label="Keyword Search Result", interactive=False),
    ]

    demo = gr.Interface(
        fn=process,
        inputs=form_inputs,
        outputs=form_outputs,
        title="OCR PDF Keyword Search",
        description="Upload a PDF, enter a keyword, and see the OCR results along with a search for the keyword.",
    )
    demo.launch()

# Build the Gradio interface and launch it. This call sits at module level,
# so it runs whenever this file is executed, and also if it is imported.
gradio_interface()