Spaces:

AlirezaF138
/

Persian-OCR

Sleeping

App Files Files Community

AlirezaF138 committed 27 days ago

Commit

717b6b1

verified ·

1 Parent(s): b25999e

Create app.py

Browse files

Files changed (1) hide show

app.py +53 -0

app.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import gradio as gr
+import pytesseract
+from pdf2image import convert_from_path
+import os
+# Function to perform OCR and search for a keyword
+def ocr_pdf_and_search_keyword(pdf_file, keyword, lang='fas'):  # 'fas': Persian language (Farsi)
+    # Convert PDF to images
+    images = convert_from_path(pdf_file.name)
+    # Initialize a variable to hold the extracted text
+    extracted_text = ""
+    keyword_found = False
+    # Loop through each image and perform OCR
+    for page_number, image in enumerate(images):
+        text = pytesseract.image_to_string(image, lang=lang)
+        extracted_text += text
+        # Check if the keyword is in the extracted text
+        if keyword.lower() in text.lower():
+            keyword_found = True
+    if not keyword_found:
+        result_message = f"Keyword '{keyword}' not found in the document."
+    else:
+        result_message = f"Keyword '{keyword}' found in the document."
+    return extracted_text, result_message
+# Create Gradio interface
+def gradio_interface():
+    # Define Gradio inputs and outputs
+    pdf_input = gr.File(label="Upload PDF")
+    keyword_input = gr.Textbox(label="Enter Keyword", value="فلسفه")  # Default keyword is 'فلسفه'
+    output_text = gr.Textbox(label="Extracted Text", interactive=False)
+    output_message = gr.Textbox(label="Keyword Search Result", interactive=False)
+    # Function to process the inputs and return the outputs
+    def process(pdf_file, keyword):
+        extracted_text, result_message = ocr_pdf_and_search_keyword(pdf_file, keyword)
+        return extracted_text, result_message
+    # Create and launch Gradio interface
+    gr.Interface(fn=process,
+                 inputs=[pdf_input, keyword_input],
+                 outputs=[output_text, output_message],
+                 title="OCR PDF Keyword Search",
+                 description="Upload a PDF, enter a keyword, and see the OCR results along with a search for the keyword."
+                ).launch()
+# Build the interface and launch the app. This is an unguarded module-level
+# call, so it runs whenever the file is executed.
+gradio_interface()