AlirezaF138 committed on
Commit
717b6b1
·
verified ·
1 Parent(s): b25999e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ import os
5
+
6
+ # Function to perform OCR and search for a keyword
7
def ocr_pdf_and_search_keyword(pdf_file, keyword, lang='fas'):  # 'fas': Persian language (Farsi)
    """Run OCR over every page of a PDF and report whether a keyword occurs.

    Args:
        pdf_file: The PDF to read. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object that exposes the path as ``.name``
            (such as a temp-file wrapper). ``None`` means nothing was given.
        keyword: Text to search for. Matching is case-insensitive and is
            done page by page.
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``.

    Returns:
        A ``(extracted_text, result_message)`` tuple: the OCR text of all
        pages concatenated in page order, and a human-readable sentence
        describing the search outcome.
    """
    # Guard first: without a file there is nothing to convert, and reading
    # ``.name`` from None would raise AttributeError.
    if pdf_file is None:
        return "", "No PDF file was uploaded."

    # Plain paths are used as-is. The isinstance check comes first because
    # pathlib.Path also has a ``.name`` attribute, but it is only the
    # basename, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Convert PDF to images (one image per page)
    images = convert_from_path(pdf_path)

    keyword = keyword or ""
    needle = keyword.lower()  # hoisted: invariant across pages

    page_texts = []
    keyword_found = False

    # Loop through each page image and perform OCR
    for image in images:
        text = pytesseract.image_to_string(image, lang=lang)
        page_texts.append(text)

        # Skip the comparison once a match is known; OCR still runs on
        # every page because the full text is part of the result.
        if not keyword_found and needle in text.lower():
            keyword_found = True

    extracted_text = "".join(page_texts)

    # An empty keyword is a substring of any text, so it would always be
    # reported as "found"; report the missing input instead.
    if not keyword.strip():
        return extracted_text, "No keyword was provided."

    if not keyword_found:
        result_message = f"Keyword '{keyword}' not found in the document."
    else:
        result_message = f"Keyword '{keyword}' found in the document."

    return extracted_text, result_message
30
+
31
+ # Create Gradio interface
32
def gradio_interface():
    """Build the OCR keyword-search Gradio interface and launch it.

    The interface takes an uploaded PDF and a keyword, and shows two
    read-only text boxes: the OCR-extracted text and the search result
    message produced by ``ocr_pdf_and_search_keyword``.
    """
    # Define Gradio inputs and outputs
    pdf_input = gr.File(label="Upload PDF")
    # Default keyword is the Persian word for "philosophy". The literal was
    # previously stored mis-decoded (mojibake), so the default search could
    # never match OCR output; it is restored to the intended UTF-8 text.
    keyword_input = gr.Textbox(label="Enter Keyword", value="فلسفه")
    output_text = gr.Textbox(label="Extracted Text", interactive=False)
    output_message = gr.Textbox(label="Keyword Search Result", interactive=False)

    def process(pdf_file, keyword):
        # Two-argument adapter matching the two inputs above; the OCR
        # language is left at the callee's default.
        return ocr_pdf_and_search_keyword(pdf_file, keyword)

    # Create and launch Gradio interface
    gr.Interface(
        fn=process,
        inputs=[pdf_input, keyword_input],
        outputs=[output_text, output_message],
        title="OCR PDF Keyword Search",
        description="Upload a PDF, enter a keyword, and see the OCR results along with a search for the keyword.",
    ).launch()
51
+
52
# Script entry: build the interface and start serving it.
gradio_interface()