Spaces:

Aumkeshchy2003
/

Gradio-OCR

Sleeping

App Files Files Community

Aumkeshchy2003 committed on Sep 30, 2024

Commit

814690d

•

1 Parent(s): 63cbf36

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -33

app.py CHANGED Viewed

@@ -1,45 +1,40 @@
 from typing import List
 import pytesseract
 from PIL import Image
 import gradio as gr
-def tesseract_ocr(filepath: str, languages: List[str]):
     image = Image.open(filepath)
-    return pytesseract.image_to_string(image=image, lang=', '.join(languages))
-title = "Tesseract OCR"
-description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
-article = "<p style='text-align: center'><a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a> | <a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a></p>"
-language_choices = pytesseract.get_languages()
-def search_and_highlight(text, keyword):
-    highlighted_text = re.sub(f"({keyword})", r"<mark>\1</mark>", text, flags=re.IGNORECASE)
-    return highlighted_text
-def ocr_and_search(image, keyword, language_choices):
-    if image is None:
-        return "Please upload an image."
-    extracted_text = tesseract_ocr(image)
     if keyword:
-        highlighted_text = search_and_highlight(extracted_text, keyword)
-        return highlighted_text
     else:
-        return extracted_text
-iface = gr.Interface(
-    fn=ocr_and_search,
     inputs=[
-        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Enter keyword to search (optional)"),
-    ],
-    outputs=gr.HTML(label="Extracted and Highlighted Text"),
-    title="OCR and Keyword Search",
-    description="Upload an image to extract text using OCR and optionally search for keywords in the extracted text."
 )
-iface.launch()

 from typing import List
 import pytesseract
 from PIL import Image
 import gradio as gr
+import re
def tesseract_ocr_with_search(filepath: str, languages: List[str], keyword: str):
    """Run Tesseract OCR on an image and return the text as HTML.

    Args:
        filepath: Path of the image file to read. A falsy value (``None`` or
            ``""``) means no image was supplied.
        languages: Tesseract language codes, e.g. ``["eng", "deu"]``. When
            empty or ``None``, no ``lang`` is passed and pytesseract falls
            back to its own default.
        keyword: Optional search term. Every case-insensitive occurrence in
            the recognised text is wrapped in ``<mark>...</mark>``.

    Returns:
        The recognised text, HTML-escaped, with keyword matches highlighted;
        or a short prompt when no image was supplied.
    """
    # Local import so this block does not depend on the file's import header.
    import html

    if not filepath:
        return "Please upload an image."

    # Tesseract combines several languages with '+', e.g. "eng+deu";
    # a comma-separated list is not a valid language specification.
    lang = "+".join(languages) if languages else None

    # The context manager releases the file handle once OCR is done.
    with Image.open(filepath) as image:
        extracted_text = pytesseract.image_to_string(image=image, lang=lang)

    if not keyword:
        return html.escape(extracted_text)

    # The result is rendered as HTML, so the OCR text must be escaped.
    # Splitting on a single capture group yields alternating
    # [text, match, text, match, ...]; matching happens on the raw text and
    # each piece is escaped separately, so a keyword can never match inside
    # an HTML entity produced by the escaping itself.
    pieces = re.split(f"({re.escape(keyword)})", extracted_text, flags=re.IGNORECASE)
    return "".join(
        f"<mark>{html.escape(piece)}</mark>" if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )
# Languages reported by pytesseract for the local Tesseract installation;
# used as the choices of the language checkbox group below.
language_choices = pytesseract.get_languages()

# Texts shown by the Gradio interface. These must be defined before
# gr.Interface(...) is called, otherwise the module fails with NameError.
title = "Tesseract OCR"
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
article = "<p style='text-align: center'><a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a> | <a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a></p>"

# Define the Gradio interface. The three inputs are passed positionally to
# tesseract_ocr_with_search as (filepath, languages, keyword).
# NOTE(review): no `examples` value exists anywhere in this file, so the
# argument is omitted; add it back once example images are provided.
demo = gr.Interface(
    fn=tesseract_ocr_with_search,
    inputs=[
        # type="filepath" hands the function a path string, which it opens itself.
        gr.Image(type="filepath", label="Upload Image"),
        gr.CheckboxGroup(language_choices, type="value", value=['eng'], label='Language'),
        gr.Textbox(placeholder="Enter keyword to search", label="Keyword Search"),
    ],
    # HTML output so that <mark> highlighting is rendered rather than shown as text.
    outputs=gr.HTML(),
    title=title,
    description=description,
    article=article,
)

if __name__ == '__main__':
    demo.launch()
    print("Finished running")