Aumkeshchy2003 committed on
Commit
814690d
1 Parent(s): 63cbf36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -33
app.py CHANGED
@@ -1,45 +1,40 @@
1
  from typing import List
2
-
3
  import pytesseract
4
  from PIL import Image
5
-
6
  import gradio as gr
 
7
 
8
- def tesseract_ocr(filepath: str, languages: List[str]):
 
9
  image = Image.open(filepath)
10
- return pytesseract.image_to_string(image=image, lang=', '.join(languages))
11
-
12
- title = "Tesseract OCR"
13
- description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
14
- article = "<p style='text-align: center'><a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a> | <a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a></p>"
15
-
16
- language_choices = pytesseract.get_languages()
17
-
18
- def search_and_highlight(text, keyword):
19
- highlighted_text = re.sub(f"({keyword})", r"<mark>\1</mark>", text, flags=re.IGNORECASE)
20
- return highlighted_text
21
-
22
- def ocr_and_search(image, keyword, language_choices):
23
- if image is None:
24
- return "Please upload an image."
25
-
26
- extracted_text = tesseract_ocr(image)
27
-
28
  if keyword:
29
- highlighted_text = search_and_highlight(extracted_text, keyword)
30
- return highlighted_text
31
  else:
32
- return extracted_text
 
 
 
 
 
33
 
34
- iface = gr.Interface(
35
- fn=ocr_and_search,
 
36
  inputs=[
37
- gr.Image(type="pil", label="Upload Image"),
38
- gr.Textbox(label="Enter keyword to search (optional)"),
39
- ],
40
- outputs=gr.HTML(label="Extracted and Highlighted Text"),
41
- title="OCR and Keyword Search",
42
- description="Upload an image to extract text using OCR and optionally search for keywords in the extracted text."
 
 
 
43
  )
44
 
45
- iface.launch()
 
 
 
1
  from typing import List
 
2
  import pytesseract
3
  from PIL import Image
 
4
  import gradio as gr
5
+ import re
6
 
7
def _highlight_keyword(text: str, keyword: str) -> str:
    """Return *text* as HTML-safe markup with *keyword* matches in <mark> tags.

    Matching is case-insensitive and treats *keyword* as a literal string.
    The text is escaped piece by piece (rather than escaping everything first
    and then searching) so a keyword can never match inside an HTML entity
    such as ``&amp;``.
    """
    import html  # local import: keeps this block self-contained

    if not keyword:
        return html.escape(text, quote=False)

    pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        pieces.append(f"<mark>{html.escape(match.group(0), quote=False)}</mark>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)


def tesseract_ocr_with_search(filepath: str, languages: List[str], keyword: str):
    """Run Tesseract OCR on an image and highlight a keyword in the result.

    Args:
        filepath: Path of the image to read. A falsy value (no upload) yields
            a short prompt instead of an error.
        languages: Tesseract language codes to use, e.g. ``['eng', 'fra']``.
            When empty, no language is passed and pytesseract's default is used.
        keyword: Optional text to highlight; empty/None disables highlighting.

    Returns:
        An HTML string: the extracted text with markup characters escaped and
        every case-insensitive occurrence of *keyword* wrapped in ``<mark>``.
    """
    if not filepath:
        return "Please upload an image."

    # Tesseract expects multiple languages joined with '+', e.g. "eng+fra".
    lang = "+".join(languages) if languages else None

    # Context manager closes the underlying file handle once OCR is done.
    with Image.open(filepath) as image:
        extracted_text = pytesseract.image_to_string(image=image, lang=lang)

    return _highlight_keyword(extracted_text, keyword)
19
+
20
# Static text shown on the Gradio page. These names are passed to
# gr.Interface below, so they must be defined before it is constructed.
title = "Tesseract OCR"
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
article = "<p style='text-align: center'><a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a> | <a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a></p>"

# Fetch available languages for Tesseract
language_choices = pytesseract.get_languages()

# Define Gradio Interface
demo = gr.Interface(
    fn=tesseract_ocr_with_search,
    inputs=[
        gr.Image(type="filepath", label="Upload Image"),  # Input for image upload
        gr.CheckboxGroup(language_choices, type="value", value=['eng'], label='Language'),  # Language selection
        gr.Textbox(placeholder="Enter keyword to search", label="Keyword Search"),  # Keyword input
    ],
    outputs=gr.HTML(),  # Use HTML output to allow text highlighting
    title=title,
    description=description,
    article=article,
    # NOTE: the `examples` argument was dropped because no `examples` list is
    # defined anywhere in this file; passing it raised NameError at import.
)

if __name__ == '__main__':
    demo.launch()
    print("Finished running")