Spaces:

Aumkeshchy2003
/

Gradio-OCR

Running

File size: 1,363 Bytes

d4640a8
794e69a
 
a73bb26
794e69a
 
6916c84
fb7988f
6916c84
 
a73bb26
 
76a8b7b
 
 
 
 
a73bb26
6916c84
76a8b7b
a73bb26
6916c84
76a8b7b
a73bb26
 
 
 
 
e2eafa6
6916c84
 
1c9cf55
814690d
76a8b7b
d66c9c9
76a8b7b
6916c84
76a8b7b
03f0455
814690d
6916c84
04c7dbc
814690d
 
6916c84

from typing import List
import pytesseract
from PIL import Image
import re
import gradio as gr

def tesseract_ocr(filepath: str) -> str:
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        filepath: Path to the image file to read.

    Returns:
        The text Tesseract extracted using the combined English and Hindi
        language setting ('eng+hin').
    """
    combined_languages = 'eng+hin'
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes rather than whenever the Image object
    # happens to be garbage-collected.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang=combined_languages)

def search_and_highlight(text: str, keyword: str) -> str:
    """Wrap every case-insensitive occurrence of *keyword* in ``<mark>`` tags.

    The keyword is matched literally: regex metacharacters in it (such as
    ``.``, ``(`` or ``*``) are escaped, so they neither alter what is matched
    nor raise ``re.error`` for input that is not a valid pattern.

    Args:
        text: The text to search.
        keyword: The literal string to highlight. An empty keyword leaves
            the text untouched.

    Returns:
        *text* with each match wrapped as ``<mark>match</mark>``; the matched
        text keeps its original casing.
    """
    if not keyword:
        return text
    # re.escape turns user input into a literal pattern; without it a keyword
    # like "." would highlight every character and "(" would crash.
    pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
    return pattern.sub(lambda match: f"<mark>{match.group(0)}</mark>", text)

def ocr_and_search(filepath: str, keyword: str) -> str:
    """Extract text from an image and optionally highlight a keyword in it.

    Args:
        filepath: Path to the image to process, or None when no image was
            provided.
        keyword: Term to highlight in the OCR output; a falsy value means
            the text is returned without highlighting.

    Returns:
        A prompt asking for an upload when *filepath* is None; otherwise the
        OCR text, passed through search_and_highlight when a keyword is given.
    """
    # Guard clause: nothing to process without an image.
    if filepath is None:
        return "Please upload an image."

    ocr_text = tesseract_ocr(filepath)
    return search_and_highlight(ocr_text, keyword) if keyword else ocr_text

# Heading and blurb passed to the Gradio interface below.
title = "Tesseract OCR (English + Hindi)"
description = "Gradio demo for Tesseract with multi-language support (English and Hindi)."

# Input components, listed in the same order as the parameters of
# ocr_and_search(filepath, keyword).
_image_input = gr.Image(type="filepath", label="Upload Image for OCR")
_keyword_input = gr.Textbox(
    label="Keyword to Highlight",
    placeholder="Enter a keyword...",
)

# ocr_and_search may return text containing <mark> tags, hence the HTML output.
demo = gr.Interface(
    fn=ocr_and_search,
    inputs=[_image_input, _keyword_input],
    outputs="html",
    title=title,
    description=description,
)

if __name__ == "__main__":
    demo.launch()
    # Reached only once launch() has returned.
    print("Finished running")