# Newspapers-OCR-Demo / run_ocr.py
# Devesh Pant - commit "v0" (1b870f4) - 1.38 kB
import cv2
import numpy
import argparse
from pytesseract import*
from PIL import Image, ImageFont, ImageDraw
import numpy as np
# def preprocess_image(image):
def OCR(img, lang='hin', min_conf=0.25):
    """Run Tesseract on an image and return the recognised words as text.

    The image is converted with ``cv2.COLOR_BGR2RGB``, handed to
    ``pytesseract.image_to_data`` and every detection whose confidence is
    strictly greater than ``min_conf`` contributes its text to the result.

    Args:
        img: Image array accepted by ``cv2.cvtColor`` (presumably a BGR
            image as loaded by OpenCV -- verify against callers).
        lang: Tesseract language code forwarded to ``image_to_data``.
        min_conf: Exclusive lower bound on the integer-truncated confidence
            of a detection.  NOTE(review): Tesseract appears to report
            confidence on a 0-100 scale (-1 for non-word rows), so the
            default of 0.25 effectively only drops rows with confidence
            <= 0 -- confirm whether a 0-1 scale was intended.

    Returns:
        str: The kept words in detection order, each followed by a single
        space (a non-empty result therefore ends with a trailing space), or
        an empty string when no detection passes the filter.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = pytesseract.image_to_data(rgb, output_type=Output.DICT, lang=lang)

    words = []
    for raw_text, raw_conf in zip(results["text"], results["conf"]):
        # The confidence may arrive as an int, a float, or (with some
        # pytesseract/Tesseract combinations) a string such as "96.41".
        # Going through float() first avoids the ValueError that a bare
        # int() raises on the string form, while keeping the original
        # integer truncation for values that already worked.
        conf = int(float(raw_conf))
        if conf <= min_conf:
            continue  # weak or non-word row
        word = raw_text.strip()
        # Whitespace-only detections would otherwise add stray extra spaces.
        if word:
            words.append(word)

    # Keep the original output format: every word is followed by one space.
    return "".join(word + " " for word in words)