pdf-parser-api / backend /text_recog.py
blaxx14's picture
add all files
81f6231
raw
history blame
539 Bytes
import cv2
import pytesseract
from .file_utils import convert_image_to_word
def parsing_image(image, filename, tesseract_cmd=None):
    """Run Tesseract OCR on an image file and pass the text to the Word exporter.

    The image is loaded with OpenCV, converted to grayscale, and binarised
    with an inverted threshold before being handed to Tesseract.

    Args:
        image: Path of the image file to read (passed to ``cv2.imread``).
        filename: Forwarded unchanged to ``convert_image_to_word`` together
            with the recognised text (presumably the output document name;
            confirm against ``file_utils``).
        tesseract_cmd: Optional path to the Tesseract executable. When
            omitted, the legacy hard-coded Windows install location is used
            if it exists; otherwise pytesseract's current setting is left
            untouched so a ``tesseract`` binary on PATH keeps working.

    Returns:
        An empty dict, as before; the useful result is the side effect of
        ``convert_image_to_word``.

    Raises:
        ValueError: If OpenCV cannot read ``image`` (missing file,
            unsupported or corrupt format).
    """
    # Function-scope import so this block does not depend on the module's
    # import header being edited.
    import os

    legacy_cmd = r'C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
    if tesseract_cmd is None and os.path.isfile(legacy_cmd):
        # Preserve the original behaviour on the machine this was written for.
        tesseract_cmd = legacy_cmd
    if tesseract_cmd is not None:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    pixels = cv2.imread(image)
    if pixels is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a readable message rather than inside cvtColor.
        raise ValueError(f"Could not read image file: {image!r}")

    gray = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
    # Inverted binary threshold at 150: pixels brighter than 150 become 0,
    # everything else becomes 255.
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

    # --oem 3: default OCR engine mode; --psm 6: treat the page as a single
    # uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    data = pytesseract.image_to_string(thresh, config=custom_config)

    convert_image_to_word(data, filename)
    return {}