import os, io, sys, inspect, statistics
from statistics import mean
# from google.cloud import vision, storage
from google.cloud import vision
from google.cloud import vision_v1p3beta1 as vision_beta
from PIL import Image, ImageDraw, ImageFont
import colorsys
from tqdm import tqdm

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

'''
@misc{li2021trocr,
    title={TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models},
    author={Minghao Li and Tengchao Lv and Lei Cui and Yijuan Lu and Dinei Florencio and Cha Zhang and Zhoujun Li and Furu Wei},
    year={2021},
    eprint={2109.10282},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
'''

class OCRGoogle:
    BBOX_COLOR = "black"

    def __init__(self, path, cfg, trOCR_model_version, trOCR_model, trOCR_processor, device):
        self.path = path
        self.cfg = cfg
        self.do_use_trOCR = self.cfg['leafmachine']['project']['do_use_trOCR']
        self.OCR_option = self.cfg['leafmachine']['project']['OCR_option']

        # Initialize TrOCR components
        self.trOCR_model_version = trOCR_model_version
        self.trOCR_processor = trOCR_processor
        self.trOCR_model = trOCR_model
        self.device = device

        # Handwritten (beta) OCR results
        self.hand_cleaned_text = None
        self.hand_organized_text = None
        self.hand_bounds = None
        self.hand_bounds_word = None
        self.hand_bounds_flat = None
        self.hand_text_to_box_mapping = None
        self.hand_height = None
        self.hand_confidences = None
        self.hand_characters = None

        # Standard (printed) OCR results
        self.normal_cleaned_text = None
        self.normal_organized_text = None
        self.normal_bounds = None
        self.normal_bounds_word = None
        self.normal_text_to_box_mapping = None
        self.normal_bounds_flat = None
        self.normal_height = None
        self.normal_confidences = None
        self.normal_characters = None

        # trOCR results
        self.trOCR_texts = None
        self.trOCR_text_to_box_mapping = None
        self.trOCR_bounds_flat = None
        self.trOCR_height = None
        self.trOCR_confidences = None
        self.trOCR_characters = None

    def detect_text_with_trOCR_using_google_bboxes(self, do_use_trOCR, logger):
        CONFIDENCES = 0.80
        MAX_NEW_TOKENS = 50

        self.OCR_JSON_to_file = {}

        if not do_use_trOCR:
            if self.OCR_option in ['normal',]:
                self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
                logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}")
                return f"Google_OCR_Standard:\n{self.normal_organized_text}"

            if self.OCR_option in ['hand',]:
                self.OCR_JSON_to_file['OCR_handwritten'] = self.hand_organized_text
                logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}")
                return f"Google_OCR_Handwriting:\n{self.hand_organized_text}"

            if self.OCR_option in ['both',]:
                logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}\n\nGoogle_OCR_Handwriting:\n{self.hand_organized_text}")
                return f"Google_OCR_Standard:\n{self.normal_organized_text}\n\nGoogle_OCR_Handwriting:\n{self.hand_organized_text}"

        else:
            logger.info('Supplementing with trOCR')

            self.trOCR_texts = []
            original_image = Image.open(self.path).convert("RGB")

            if self.OCR_option in ['normal',]:
                available_bounds = self.normal_bounds_word
            elif self.OCR_option in ['hand',]:
                available_bounds = self.hand_bounds_word
            elif self.OCR_option in ['both',]:
                available_bounds = self.hand_bounds_word
            else:
                raise ValueError(f"Unsupported OCR_option: {self.OCR_option}")

            text_to_box_mapping = []
            characters = []
            height = []
            confidences = []
            for bound in tqdm(available_bounds, desc="Processing words using Google Vision bboxes"):
                vertices = bound["vertices"]

                left = min([v["x"] for v in vertices])
                top = min([v["y"] for v in vertices])
                right = max([v["x"] for v in vertices])
                bottom = max([v["y"] for v in vertices])
                # Crop the image based on Google's bounding box
                cropped_image = original_image.crop((left, top, right, bottom))
                pixel_values = self.trOCR_processor(cropped_image, return_tensors="pt").pixel_values

                # Move pixel values to the appropriate device
                pixel_values = pixel_values.to(self.device)

                generated_ids = self.trOCR_model.generate(pixel_values, max_new_tokens=MAX_NEW_TOKENS)
                extracted_text = self.trOCR_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                self.trOCR_texts.append(extracted_text)

                # For plotting
                word_length = max(vertex.get('x') for vertex in vertices) - min(vertex.get('x') for vertex in vertices)
                num_symbols = len(extracted_text)

                Yw = max(vertex.get('y') for vertex in vertices)
                Yo = Yw - min(vertex.get('y') for vertex in vertices)
                X = word_length / num_symbols if num_symbols > 0 else 0
                H = int(X + (Yo * 0.1))
                height.append(H)

                map_dict = {
                    "vertices": vertices,
                    "text": extracted_text  # Use the text extracted by trOCR
                }
                text_to_box_mapping.append(map_dict)
                characters.append(extracted_text)
                confidences.append(CONFIDENCES)

            median_height = statistics.median(height) if height else 0
            median_heights = [median_height * 1.5] * len(characters)

            self.trOCR_texts = ' '.join(self.trOCR_texts)

            self.trOCR_text_to_box_mapping = text_to_box_mapping
            self.trOCR_bounds_flat = available_bounds
            self.trOCR_height = median_heights
            self.trOCR_confidences = confidences
            self.trOCR_characters = characters

            if self.OCR_option in ['normal',]:
                self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
                self.OCR_JSON_to_file['OCR_trOCR'] = self.trOCR_texts
                logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
                return f"Google_OCR_Standard:\n{self.normal_organized_text}\n\ntrOCR:\n{self.trOCR_texts}"

            if self.OCR_option in ['hand',]:
                self.OCR_JSON_to_file['OCR_handwritten'] = self.hand_organized_text
                self.OCR_JSON_to_file['OCR_trOCR'] = self.trOCR_texts
                logger.info(f"Google_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
                return f"Google_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}"

            if self.OCR_option in ['both',]:
                self.OCR_JSON_to_file['OCR_printed'] = self.normal_organized_text
                self.OCR_JSON_to_file['OCR_handwritten'] = self.hand_organized_text
                self.OCR_JSON_to_file['OCR_trOCR'] = self.trOCR_texts
                logger.info(f"Google_OCR_Standard:\n{self.normal_organized_text}\n\nGoogle_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}")
                return f"Google_OCR_Standard:\n{self.normal_organized_text}\n\nGoogle_OCR_Handwriting:\n{self.hand_organized_text}\n\ntrOCR:\n{self.trOCR_texts}"
            else:
                raise ValueError(f"Unsupported OCR_option: {self.OCR_option}")

    @staticmethod
    def confidence_to_color(confidence):
        """Convert a confidence level to a color ranging from red (low) to green (high)."""
        hue = (confidence - 0.5) * 120 / 0.5
        r, g, b = colorsys.hls_to_rgb(hue / 360, 0.5, 1)
        return (int(r * 255), int(g * 255), int(b * 255))

    def render_text_on_black_image(self, option):
        bounds_flat = getattr(self, f'{option}_bounds_flat', [])
        heights = getattr(self, f'{option}_height', [])
        confidences = getattr(self, f'{option}_confidences', [])
        characters = getattr(self, f'{option}_characters', [])

        original_image = Image.open(self.path)
        width, height = original_image.size
        black_image = Image.new("RGB", (width, height), "black")
        draw = ImageDraw.Draw(black_image)

        for bound, confidence, char_height, character in zip(bounds_flat, confidences, heights, characters):
            font_size = int(char_height)
            font = ImageFont.load_default().font_variant(size=font_size)
            if option == 'trOCR':
                color = (0, 170, 255)
            else:
                color = OCRGoogle.confidence_to_color(confidence)
            position = (bound["vertices"][0]["x"], bound["vertices"][0]["y"] - char_height)
            draw.text(position, character, fill=color, font=font)

        return black_image

    def merge_images(self, image1, image2):
        width1, height1 = image1.size
        width2, height2 = image2.size

        merged_image = Image.new("RGB", (width1 + width2, max([height1, height2])))
        merged_image.paste(image1, (0, 0))
        merged_image.paste(image2, (width1, 0))
        return merged_image

    def draw_boxes(self, option):
        bounds = getattr(self, f'{option}_bounds', [])
        bounds_word = getattr(self, f'{option}_bounds_word', [])
        confidences = getattr(self, f'{option}_confidences', [])

        draw = ImageDraw.Draw(self.image)
        width, height = self.image.size
        if min([width, height]) > 4000:
            line_width_thick = int((width + height) / 2 * 0.0025)  # Adjust line width for character level
            line_width_thin = 1
        else:
            line_width_thick = int((width + height) / 2 * 0.005)  # Adjust line width for character level
            line_width_thin = 1  # int((width + height) / 2 * 0.001)

        for bound in bounds_word:
            draw.polygon(
                [
                    bound["vertices"][0]["x"], bound["vertices"][0]["y"],
                    bound["vertices"][1]["x"], bound["vertices"][1]["y"],
                    bound["vertices"][2]["x"], bound["vertices"][2]["y"],
                    bound["vertices"][3]["x"], bound["vertices"][3]["y"],
                ],
                outline=OCRGoogle.BBOX_COLOR,
                width=line_width_thin
            )

        # Draw a line segment at the bottom of each handwritten character
        for bound, confidence in zip(bounds, confidences):
            color = OCRGoogle.confidence_to_color(confidence)
            # Use the bottom two vertices of the bounding box for the line
            bottom_left = (bound["vertices"][3]["x"], bound["vertices"][3]["y"] + line_width_thick)
            bottom_right = (bound["vertices"][2]["x"], bound["vertices"][2]["y"] + line_width_thick)
            draw.line([bottom_left, bottom_right], fill=color, width=line_width_thick)

        return self.image

    def detect_text(self):
        client = vision.ImageAnnotatorClient()
        with io.open(self.path, 'rb') as image_file:
            content = image_file.read()
        image = vision.Image(content=content)
        response = client.document_text_detection(image=image)
        texts = response.text_annotations

        if response.error.message:
            raise Exception(
                '{}\nFor more info on error messages, check: '
                'https://cloud.google.com/apis/design/errors'.format(
                    response.error.message))

        bounds = []
        bounds_word = []
        text_to_box_mapping = []
        bounds_flat = []
        height_flat = []
        confidences = []
        characters = []
        organized_text = ""
        paragraph_count = 0

        for text in texts[1:]:
            vertices = [{"x": vertex.x, "y": vertex.y} for vertex in text.bounding_poly.vertices]
            map_dict = {
                "vertices": vertices,
                "text": text.description
            }
            text_to_box_mapping.append(map_dict)

        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                # paragraph_count += 1
                # organized_text += f'OCR_paragraph_{paragraph_count}:\n'  # Add paragraph label
                for paragraph in block.paragraphs:
                    avg_H_list = []
                    for word in paragraph.words:
                        Yw = max(vertex.y for vertex in word.bounding_box.vertices)
                        # Calculate the width of the word and divide by the number of symbols
                        word_length = max(vertex.x for vertex in word.bounding_box.vertices) - min(vertex.x for vertex in word.bounding_box.vertices)
                        num_symbols = len(word.symbols)
                        if num_symbols <= 3:
                            H = int(Yw - min(vertex.y for vertex in word.bounding_box.vertices))
                        else:
                            Yo = Yw - min(vertex.y for vertex in word.bounding_box.vertices)
                            X = word_length / num_symbols if num_symbols > 0 else 0
                            H = int(X + (Yo * 0.1))
                        avg_H_list.append(H)
                    avg_H = int(mean(avg_H_list))

                    words_in_para = []
                    for word in paragraph.words:
                        # Get word-level bounding box
                        bound_word_dict = {
                            "vertices": [
                                {"x": vertex.x, "y": vertex.y} for vertex in word.bounding_box.vertices
                            ]
                        }
                        bounds_word.append(bound_word_dict)

                        Y = max(vertex.y for vertex in word.bounding_box.vertices)
                        word_x_start = min(vertex.x for vertex in word.bounding_box.vertices)
                        word_x_end = max(vertex.x for vertex in word.bounding_box.vertices)
                        num_symbols = len(word.symbols)
                        symbol_width = (word_x_end - word_x_start) / num_symbols if num_symbols > 0 else 0

                        current_x_position = word_x_start

                        characters_ind = []
                        for symbol in word.symbols:
                            bound_dict = {
                                "vertices": [
                                    {"x": vertex.x, "y": vertex.y} for vertex in symbol.bounding_box.vertices
                                ]
                            }
                            bounds.append(bound_dict)

                            # Create flat bounds with adjusted x position
                            bounds_flat_dict = {
                                "vertices": [
                                    {"x": current_x_position, "y": Y},
                                    {"x": current_x_position + symbol_width, "y": Y}
                                ]
                            }
                            bounds_flat.append(bounds_flat_dict)
                            current_x_position += symbol_width

                            height_flat.append(avg_H)
                            confidences.append(round(symbol.confidence, 4))
                            characters_ind.append(symbol.text)
                            characters.append(symbol.text)

                        words_in_para.append(''.join(characters_ind))
                    paragraph_text = ' '.join(words_in_para)  # Join words in paragraph
                    organized_text += paragraph_text + ' '  # + '\n'

        # median_height = statistics.median(height_flat) if height_flat else 0
        # median_heights = [median_height] * len(characters)

        self.normal_cleaned_text = texts[0].description if texts else ''
        self.normal_organized_text = organized_text
        self.normal_bounds = bounds
        self.normal_bounds_word = bounds_word
        self.normal_text_to_box_mapping = text_to_box_mapping
        self.normal_bounds_flat = bounds_flat
        # self.normal_height = median_heights  # height_flat
        self.normal_height = height_flat
        self.normal_confidences = confidences
        self.normal_characters = characters

    def detect_handwritten_ocr(self):
        client = vision_beta.ImageAnnotatorClient()
        with open(self.path, "rb") as image_file:
            content = image_file.read()

        image = vision_beta.Image(content=content)
        image_context = vision_beta.ImageContext(language_hints=["en-t-i0-handwrit"])
        response = client.document_text_detection(image=image, image_context=image_context)
        texts = response.text_annotations

        if response.error.message:
            raise Exception(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(response.error.message)
            )

        bounds = []
        bounds_word = []
        bounds_flat = []
        height_flat = []
        confidences = []
        characters = []
        organized_text = ""
        paragraph_count = 0
        text_to_box_mapping = []

        for text in texts[1:]:
            vertices = [{"x": vertex.x, "y": vertex.y} for vertex in text.bounding_poly.vertices]
            map_dict = {
                "vertices": vertices,
                "text": text.description
            }
            text_to_box_mapping.append(map_dict)

        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                # paragraph_count += 1
                # organized_text += f'\nOCR_paragraph_{paragraph_count}:\n'  # Add paragraph label
                for paragraph in block.paragraphs:
                    avg_H_list = []
                    for word in paragraph.words:
                        Yw = max(vertex.y for vertex in word.bounding_box.vertices)
                        # Calculate the width of the word and divide by the number of symbols
                        word_length = max(vertex.x for vertex in word.bounding_box.vertices) - min(vertex.x for vertex in word.bounding_box.vertices)
                        num_symbols = len(word.symbols)
                        if num_symbols <= 3:
                            H = int(Yw - min(vertex.y for vertex in word.bounding_box.vertices))
                        else:
                            Yo = Yw - min(vertex.y for vertex in word.bounding_box.vertices)
                            X = word_length / num_symbols if num_symbols > 0 else 0
                            H = int(X + (Yo * 0.1))
                        avg_H_list.append(H)
                    avg_H = int(mean(avg_H_list))

                    words_in_para = []
                    for word in paragraph.words:
                        # Get word-level bounding box
                        bound_word_dict = {
                            "vertices": [
                                {"x": vertex.x, "y": vertex.y} for vertex in word.bounding_box.vertices
                            ]
                        }
                        bounds_word.append(bound_word_dict)

                        Y = max(vertex.y for vertex in word.bounding_box.vertices)
                        word_x_start = min(vertex.x for vertex in word.bounding_box.vertices)
                        word_x_end = max(vertex.x for vertex in word.bounding_box.vertices)
                        num_symbols = len(word.symbols)
                        symbol_width = (word_x_end - word_x_start) / num_symbols if num_symbols > 0 else 0

                        current_x_position = word_x_start

                        characters_ind = []
                        for symbol in word.symbols:
                            bound_dict = {
                                "vertices": [
                                    {"x": vertex.x, "y": vertex.y} for vertex in symbol.bounding_box.vertices
                                ]
                            }
                            bounds.append(bound_dict)

                            # Create flat bounds with adjusted x position
                            bounds_flat_dict = {
                                "vertices": [
                                    {"x": current_x_position, "y": Y},
                                    {"x": current_x_position + symbol_width, "y": Y}
                                ]
                            }
                            bounds_flat.append(bounds_flat_dict)
                            current_x_position += symbol_width

                            height_flat.append(avg_H)
                            confidences.append(round(symbol.confidence, 4))
                            characters_ind.append(symbol.text)
                            characters.append(symbol.text)

                        words_in_para.append(''.join(characters_ind))
                    paragraph_text = ' '.join(words_in_para)  # Join words in paragraph
                    organized_text += paragraph_text + ' '  # + '\n'

        # median_height = statistics.median(height_flat) if height_flat else 0
        # median_heights = [median_height] * len(characters)

        self.hand_cleaned_text = response.text_annotations[0].description if response.text_annotations else ''
        self.hand_organized_text = organized_text
        self.hand_bounds = bounds
        self.hand_bounds_word = bounds_word
        self.hand_bounds_flat = bounds_flat
        self.hand_text_to_box_mapping = text_to_box_mapping
        # self.hand_height = median_heights  # height_flat
        self.hand_height = height_flat
        self.hand_confidences = confidences
        self.hand_characters = characters

    def process_image(self, do_create_OCR_helper_image, logger):
        if self.OCR_option in ['normal', 'both']:
            self.detect_text()
        if self.OCR_option in ['hand', 'both']:
            self.detect_handwritten_ocr()
        if self.OCR_option not in ['normal', 'hand', 'both']:
            self.OCR_option = 'both'
            self.detect_text()
            self.detect_handwritten_ocr()

        ### Optionally add trOCR to the self.OCR for additional context
        self.OCR = self.detect_text_with_trOCR_using_google_bboxes(self.do_use_trOCR, logger)

        if do_create_OCR_helper_image:
            self.image = Image.open(self.path)

            if self.OCR_option in ['normal', 'both']:
                image_with_boxes_normal = self.draw_boxes('normal')
                text_image_normal = self.render_text_on_black_image('normal')
                self.merged_image_normal = self.merge_images(image_with_boxes_normal, text_image_normal)

            if self.OCR_option in ['hand', 'both']:
                image_with_boxes_hand = self.draw_boxes('hand')
                text_image_hand = self.render_text_on_black_image('hand')
                self.merged_image_hand = self.merge_images(image_with_boxes_hand, text_image_hand)

            if self.do_use_trOCR:
                text_image_trOCR = self.render_text_on_black_image('trOCR')

            ### Merge final overlay image
            ### [original, normal bboxes, normal text]
            if self.OCR_option in ['normal']:
                self.overlay_image = self.merge_images(Image.open(self.path), self.merged_image_normal)
            ### [original, hand bboxes, hand text]
            elif self.OCR_option in ['hand']:
                self.overlay_image = self.merge_images(Image.open(self.path), self.merged_image_hand)
            ### [original, normal bboxes, normal text, hand bboxes, hand text]
            else:
                self.overlay_image = self.merge_images(Image.open(self.path), self.merge_images(self.merged_image_normal, self.merged_image_hand))

            if self.do_use_trOCR:
                self.overlay_image = self.merge_images(self.overlay_image, text_image_trOCR)
        else:
            self.merged_image_normal = None
            self.merged_image_hand = None
            self.overlay_image = Image.open(self.path)
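
# --------------------------------------------------------------------------- #
# Example usage sketch (illustrative only). It assumes Google Cloud Vision
# credentials are already configured for the environment, and the TrOCR
# checkpoint name, image path, and output filename below are placeholders,
# not values prescribed by this module. The cfg dict contains only the keys
# that OCRGoogle.__init__() actually reads.
# --------------------------------------------------------------------------- #
if __name__ == '__main__':
    import logging
    import torch
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Hypothetical checkpoint; any TrOCR checkpoint loadable with
    # VisionEncoderDecoderModel should work the same way.
    trOCR_model_version = 'microsoft/trocr-base-handwritten'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    trOCR_processor = TrOCRProcessor.from_pretrained(trOCR_model_version)
    trOCR_model = VisionEncoderDecoderModel.from_pretrained(trOCR_model_version).to(device)

    # Minimal config with only the keys read by OCRGoogle.__init__()
    cfg = {
        'leafmachine': {
            'project': {
                'do_use_trOCR': True,
                'OCR_option': 'both',  # 'normal', 'hand', or 'both'
            }
        }
    }

    ocr = OCRGoogle('path/to/label_image.jpg', cfg, trOCR_model_version,
                    trOCR_model, trOCR_processor, device)
    ocr.process_image(do_create_OCR_helper_image=True, logger=logger)

    print(ocr.OCR)                             # combined OCR text
    ocr.overlay_image.save('ocr_overlay.jpg')  # original + bbox/text panels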