"""Turn a chat screenshot into a plain-text transcript using PaddleOCR.

Pipeline: decode the image, cut very tall screenshots into slices, run OCR
on every slice, drop UI noise (timestamps, voice-message durations, system
hints), guess the speaker from the bubble background colour and merge
consecutive text lines that belong to the same bubble.
"""

import math
import re
import tempfile
from pathlib import Path

# draw_ocr / Image are not used by the pipeline itself; the imports are kept
# so external code relying on them being re-exported here keeps working.
from paddleocr import PaddleOCR, draw_ocr  # noqa: F401
from PIL import Image  # noqa: F401
import numpy as np
import cv2

# Width (in pixels) every image is normalised to before colour sampling.
TARGET_WIDTH = 541
# Screenshots at least this tall are cut into several slices before OCR.
MAX_SLICE_HEIGHT = 2000
# OCR lines with a recognition score below this value are ignored.
MIN_CONFIDENCE = 0.7
# A pixel whose three channels all exceed this value counts as "white".
WHITE_THRESHOLD = 160

# Strings produced by the chat UI itself rather than by a speaker.
_UI_NOISE = frozenset({
    '...',
    '我通过了你的朋友验证请求,现在',
    '我们可以开始聊天了',
    '以上是打招呼的内容',
    '三',
    '+',
    '你撤回了一条消息',
    'HD',
    ':',
    '按住说话',
    '以下是新消息',
    '川',
})

# One pattern for every supported timestamp layout:
#   [2024年4月12日 | 4月12日 | 昨天 | 星期X] [凌晨|上午|下午|晚上] H:MM
# Both the date prefix and the day-period are optional.
_TIMESTAMP_RE = re.compile(
    r'^(?:\d{4}年\d{1,2}月\d{1,2}日|\d{1,2}月\d{1,2}日|昨天|星期[一二三四五六日])?'
    r'(?:凌晨|上午|下午|晚上)?'
    r'\d{1,2}:\d{1,2}$'
)

# Duration label of a voice message, e.g. 12"
_VOICE_RE = re.compile(r'^\d{1,3}"$')


def image_split(imageBytes):
    """Decode an encoded image and cut it into horizontal slices.

    Args:
        imageBytes: Raw bytes of an encoded image (PNG, JPEG, ...).

    Returns:
        A list of BGR ``numpy`` arrays. Images shorter than
        ``MAX_SLICE_HEIGHT`` are returned as a single element; taller images
        are cut into ``ceil(h / MAX_SLICE_HEIGHT)`` slices of (almost) equal
        height that together cover every row of the image.

    Raises:
        ValueError: If the bytes are empty or cannot be decoded.
    """
    if not imageBytes:
        raise ValueError("imageBytes is empty")
    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError("imageBytes could not be decoded as an image")

    h = image.shape[0]
    if h < MAX_SLICE_HEIGHT:
        return [image]

    num = math.ceil(h / MAX_SLICE_HEIGHT)
    # Integer boundaries; the last one is exactly h so no trailing rows are
    # lost when h is not divisible by num.
    bounds = [i * h // num for i in range(num + 1)]
    return [image[top:bottom].copy() for top, bottom in zip(bounds, bounds[1:])]


def image_resize(image):
    """Scale an image to ``TARGET_WIDTH`` pixels wide, keeping aspect ratio.

    Args:
        image: Image as a ``numpy`` array (height, width[, channels]).

    Returns:
        The resized image.
    """
    h, w = image.shape[:2]
    scale = TARGET_WIDTH / float(w)
    # Never ask OpenCV for a zero-height image (extremely wide inputs).
    dim = (TARGET_WIDTH, max(1, int(h * scale)))
    return cv2.resize(image, dim, interpolation=cv2.INTER_AREA)


def msg_type(line_list, image, ori_width):
    """Extract text, position and sampled colour for one OCR line.

    Args:
        line_list: One OCR entry of the form ``[box, (text, score)]`` where
            ``box`` holds four ``[x, y]`` corner points (indices 0 and 3 are
            read as the top and bottom of the left edge).
        image: The image already resized to ``TARGET_WIDTH``.
        ori_width: Width of the image the OCR coordinates refer to.

    Returns:
        ``(words, (x, y_mid), (y_min, y_max), color)``. ``(x, y_mid)`` is the
        sampled pixel, ``y_min`` / ``y_max`` are the scaled vertical extent
        of the box and ``color`` is the pixel value at the sample point.
        If ``line_list`` does not have two elements, every field is ``None``
        (``(None, (None, None), (None, None), None)``).
    """
    if len(line_list) != 2:
        # Same arity as the normal return so callers can always unpack.
        return None, (None, None), (None, None), None

    resize_ratio = TARGET_WIDTH / ori_width
    box = line_list[0]
    words = line_list[1][0]

    x = int(box[0][0] * resize_ratio)
    y_min = int(box[0][1] * resize_ratio)
    y_max = int(box[3][1] * resize_ratio)
    # Sample at the vertical centre of the box's left edge, which is intended
    # to give a more reliable read of the bubble background colour.
    y_mid = y_min + int((y_max - y_min) / 2)

    # Rounding (or a box touching the border) can push the sample point just
    # outside the image; clamp instead of raising or wrapping around.
    rows, cols = image.shape[:2]
    x = min(max(x, 0), cols - 1)
    y_mid = min(max(y_mid, 0), rows - 1)

    color = image[y_mid][x]
    return words, (x, y_mid), (y_min, y_max), color


def _is_timestamp(words):
    """Return True if ``words`` is one of the supported timestamp layouts."""
    return isinstance(words, str) and _TIMESTAMP_RE.match(words) is not None


def date_delete(words, line_text, line, dialog, res):
    """Tell whether an OCR line is a timestamp that must be discarded.

    Kept for backward compatibility with the original call signature: when
    the timestamp is the last line of ``res`` the pending ``line_text`` is
    appended to ``dialog`` before returning.

    Args:
        words: Recognised text of the current line.
        line_text: Message text accumulated so far.
        line: The current OCR entry.
        dialog: List of finished messages (mutated in place).
        res: All OCR entries of the current page.

    Returns:
        ``True`` if ``words`` is a timestamp, ``False`` otherwise.
    """
    if not _is_timestamp(words):
        return False
    if res and line == res[-1]:
        dialog.append(line_text)
    return True


def _is_noise(words):
    """Return True for UI labels, timestamps and voice-duration markers."""
    return (
        words in _UI_NOISE
        or _is_timestamp(words)
        or _VOICE_RE.match(words) is not None
    )


def _group_messages(res, image, ori_width):
    """Merge the OCR lines of one page into speaker-tagged messages.

    Args:
        res: OCR entries of one page, in reading order.
        image: The page resized to ``TARGET_WIDTH`` (used for colour lookup).
        ori_width: Width of the page the OCR coordinates refer to.

    Returns:
        A list of strings such as ``'Speaker1:hello'``.
    """
    messages = []
    current = ''
    last_bottom = 0
    for line in res:
        words, _point, (y_min, y_max), color = msg_type(line, image, ori_width)
        if words is None or line[1][1] < MIN_CONFIDENCE:
            continue
        if _is_noise(words):
            continue

        # A line closer to the previous one than its own height is treated as
        # a continuation of the same bubble.
        if current and (y_min - last_bottom) < (y_max - y_min):
            current += words
        else:
            if current:
                messages.append(current)
            # White background -> Speaker1, anything else -> Speaker2
            # (colour heuristic, still needs broader testing).
            is_white = all(int(c) > WHITE_THRESHOLD for c in color[:3])
            speaker = 'Speaker1' if is_white else 'Speaker2'
            current = speaker + ':' + words
        last_bottom = y_max

    if current:
        messages.append(current)
    return messages


def OCR_text(img_path: str, output_path: str) -> Path:
    """Run OCR on a chat screenshot and append the transcript to a file.

    Args:
        img_path: Path of the screenshot to read.
        output_path: Directory that receives ``result.txt`` (created if it
            does not exist). The transcript is appended to the file.

    Returns:
        Path of the written ``result.txt``.

    Raises:
        ValueError: If the file content cannot be decoded as an image.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")
    with open(img_path, "rb") as f:
        image_bytes = f.read()

    dialog = []
    for sub_image in image_split(image_bytes):
        # OCR each slice on its own so the returned coordinates refer to the
        # very image whose pixels are sampled afterwards.
        result = ocr.ocr(sub_image, cls=True)
        ori_width = sub_image.shape[1]
        resized = image_resize(sub_image)
        for res in result or []:
            if not res:
                # PaddleOCR yields None / [] for a page without any text.
                continue
            dialog.extend(_group_messages(res, resized, ori_width))

    # Save the recognised dialogue as text.
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_txt_path = out_dir / "result.txt"
    with open(output_txt_path, 'a+', encoding='utf-8') as f:
        f.writelines(dia + '\n' for dia in dialog)

    print(f'output dir is: {output_txt_path}')
    return output_txt_path


if __name__ == '__main__':
    img_path = './image/3.png'
    output_path = tempfile.mkdtemp()
    OCR_text(img_path, output_path)