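"""OCR a chat screenshot (WeChat-style) into a speaker-labelled transcript.

Pipeline: split very tall screenshots into horizontal strips, run PaddleOCR,
drop timestamps, chat-client UI strings and low-confidence detections, assign
each remaining line to Speaker1 or Speaker2 from the bubble background colour,
and write the dialog to result.txt in the given output directory.

Requires paddleocr (with paddlepaddle), opencv-python, numpy and Pillow.
"""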
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import math
import numpy as np
import cv2
import re
import tempfile
from pathlib import Path
def image_split(imageBytes):
    """Decode image bytes and split very tall images into horizontal strips."""
    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    (h, w) = image.shape[:2]
    sub_images = []
    # If the height is 2000 pixels or more, split the image into strips
    if h >= 2000:
        num = int(math.ceil(h / 2000))
        height = int(h / num)
        for i in range(num):
            # Calculate the bounds of the sub-image
            startY = i * height
            endY = (i + 1) * height
            dim = (w, endY - startY)
            # Double-check that the dimensions are within bounds
            if dim[0] <= w and dim[1] <= h:
                sub_image = cv2.resize(image[startY:endY, 0:w], dim, interpolation=cv2.INTER_AREA)
                # Add the sub-image to the list
                sub_images.append(sub_image)
    else:
        sub_images.append(image)
    return sub_images
def image_resize(image):
    """Resize an image to a fixed width of 541 pixels, preserving aspect ratio."""
    (h, w) = image.shape[:2]
    WIDTH = 541
    # Scale factor from the original width to the target width
    r = WIDTH / float(w)
    dim = (WIDTH, int(h * r))
    image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
    return image
def msg_type(line_list, image, ori_width):
    """Return the text of an OCR line, its coordinates in the resized image,
    and the background colour sampled at the centre of its first character."""
    resize_ratio = 541 / ori_width
    if len(line_list) == 2:
        words = line_list[1][0]
        x = int(line_list[0][0][0] * resize_ratio)
        y_min = int(line_list[0][0][1] * resize_ratio)
        y_max = int(line_list[0][3][1] * resize_ratio)
        # Sample at the vertical centre of the first character so the bubble
        # background colour is read more reliably
        y_mid = y_min + int((y_max - y_min) / 2)
        color = image[y_mid][x]
        return words, (x, y_mid), (y_min, y_max), color
    else:
        return None, (None, None), (None, None), None
def date_delete(words, line_text, line, dialog, res):
    """Return True if `words` is a timestamp line that should be dropped.

    If the timestamp is the last OCR line, the pending dialog line is flushed
    first so it is not lost.
    """
    # Timestamp formats produced by the chat client, e.g. "23:59",
    # "2024年4月12日23:59", "昨天23:59", "4月12日23:59", "星期一23:59",
    # each optionally prefixed with 凌晨/上午/下午/晚上.
    time_patterns = [
        r'^\d{1,2}:\d{1,2}$',
        r'^(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^昨天\d{1,2}:\d{1,2}$',
        r'^昨天(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',
        r'^(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^(星期一|星期二|星期三|星期四|星期五|星期六|星期日)\d{1,2}:\d{1,2}$',
        r'^(星期一|星期二|星期三|星期四|星期五|星期六|星期日)(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
    ]
    for pattern in time_patterns:
        if re.match(pattern, words):
            if line == res[-1]:
                dialog.append(line_text)
            return True
    return False
def OCR_text(img_path: str, output_path: str) -> Path:
    """Run OCR on a chat screenshot and write a Speaker1/Speaker2 transcript
    to result.txt inside `output_path`. Returns the path of the text file."""
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")
    with open(img_path, "rb") as f:
        imageBytes = f.read()
    image_list = image_split(imageBytes)
    result = ocr.ocr(img_path, cls=True)
    dialog = []
    line_text = ''
    # Optional: visualise the OCR result
    # result0 = result[0]
    # image_RGB = Image.open(img_path).convert('RGB')
    # boxes = [line[0] for line in result0]
    # txts = [line[1][0] for line in result0]
    # scores = [line[1][1] for line in result0]
    # im_show = draw_ocr(image_RGB, boxes, txts, scores, font_path='./ppocr_img/fonts/simfang.ttf')
    # Image.fromarray(im_show).save('result.jpg')
    output_txt_path = Path(output_path) / "result.txt"
    for image in image_list:
        ori_width = image.shape[1]
        image = image_resize(image)
        for idx in range(len(result)):
            res = result[idx]
            for line in res:
                words, (x, y_mid), (y_min, y_max), color = msg_type(line, image, ori_width)
                if words is None:
                    continue
                # Drop low-confidence detections
                if line[1][1] < 0.7:
                    if line == res[-1]:
                        dialog.append(line_text)
                    continue
                # Drop chat-client UI strings that are not part of the conversation
                if words in ['...', '我通过了你的朋友验证请求,现在', '我们可以开始聊天了', '以上是打招呼的内容', '三',
                             '+', '你撤回了一条消息', 'HD', ':', '按住说话', '以下是新消息', '川']:
                    if line == res[-1]:
                        dialog.append(line_text)
                    continue
                # Drop timestamps
                if date_delete(words, line_text, line, dialog, res):
                    continue
                # Drop voice-message duration markers such as 12"
                data_pattern = re.compile(r'^\d{1,3}"$')
                if bool(data_pattern.match(words)):
                    if line == res[-1]:
                        dialog.append(line_text)
                    continue
                ### Decide the speaker from the bubble background colour (needs further testing)
                if color[0] > 160 and color[1] > 160 and color[2] > 160:
                    # White background: treat as Speaker1
                    if line_text == '':
                        line_text = 'Speaker1:' + words
                        y_max_last = y_max
                    elif (y_min - y_max_last) < (y_max - y_min):
                        # Gap to the previous line is less than one line height,
                        # so this line continues the same message
                        line_text = line_text + words
                        y_max_last = y_max
                    else:
                        dialog.append(line_text)
                        line_text = 'Speaker1:' + words
                        y_max_last = y_max
                else:
                    # Otherwise treat as Speaker2
                    if line_text == '':
                        line_text = 'Speaker2:' + words
                        y_max_last = y_max
                    elif (y_min - y_max_last) < (y_max - y_min):
                        line_text = line_text + words
                        y_max_last = y_max
                    else:
                        dialog.append(line_text)
                        line_text = 'Speaker2:' + words
                        y_max_last = y_max
                # Flush the pending line after the last OCR result
                if line == res[-1]:
                    dialog.append(line_text)
    ### Save the recognised dialog as a txt file
    with open(output_txt_path, 'a+', encoding='utf-8') as f:
        for dia in dialog:
            f.write(dia + '\n')
    print(f'output file is: {output_txt_path}')
    return output_txt_path
if __name__ == '__main__':
    img_path = './image/3.png'
    output_path = tempfile.mkdtemp()
    OCR_text(img_path, output_path)
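    # A possible follow-up (sketch): capture the returned path and print the transcript.
    # txt_path = OCR_text(img_path, output_path)
    # print(txt_path.read_text(encoding='utf-8'))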