|
from paddleocr import PaddleOCR, draw_ocr |
|
from PIL import Image |
|
import math |
|
import numpy as np |
|
import cv2 |
|
import re |
|
import tempfile |
|
from pathlib import Path |
|
|
|
def image_split(imageBytes, max_height=2000):
    """Decode an encoded image and cut it into horizontal strips.

    Args:
        imageBytes: Raw bytes of an encoded image file, as accepted by
            ``cv2.imdecode``.
        max_height: Images at least this many pixels tall are split; each
            strip is at most this tall. Defaults to 2000.

    Returns:
        A list of images in top-to-bottom order. An image shorter than
        ``max_height`` is returned as the single element of the list.

    Raises:
        ValueError: If ``max_height`` is not positive or the bytes cannot be
            decoded as an image.
    """
    if max_height < 1:
        raise ValueError("image_split: max_height must be a positive integer")

    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals failure by returning None instead of raising.
        raise ValueError("image_split: could not decode image bytes")

    h, w = image.shape[:2]
    if h < max_height:
        return [image]

    num = int(math.ceil(h / max_height))
    # Round the strip height up so that num strips cover every row; the last
    # strip is simply shorter when h is not a multiple of num.
    strip_height = int(math.ceil(h / num))

    sub_images = []
    for i in range(num):
        start_y = i * strip_height
        end_y = min(h, start_y + strip_height)
        # Copy so that each strip owns its pixels instead of aliasing `image`.
        sub_images.append(image[start_y:end_y, 0:w].copy())
    return sub_images
|
|
|
def image_resize(image):
    """Scale ``image`` to a width of 541 pixels, keeping its aspect ratio.

    The target width is the same constant that ``msg_type`` uses to map OCR
    coordinates onto the resized image, so the two must stay in sync.

    Args:
        image: Image array whose first two ``shape`` entries are
            (height, width).

    Returns:
        The resized image produced by ``cv2.resize`` with area interpolation.
    """
    target_width = 541
    src_height, src_width = image.shape[:2]
    scale = target_width / float(src_width)
    target_size = (target_width, int(src_height * scale))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
|
|
|
def msg_type(line_list, image, ori_width):
    """Extract the text of one OCR line and sample the pixel beside it.

    Args:
        line_list: One OCR entry of the form ``[box, text_info]``. ``box`` is
            indexed as a sequence of ``[x, y]`` points, of which point 0 and
            point 3 are read; ``text_info[0]`` is the recognised text.
            (Presumably point 0 is the top-left and point 3 the bottom-left
            corner -- confirm against the OCR engine's output format.)
        image: Image that was resized to 541 pixels wide; indexed as
            ``image[row][column]``.
        ori_width: Width of the image the OCR coordinates refer to, used to
            rescale them onto ``image``.

    Returns:
        ``(words, (x, y_mid), (y_min, y_max), color)`` where ``color`` is the
        pixel at the vertical middle of the box's left edge. When
        ``line_list`` does not have exactly two elements, the same shape is
        returned filled with ``None`` so callers can always unpack it:
        ``(None, (None, None), (None, None), None)``.
    """
    if len(line_list) != 2:
        return None, (None, None), (None, None), None

    box, text_info = line_list
    resize_ratio = 541 / ori_width

    words = text_info[0]
    x = int(box[0][0] * resize_ratio)
    y_min = int(box[0][1] * resize_ratio)
    y_max = int(box[3][1] * resize_ratio)
    y_mid = y_min + int((y_max - y_min) / 2)

    # Clamp the sample point: rescaling and boxes touching the border could
    # otherwise index past the image, and negative values would wrap around.
    x = min(max(x, 0), len(image[0]) - 1)
    y_mid = min(max(y_mid, 0), len(image) - 1)
    color = image[y_mid][x]

    return words, (x, y_mid), (y_min, y_max), color
|
|
|
# Timestamp layouts recognised by date_delete. Every pattern is applied with
# ``match`` (anchored at the start) and ends in ``$``, so the whole text must
# be a timestamp. The optional prefixes are: a full or month/day date,
# "昨天" (yesterday), a weekday name, and a day-period marker
# (凌晨/上午/下午/晚上).
_TIMESTAMP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'^\d{1,2}:\d{1,2}$',
        r'(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^昨天\d{1,2}:\d{1,2}$',
        r'昨天(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'^(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',
        r'^(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
        r'(星期一|星期二|星期三|星期四|星期五|星期六|星期日)\d{1,2}:\d{1,2}$',
        r'(星期一|星期二|星期三|星期四|星期五|星期六|星期日)(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',
    )
)


def date_delete(words, line_text, line, dialog, res):
    """Tell whether ``words`` is a timestamp that should be dropped.

    Args:
        words: Recognised text of the current OCR line.
        line_text: Message text accumulated so far by the caller.
        line: The current OCR line entry.
        dialog: List of finished messages; mutated in place.
        res: All OCR line entries of the current page.

    Returns:
        ``True`` when ``words`` matches one of the timestamp patterns,
        otherwise ``None``.

    Side effects:
        On a match, if ``line`` equals the last entry of ``res``, the pending
        ``line_text`` is appended to ``dialog`` so it is not lost when the
        caller skips this line.
    """
    if not any(pattern.match(words) for pattern in _TIMESTAMP_PATTERNS):
        return None

    if line == res[-1]:
        dialog.append(line_text)
    return True
|
|
|
# Recognised fragments that are dropped from the transcript. Presumably these
# are UI labels and system notices of the chat app rather than message text
# (TODO confirm against real screenshots).
_IGNORED_WORDS = (
    '...', '我通过了你的朋友验证请求,现在', '我们可以开始聊天了', '以上是打招呼的内容', '三',
    '+', '你撤回了一条消息', 'HD', ':', '按住说话', '以下是新消息', '川',
)

# One to three digits followed by a double quote, e.g. 12" -- presumably the
# duration label of a voice message (TODO confirm).
_DURATION_PATTERN = re.compile(r'^\d{1,3}"$')

# OCR lines recognised with a lower confidence than this are discarded.
_MIN_CONFIDENCE = 0.7


def OCR_text(img_path: str, output_path: str) -> Path:
    """Run OCR on a chat screenshot and append the dialogue to a text file.

    The image is cut into strips by ``image_split`` and each strip is
    recognised separately, so the OCR coordinates always refer to the same
    strip whose pixels are sampled to tell the speakers apart. Consecutive OCR
    lines are merged into one message while the vertical gap to the previous
    line is smaller than the current line's height. A message is labelled
    ``Speaker1:`` when the pixel sampled by ``msg_type`` has all three
    channels above 160, otherwise ``Speaker2:``. Low-confidence lines,
    timestamps, duration labels and the fragments in ``_IGNORED_WORDS`` are
    skipped.

    Args:
        img_path: Path of the screenshot to read.
        output_path: Directory that receives ``result.txt``; it is created if
            it does not exist.

    Returns:
        Path of the ``result.txt`` file the dialogue was appended to.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")

    with open(img_path, "rb") as f:
        image_list = image_split(f.read())

    output_txt_path = Path(output_path) / "result.txt"
    dialog = []

    for tile in image_list:
        ori_width = tile.shape[1]
        # Recognise the strip itself (not the whole file) so that box
        # coordinates and the colour lookup below share one coordinate system.
        result = ocr.ocr(tile, cls=True)
        resized = image_resize(tile)

        for res in result or []:
            if not res:
                # Nothing was detected on this page.
                continue

            # Every page starts a fresh message; text flushed for a previous
            # page must not be merged into (and written again with) this one.
            line_text = ''
            y_max_last = 0

            for line in res:
                is_last = line == res[-1]

                if len(line) != 2 or line[1][1] < _MIN_CONFIDENCE:
                    skip = True
                else:
                    words, _, (y_min, y_max), color = msg_type(line, resized, ori_width)
                    if words in _IGNORED_WORDS:
                        skip = True
                    elif date_delete(words, line_text, line, dialog, res):
                        # date_delete already flushed line_text if needed.
                        continue
                    else:
                        skip = bool(_DURATION_PATTERN.match(words))

                if skip:
                    # The pending message still has to be emitted when the
                    # page ends on a skipped line.
                    if is_last:
                        dialog.append(line_text)
                    continue

                is_bright = color[0] > 160 and color[1] > 160 and color[2] > 160
                speaker = 'Speaker1:' if is_bright else 'Speaker2:'

                if line_text == '':
                    line_text = speaker + words
                elif (y_min - y_max_last) < (y_max - y_min):
                    # Gap smaller than one line height: same message.
                    line_text = line_text + words
                else:
                    dialog.append(line_text)
                    line_text = speaker + words
                y_max_last = y_max

                if is_last:
                    dialog.append(line_text)

    output_txt_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_txt_path, 'a', encoding='utf-8') as f:
        # Flushes of an empty pending message would only add blank lines.
        f.writelines(dia + '\n' for dia in dialog if dia)

    print(f'output dir is: {output_txt_path}')
    return output_txt_path
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual run: recognise a sample screenshot and write the transcript into
    # a freshly created temporary directory.
    img_path, output_path = './image/3.png', tempfile.mkdtemp()
    OCR_text(img_path, output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|