File size: 9,094 Bytes
2bcf29e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import math
import numpy as np
import cv2
import re
import tempfile
from pathlib import Path

def image_split(imageBytes, max_height=2000):
    """Decode an encoded image and cut it into horizontal strips.

    Images shorter than ``max_height`` are returned whole. Taller images are
    cut into ``ceil(h / max_height)`` strips of near-equal height that
    together cover every row of the image (no trailing rows are dropped).

    Args:
        imageBytes: Encoded image file contents, in any bytes-like form that
            ``np.frombuffer`` accepts.
        max_height: Height in pixels at or above which the image is split.
            No strip is taller than this value.

    Returns:
        A list of image arrays ordered top to bottom. Strips are independent
        copies; an unsplit image is returned as decoded.

    Raises:
        ValueError: If ``max_height`` is not positive or the bytes cannot be
            decoded as an image.
    """
    if max_height <= 0:
        raise ValueError("max_height must be a positive number of pixels")

    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imdecode signals failure by returning None instead of raising.
        raise ValueError("imageBytes could not be decoded as an image")

    h = image.shape[0]
    if h < max_height:
        return [image]

    num = int(math.ceil(h / max_height))
    # Evenly spaced integer boundaries from 0 to h: the strip heights differ
    # by at most one row and the last strip always ends exactly at h.
    bounds = [i * h // num for i in range(num + 1)]
    return [image[top:bottom].copy() for top, bottom in zip(bounds, bounds[1:])]

def image_resize(image):
    """Scale ``image`` to a fixed width of 541 pixels, keeping its aspect ratio.

    Args:
        image: Image array whose first two ``shape`` entries are
            (height, width).

    Returns:
        The resized image produced by ``cv2.resize`` with ``INTER_AREA``
        interpolation.
    """
    target_width = 541
    src_height, src_width = image.shape[:2]
    # Same scale factor on both axes so the picture is not distorted.
    scale = target_width / float(src_width)
    target_size = (target_width, int(src_height * scale))
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

def msg_type(line_list, image, ori_width):
    """Extract text, anchor point, vertical extent and sampled colour of one OCR line.

    Box coordinates are scaled by ``541 / ori_width`` before they are used to
    index ``image``.
    NOTE(review): this presumes ``image`` was already resized to a width of
    541 pixels; confirm against the caller.

    Args:
        line_list: One OCR line as a 2-item sequence ``[box, (text, ...)]``.
            The code reads ``box[0]`` for the x and top y coordinate and
            ``box[3]`` for the bottom y coordinate.
        image: Pixel grid indexable as ``image[y][x]``.
        ori_width: Width of the image the box coordinates refer to.

    Returns:
        ``(words, (x, y_mid), (y_min, y_max), color)``. When ``line_list``
        does not have exactly two items every slot is ``None`` but the tuple
        keeps the same shape, so callers can unpack it unconditionally.
    """
    if len(line_list) != 2:
        # Previously a 3-tuple was returned here, which made the caller's
        # 4-way unpacking raise ValueError.
        return None, (None, None), (None, None), None

    resize_ratio = 541 / ori_width
    box, text_info = line_list
    words = text_info[0]
    x = int(box[0][0] * resize_ratio)
    y_min = int(box[0][1] * resize_ratio)
    y_max = int(box[3][1] * resize_ratio)
    # Sample at the vertical centre of the line's left edge so the colour
    # picked up is the bubble background rather than its border.
    y_mid = y_min + int((y_max - y_min) / 2)
    color = image[y_mid][x]

    return words, (x, y_mid), (y_min, y_max), color

def date_delete(words, line_text, line, dialog, res):
    """Report whether ``words`` is a chat timestamp that should be dropped.

    When ``words`` is a timestamp and ``line`` equals the last entry of
    ``res``, the pending ``line_text`` is appended to ``dialog`` first, so the
    message being assembled is not lost when the final OCR line is skipped.

    Args:
        words: Recognised text of the current OCR line.
        line_text: Dialogue text accumulated so far.
        line: The current OCR line, compared against ``res[-1]``.
        dialog: List of finished dialogue entries; may be appended to.
        res: All OCR lines of the current page.

    Returns:
        ``True`` if ``words`` matched a timestamp pattern, otherwise ``None``.
    """
    # Every pattern is applied with re.match, i.e. anchored at the start of
    # the string even where the pattern itself has no leading '^'.
    timestamp_patterns = (
        r'^\d{1,2}:\d{1,2}$',  # 24:24
        r'(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',  # <day period>24:24
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',  # 2024年4月12日24:24
        r'^(\d{4})年(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',  # full date + day period
        r'^昨天\d{1,2}:\d{1,2}$',  # yesterday 24:24
        r'昨天(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',  # yesterday + day period
        r'^(\d{1,2})月(\d{1,2})日\d{1,2}:\d{1,2}$',  # 4月12日24:24
        r'^(\d{1,2})月(\d{1,2})日(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',  # month/day + day period
        r'(星期一|星期二|星期三|星期四|星期五|星期六|星期日)\d{1,2}:\d{1,2}$',  # weekday 24:24
        r'(星期一|星期二|星期三|星期四|星期五|星期六|星期日)(凌晨|上午|下午|晚上)\d{1,2}:\d{1,2}$',  # weekday + day period
    )

    looks_like_timestamp = any(
        re.match(pattern, words) for pattern in timestamp_patterns
    )
    if not looks_like_timestamp:
        return None

    if line == res[-1]:
        dialog.append(line_text)
    return True

def OCR_text(img_path: str, output_path: str) -> Path:
    """Run PaddleOCR on a chat screenshot and append the dialogue to ``result.txt``.

    Each recognised line is filtered (low confidence, fixed UI strings,
    timestamps, voice-message durations), attributed to ``Speaker1`` or
    ``Speaker2`` from the colour sampled at the start of the line, and merged
    into the previous entry when the vertical gap to it is smaller than the
    line's own height.

    Args:
        img_path: Path of the screenshot to recognise.
        output_path: Directory that receives ``result.txt``; created if it
            does not exist.

    Returns:
        Path of the text file the dialogue entries were appended to.

    Raises:
        ValueError: If ``img_path`` cannot be decoded as an image.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")

    with open(img_path, "rb") as f:
        imageBytes = f.read()

    # OCR runs on the whole file, so its boxes are in full-image coordinates.
    # Colours are therefore sampled from the whole (resized) image. Sampling
    # from vertical slices indexed past the slice height on tall screenshots
    # and replayed every OCR line once per slice.
    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f'could not decode image: {img_path}')
    ori_width = image.shape[1]
    image = image_resize(image)

    result = ocr.ocr(img_path, cls=True)

    # Fixed UI strings and OCR artefacts that are never part of a message.
    skip_words = ('...', '我通过了你的朋友验证请求,现在', '我们可以开始聊天了', '以上是打招呼的内容', '三',
                  '+', '你撤回了一条消息', 'HD', ':', '按住说话', '以下是新消息', '川')
    voice_pattern = re.compile(r'^\d{1,3}"$')  # voice-message duration, e.g. 12"

    dialog = []
    line_text = ''
    y_max_last = 0

    for res in result or []:
        if not res:
            # NOTE(review): PaddleOCR appears to report a page without any
            # detected text as None; confirm against the installed version.
            continue

        for line in res:
            # Whatever happens to the last line of a page, the entry being
            # assembled must be flushed into ``dialog``.
            is_last = line == res[-1]

            if len(line) != 2 or line[1][1] < 0.7:  # malformed or low confidence
                if is_last:
                    dialog.append(line_text)
                continue

            words, _, (y_min, y_max), color = msg_type(line, image, ori_width)

            if words in skip_words or voice_pattern.match(words):
                if is_last:
                    dialog.append(line_text)
                continue

            # date_delete flushes ``line_text`` itself when ``line`` is last.
            if date_delete(words, line_text, line, dialog, res):
                continue

            # Speaker detection by colour (still needs further testing):
            # a light/white background is Speaker1, anything else Speaker2.
            if color[0] > 160 and color[1] > 160 and color[2] > 160:
                speaker = 'Speaker1:'
            else:
                speaker = 'Speaker2:'

            if line_text == '':
                line_text = speaker + words
            elif (y_min - y_max_last) < (y_max - y_min):
                # Gap smaller than one line height: same message, wrapped.
                line_text = line_text + words
            else:
                dialog.append(line_text)
                line_text = speaker + words
            y_max_last = y_max

            if is_last:
                dialog.append(line_text)

        # The page's pending entry has been flushed; start the next page
        # clean so it is not emitted a second time.
        line_text = ''

    # Save the recognised dialogue once. Previously the whole accumulated
    # list was re-appended on every loop pass, duplicating entries.
    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_txt_path = output_dir / "result.txt"
    with open(output_txt_path, 'a', encoding='utf-8') as f:
        f.writelines(dia + '\n' for dia in dialog)

    print(f'output dir is: {output_txt_path}')
    return output_txt_path





if __name__ == '__main__':
    # Script entry point: OCR the sample screenshot into a fresh temp directory.
    sample_image = './image/3.png'
    scratch_dir = tempfile.mkdtemp()
    OCR_text(sample_image, scratch_dir)