# Spaces: Suevar / OCR
# Runtime error
# File size: 9,094 Bytes
# 2bcf29e

from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import math
import numpy as np
import cv2
import re
import tempfile
from pathlib import Path

def image_split(imageBytes):
    """Decode an encoded image and cut it into horizontal strips.

    Images shorter than 2000 rows are returned whole. Taller images are cut
    into ``ceil(h / 2000)`` strips whose heights differ by at most one row,
    so no strip exceeds 2000 rows and every row of the source lands in
    exactly one strip.

    Args:
        imageBytes: Raw bytes of an encoded image file, as read from disk.

    Returns:
        A list of colour image arrays (as produced by ``cv2.imdecode`` with
        ``IMREAD_COLOR``), ordered top to bottom. Strips are independent
        copies, not views into the decoded image.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    max_height = 2000

    image = cv2.imdecode(np.frombuffer(imageBytes, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals failure by returning None rather than raising.
        raise ValueError("image_split: could not decode image bytes")

    h = image.shape[0]
    if h < max_height:
        return [image]

    num = int(math.ceil(h / max_height))
    sub_images = []
    for i in range(num):
        # Proportional boundaries spread the division remainder over the
        # strips instead of silently dropping the bottom rows.
        start_y = i * h // num
        end_y = (i + 1) * h // num
        sub_images.append(image[start_y:end_y].copy())
    return sub_images

def image_resize(image):
    """Scale ``image`` to a fixed width of 541 pixels, keeping its aspect ratio.

    Args:
        image: Image array whose first two dimensions are (height, width).

    Returns:
        The resized image; its height is the source height multiplied by
        ``541 / width`` and truncated to an integer.
    """
    target_width = 541
    src_height, src_width = image.shape[:2]
    # One scale factor for both axes so the picture is not distorted.
    scale = target_width / float(src_width)
    new_size = (target_width, int(src_height * scale))
    return cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)

def msg_type(line_list, image, ori_width):
    """Extract text, sample point, vertical extent and background colour of one OCR line.

    Box coordinates are scaled by ``541 / ori_width`` so they address the
    541-pixel-wide resized screenshot passed as ``image``.

    Args:
        line_list: One OCR entry of the form ``[box, (text, score)]``. ``box``
            is a sequence of ``[x, y]`` points; point 0 supplies the left x
            and top y, point 3 supplies the bottom y (presumably PaddleOCR's
            clockwise-from-top-left corner order -- confirm against the OCR
            version in use).
        image: Resized screenshot, indexable as ``image[y][x]`` and exposing
            ``shape``.
        ori_width: Width in pixels of the image the box coordinates refer to.

    Returns:
        ``(words, (x, y_mid), (y_min, y_max), color)`` where ``(x, y_mid)`` is
        the pixel that was sampled (clamped to the image bounds) and ``color``
        is the pixel value found there. If ``line_list`` does not have exactly
        two elements, every field is ``None`` but the tuple keeps the same
        shape: ``(None, (None, None), (None, None), None)``.
    """
    if len(line_list) != 2:
        # Same nesting as the success case so callers can always unpack
        # ``words, (x, y_mid), (y_min, y_max), color``.
        return None, (None, None), (None, None), None

    resize_ratio = 541 / ori_width
    box = line_list[0]
    words = line_list[1][0]

    x = int(box[0][0] * resize_ratio)
    y_min = int(box[0][1] * resize_ratio)
    y_max = int(box[3][1] * resize_ratio)
    # Sample at the vertical centre of the first character so the bubble
    # background colour is read more reliably.
    y_mid = y_min + int((y_max - y_min) / 2)

    # Boxes touching (or slightly overshooting) an image edge would otherwise
    # raise IndexError, or silently wrap around for negative coordinates.
    height, width = image.shape[:2]
    x = min(max(x, 0), width - 1)
    y_mid = min(max(y_mid, 0), height - 1)

    color = image[y_mid][x]
    return words, (x, y_mid), (y_min, y_max), color

# A chat timestamp is: an optional date-like prefix (full date, month/day,
# "yesterday" or a weekday), an optional part-of-day word, then H:MM / HH:MM.
# This single pattern accepts exactly the union of the ten separate formats
# that used to be checked one by one, e.g. 24:24, 下午24:24, 2024年4月12日24:24,
# 昨天凌晨24:24, 4月12日上午24:24, 星期一晚上24:24.
_TIMESTAMP_PATTERN = re.compile(
    r'^(?:(?:\d{4}年)?\d{1,2}月\d{1,2}日|昨天|星期[一二三四五六日])?'
    r'(?:凌晨|上午|下午|晚上)?'
    r'\d{1,2}:\d{1,2}$'
)


def date_delete(words, line_text, line, dialog, res):
    """Report whether ``words`` is a chat timestamp that should be dropped.

    When the text is a timestamp and ``line`` equals the last entry of
    ``res``, the pending message ``line_text`` is appended to ``dialog`` so
    that it is not lost when the caller skips this final line.

    Args:
        words: Recognised text of the current OCR line.
        line_text: Message text accumulated so far by the caller.
        line: The current OCR entry.
        dialog: List of finished messages; may be appended to.
        res: All OCR entries of the current page; only ``res[-1]`` is read.

    Returns:
        ``True`` if ``words`` is a timestamp, ``False`` otherwise.

    NOTE(review): only the ASCII colon is matched; if the OCR engine can emit
    a full-width colon in timestamps those lines are not filtered -- confirm.
    """
    if _TIMESTAMP_PATTERN.match(words) is None:
        return False
    if line == res[-1]:
        dialog.append(line_text)
    return True

def OCR_text(img_path: str, output_path: str) -> Path:
    """OCR a chat screenshot and append the reconstructed dialogue to ``result.txt``.

    Each recognised line is filtered (low confidence, UI chrome, timestamps,
    voice-message durations), assigned to a speaker from the bubble background
    colour at the start of the line, and merged with the previous line when
    the vertical gap between them is smaller than the line's own height.

    Args:
        img_path: Path of the screenshot to analyse.
        output_path: Directory that receives ``result.txt``; created if missing.

    Returns:
        Path of the ``result.txt`` file the dialogue was appended to.

    Raises:
        ValueError: If the file at ``img_path`` cannot be decoded as an image.

    NOTE(review): the two speaker prefixes use different colons
    ('Speaker1：' full-width, 'Speaker2:' ASCII). They are kept as-is because
    downstream consumers may depend on the exact text -- confirm and unify.
    """
    # UI chrome and system notices that are not part of the conversation.
    noise_texts = {
        '...', '我通过了你的朋友验证请求，现在', '我们可以开始聊天了', '以上是打招呼的内容', '三',
        '+', '你撤回了一条消息', 'HD', ':', '按住说话', '以下是新消息', '川',
    }
    voice_pattern = re.compile(r'^\d{1,3}"$')  # voice-message duration, e.g. 12"
    min_score = 0.7

    with open(img_path, "rb") as f:
        image_bytes = f.read()
    full_image = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), cv2.IMREAD_COLOR)
    if full_image is None:
        raise ValueError(f'OCR_text: cannot decode image: {img_path}')

    # OCR runs on the whole file, so its boxes are in whole-image coordinates.
    # Colours must therefore be sampled from the (resized) whole image, not
    # from split strips: indexing a strip with whole-image coordinates runs
    # past its bottom edge on tall screenshots, and looping over strips
    # re-processed every line once per strip.
    ori_width = full_image.shape[1]
    image = image_resize(full_image)

    ocr = PaddleOCR(use_angle_cls=True, lang="ch", ocr_version="PP-OCRv4")
    result = ocr.ocr(img_path, cls=True)

    dialog = []
    for res in result or []:
        if not res:
            # Guard for pages without detections (some PaddleOCR versions
            # appear to return None here -- confirm for the pinned version).
            continue

        line_text = ''
        y_max_last = 0
        for line in res:
            if len(line) != 2:
                continue
            words, score = line[1][0], line[1][1]

            if score < min_score or words in noise_texts or voice_pattern.match(words):
                continue
            # date_delete appends the pending text to the list it is given when
            # `line` is the last entry; flushing happens once after this loop,
            # so hand it a scratch list to avoid a duplicate entry.
            if date_delete(words, line_text, line, [], res):
                continue

            _, _, (y_min, y_max), color = msg_type(line, image, ori_width)

            # Colour heuristic (needs further testing): a light background on
            # all three channels is treated as Speaker1, anything else as
            # Speaker2.
            is_light = color[0] > 160 and color[1] > 160 and color[2] > 160
            prefix = 'Speaker1：' if is_light else 'Speaker2:'

            if line_text and (y_min - y_max_last) < (y_max - y_min):
                # Gap smaller than one line height: continuation of the same bubble.
                line_text += words
            else:
                if line_text:
                    dialog.append(line_text)
                line_text = prefix + words
            y_max_last = y_max

        if line_text:
            dialog.append(line_text)

    # Save the recognised dialogue as text; append mode is kept so repeated
    # calls with the same output directory accumulate results.
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_txt_path = out_dir / "result.txt"
    with open(output_txt_path, 'a+', encoding='utf-8') as f:
        f.writelines(dia + '\n' for dia in dialog)

    print(f'output dir is: {output_txt_path}')
    return output_txt_path





if __name__ == '__main__':
    # Manual smoke run: OCR the sample screenshot into a fresh temporary
    # directory (its location is printed by OCR_text).
    output_path = tempfile.mkdtemp()
    img_path = './image/3.png'
    OCR_text(output_path=output_path, img_path=img_path)