File size: 1,624 Bytes
8f0c284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Delete problematic images, as detected through the ModelScope OCR API, so that the remaining images pass content review.

import base64
import json
import os
from io import BytesIO
import pandas as pd
from PIL import Image

import requests


def ocr(image, timeout=60):
    """Send an image to the ModelScope OFA OCR demo endpoint and return its reply.

    The image is opened with Pillow, re-encoded in its own format, wrapped in
    a base64 data URI and POSTed as JSON to the endpoint's ``predict`` route.

    Args:
        image: Anything ``PIL.Image.open`` accepts (a path or a binary file
            object).
        timeout: Seconds to wait for the HTTP request; passed straight to
            ``requests``. ``None`` waits indefinitely.

    Returns:
        The response body decoded from JSON. The HTTP status is not checked,
        so whatever JSON the service sends back is returned to the caller
        (the script at the bottom of this file looks for an ``'error'`` key).

    Raises:
        OSError: If the image cannot be opened or re-encoded.
        requests.RequestException: On connection failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been re-encoded; this function is called once per image.
    with Image.open(image) as img:
        img_format = img.format
        img_buffer = BytesIO()
        img.save(img_buffer, format=img_format)

    base64_str = base64.b64encode(img_buffer.getvalue()).decode()

    # Label the payload with the format it was actually saved in; fall back
    # to the previously hard-coded JPEG label for formats Pillow has no MIME
    # type registered for.
    mime_type = Image.MIME.get(img_format, "image/jpeg")

    url = "https://www.modelscope.cn/api/v1/studio/damo/ofa_ocr_pipeline/gradio/api/predict/"
    payload = json.dumps({
        "data": [f"data:{mime_type};base64,{base64_str}"],
        "dataType": ["image"]
    })
    headers = {
        'Content-Type': 'application/json'
    }

    # Without a timeout a stalled connection would block the batch forever.
    response = requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout
    )
    return json.loads(response.text)

if __name__ == '__main__':
    # Image directory, resolved against the current working directory.
    img_path = 'manga'
    subdir_path = os.path.join(os.getcwd(), img_path)

    # Collect every .jpg/.png file under the directory, subdirectories
    # included. The list is complete before any file is removed.
    image_files = []
    for root, dirs, files in os.walk(subdir_path):
        image_files.extend(
            os.path.relpath(os.path.join(root, file))
            for file in files
            if file.endswith((".jpg", ".png"))
        )

    for image_path in image_files:
        try:
            result = ocr(image_path)  # parsed JSON reply from the OCR service
        except (requests.RequestException, OSError, ValueError) as exc:
            # A failed request, an unreadable file or a non-JSON reply says
            # nothing about the image itself: report it, keep the file and
            # carry on with the rest of the batch instead of aborting.
            print("图片检测失败,已跳过:", image_path, exc)
            continue

        # NOTE(review): any reply carrying an "error" key leads to deletion.
        # Presumably the service reports per-image failures this way; confirm
        # that a service-wide fault cannot produce the same key.
        # The dict check keeps a string or list reply from matching by
        # substring/element membership.
        if isinstance(result, dict) and 'error' in result:
            print("发现问题图片,需要删除以过审:",image_path)
            os.remove(image_path)
        else:
            print(image_path, "图片没有问题")