Spaces:
Sleeping
Sleeping
# Use the ModelScope API to detect problem images and delete them so the content passes review.
import base64 | |
import json | |
import os | |
from io import BytesIO | |
import pandas as pd | |
from PIL import Image | |
import requests | |
def ocr(image, timeout=60):
    """Send an image to the ModelScope OFA OCR demo endpoint and return its reply.

    The image is opened with PIL, re-encoded in its own format, wrapped in a
    base64 data URI and POSTed as JSON to the Gradio ``predict`` endpoint.

    Args:
        image: A path or binary file object accepted by ``PIL.Image.open``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response. The HTTP status code is
        deliberately not checked: the script's main loop inspects the body
        for an ``'error'`` key, so error replies must reach the caller.

    Raises:
        OSError: If the image cannot be opened or re-encoded.
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    # Close the underlying file as soon as the bytes are captured, so the
    # caller can delete the file afterwards without a lingering handle.
    with Image.open(image) as img:
        fmt = img.format
        buffer = BytesIO()
        img.save(buffer, format=fmt)

    # Label the payload with the real MIME type instead of always claiming
    # JPEG; fall back to a best-effort guess for formats PIL has no entry for.
    mime_type = Image.MIME.get(fmt, f"image/{fmt.lower()}")
    base64_str = base64.b64encode(buffer.getvalue()).decode("ascii")

    url = "https://www.modelscope.cn/api/v1/studio/damo/ofa_ocr_pipeline/gradio/api/predict/"
    payload = json.dumps({
        "data": [f"data:{mime_type};base64,{base64_str}"],
        "dataType": ["image"],
    })
    headers = {
        'Content-Type': 'application/json',
    }
    # A timeout keeps one stalled request from hanging the whole batch.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    return json.loads(response.text)
def find_image_files(root_dir, extensions=(".jpg", ".png")):
    """Return the images found under ``root_dir``, including subdirectories.

    Args:
        root_dir: Directory to walk recursively.
        extensions: File suffixes to accept; matched case-insensitively so
            that names such as ``photo.JPG`` are not silently skipped.

    Returns:
        A list of paths relative to the current working directory. The list
        is empty when ``root_dir`` does not exist or holds no matching files.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    found = []
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if name.lower().endswith(suffixes):
                found.append(os.path.relpath(os.path.join(folder, name)))
    return found


if __name__ == '__main__':
    # Images live in the 'manga' subdirectory of the current working directory.
    img_path = 'manga'
    subdir_path = os.path.join(os.getcwd(), img_path)

    # Collect every image, including those in nested subdirectories.
    image_files = find_image_files(subdir_path)

    for image_path in image_files:
        try:
            # Per the original author's note, a successful reply carries a
            # table with two columns, boxid and text (not verified here).
            result = ocr(image_path)
        except (OSError, ValueError) as exc:
            # Unreadable image, network failure (requests errors derive from
            # OSError) or non-JSON reply: skip the file and never delete it
            # on the strength of a failed check.
            print("检测失败,跳过且不删除:", image_path, exc)
            continue

        if 'error' in result:
            # The service answered with an error for this image; remove it.
            print("发现问题图片,需要删除以过审:",image_path)
            os.remove(image_path)
        else:
            print(image_path, "图片没有问题")