rogerxavier committed on
Commit
8f0c284
1 Parent(s): 0930ceb

Create 0filterImage.py

Browse files
Files changed (1) hide show
  1. 0filterImage.py +53 -0
0filterImage.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Delete problem images flagged via the ModelScope API so that the remaining content passes review
2
+
3
+ import base64
4
+ import json
5
+ import os
6
+ from io import BytesIO
7
+ import pandas as pd
8
+ from PIL import Image
9
+
10
+ import requests
11
+
12
+
13
def ocr(image, timeout=60):
    """Post an image to the ModelScope OFA OCR demo endpoint and return its reply.

    The image is opened with Pillow, re-encoded in its own format into an
    in-memory buffer, base64-encoded and sent as a data URI inside a JSON
    payload.

    Args:
        image: Anything ``PIL.Image.open`` accepts; the script in this file
            passes a file path.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Use a context manager so the file opened by Pillow is closed as soon as
    # the pixels have been re-encoded, instead of leaking the handle.
    with Image.open(image) as img:
        img_format = img.format
        img_buffer = BytesIO()
        img.save(img_buffer, format=img_format)

    base64_str = base64.b64encode(img_buffer.getvalue()).decode()

    # Advertise the MIME type that matches the encoded bytes (e.g. image/png
    # for PNG input) rather than always claiming JPEG; fall back to the
    # previous hard-coded value for formats Pillow has no MIME entry for.
    mime_type = Image.MIME.get(img_format, "image/jpeg")

    url = "https://www.modelscope.cn/api/v1/studio/damo/ofa_ocr_pipeline/gradio/api/predict/"
    payload = json.dumps({
        "data": [f"data:{mime_type};base64,{base64_str}"],
        "dataType": ["image"]
    })
    headers = {
        'Content-Type': 'application/json'
    }

    # The HTTP status is intentionally not checked: the caller inspects the
    # decoded body for an 'error' key.
    # NOTE(review): presumably the service reports rejected images through that
    # key, possibly with a non-2xx status - confirm before adding
    # raise_for_status().
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    return json.loads(response.text)
33
+
34
if __name__ == '__main__':
    # Images live in the 'manga' folder under the current working directory.
    img_path = 'manga'
    subdir_path = os.path.join(os.getcwd(), img_path)

    # Gather every .jpg / .png file below that folder (nested folders
    # included), stored as paths relative to the current working directory.
    image_files = []
    for folder, _subdirs, names in os.walk(subdir_path):
        image_files.extend(
            os.path.relpath(os.path.join(folder, name))
            for name in names
            if name.endswith((".jpg", ".png"))
        )

    for image_path in image_files:
        # ocr() hands back the service's decoded JSON reply.
        # NOTE(review): the original comment described the result as a
        # two-column table (box id, text); this loop only looks for an
        # 'error' key - confirm the actual response schema.
        result = ocr(image_path)
        if 'error' not in result:
            print(image_path, "图片没有问题")
            continue
        # An 'error' entry marks the image as problematic: report and delete it.
        print("发现问题图片,需要删除以过审:", image_path)
        os.remove(image_path)
52
+
53
+