George
committed on
Commit
•
2b8b1bb
1
Parent(s):
712f935
add main
Browse files- gender_age.py +5 -1
- item2pic.py +42 -17
- main.py +9 -0
- product2item.py +18 -1
- utils.py +1 -1
gender_age.py
CHANGED
@@ -73,7 +73,7 @@ class ValidImgDetector:
|
|
73 |
return (not has_child) and (has_female) and (not has_male)
|
74 |
|
75 |
|
76 |
-
|
77 |
detector = ValidImgDetector()
|
78 |
create_dir('./output/valid')
|
79 |
create_dir('./output/invalid')
|
@@ -87,3 +87,7 @@ if __name__ == "__main__":
|
|
87 |
dst_path = "./output/valid"
|
88 |
|
89 |
shutil.move(src_path, dst_path)
|
|
|
|
|
|
|
|
|
|
73 |
return (not has_child) and (has_female) and (not has_male)
|
74 |
|
75 |
|
76 |
+
def filter_img():
|
77 |
detector = ValidImgDetector()
|
78 |
create_dir('./output/valid')
|
79 |
create_dir('./output/invalid')
|
|
|
87 |
dst_path = "./output/valid"
|
88 |
|
89 |
shutil.move(src_path, dst_path)
|
90 |
+
|
91 |
+
|
92 |
+
if __name__ == "__main__":
    # Standalone entry point: run the gender/age image filter directly.
    filter_img()
|
item2pic.py
CHANGED
@@ -20,25 +20,38 @@ def download_image(img_dir='./images'):
|
|
20 |
create_dir(img_dir)
|
21 |
image_urls = load_urls()
|
22 |
print('下载图片中...')
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
36 |
-
|
37 |
-
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
|
44 |
def fix_url(link):
|
@@ -60,7 +73,7 @@ def fix_url(link):
|
|
60 |
|
61 |
|
62 |
def get_pics(id):
|
63 |
-
|
64 |
# selenium
|
65 |
option = webdriver.ChromeOptions()
|
66 |
option.add_experimental_option('excludeSwitches', ['enable-automation'])
|
@@ -136,7 +149,7 @@ def load_urls(images_jsonl_path="./output/images.jsonl"):
|
|
136 |
return urls
|
137 |
|
138 |
|
139 |
-
|
140 |
create_dir('./images')
|
141 |
ids = load_items()
|
142 |
get_img_urls(ids)
|
@@ -146,3 +159,15 @@ if __name__ == "__main__":
|
|
146 |
failist_path='./output/duplicate_img.txt'
|
147 |
)
|
148 |
download_image()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
create_dir(img_dir)
|
21 |
image_urls = load_urls()
|
22 |
print('下载图片中...')
|
23 |
+
trytime = 0
|
24 |
+
while len(image_urls) > 0:
|
25 |
+
failist = []
|
26 |
+
for img in tqdm(image_urls):
|
27 |
+
sleeps(0.5 + 0.1 * trytime, 1.0 + 0.1 * trytime)
|
28 |
+
response = requests.get(img['url'], stream=True)
|
29 |
|
30 |
+
if response.status_code == 200:
|
31 |
+
# 从URL中获取图像文件名
|
32 |
+
image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
|
33 |
|
34 |
+
# 使用二进制写模式打开文件,准备写入图像数据
|
35 |
+
with open(image_filename, 'wb') as file:
|
36 |
+
for chunk in response.iter_content(chunk_size=8192):
|
37 |
+
file.write(chunk)
|
38 |
|
39 |
+
if DEBUG_MODE:
|
40 |
+
print(f"{image_filename} 下载完成!")
|
41 |
|
42 |
+
elif response.status_code == 420:
|
43 |
+
failist.append(img)
|
44 |
+
|
45 |
+
else:
|
46 |
+
add_to_failist(
|
47 |
+
f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
|
48 |
+
|
49 |
+
trytime += 1
|
50 |
+
print(
|
51 |
+
f'[{len(failist)} / {len(image_urls)}] images failed to download in attempt [{trytime}].')
|
52 |
+
image_urls = failist
|
53 |
+
|
54 |
+
print('下载完成!')
|
55 |
|
56 |
|
57 |
def fix_url(link):
|
|
|
73 |
|
74 |
|
75 |
def get_pics(id):
|
76 |
+
sleeps(1.0, 1.5)
|
77 |
# selenium
|
78 |
option = webdriver.ChromeOptions()
|
79 |
option.add_experimental_option('excludeSwitches', ['enable-automation'])
|
|
|
149 |
return urls
|
150 |
|
151 |
|
152 |
+
def item_to_pic():
|
153 |
create_dir('./images')
|
154 |
ids = load_items()
|
155 |
get_img_urls(ids)
|
|
|
159 |
failist_path='./output/duplicate_img.txt'
|
160 |
)
|
161 |
download_image()
|
162 |
+
|
163 |
+
|
164 |
+
if __name__ == "__main__":
    # Standalone entry point: re-run only the download step.
    # The earlier pipeline stages (create_dir, load_items, get_img_urls,
    # rm_duplicates_by_key) are what item_to_pic() performs before downloading;
    # the commented-out copies of those calls were dead code and are removed.
    download_image()
|
main.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from product2item import product_to_items
from item2pic import item_to_pic
from gender_age import filter_img


def main():
    """Run the full scraping pipeline end to end.

    Steps, in order:
    1. product_to_items() -- collect item ids for each search keyword.
    2. item_to_pic()      -- fetch picture URLs for those items and download them.
    3. filter_img()       -- sort downloaded images into valid/invalid folders.
    """
    product_to_items()
    item_to_pic()
    filter_img()


if __name__ == "__main__":
    main()
|
product2item.py
CHANGED
@@ -30,7 +30,7 @@ def get_second_links(keyword):
|
|
30 |
# 遍历product页面下的所有item,直至已加载全部商品
|
31 |
for i in tqdm(range(1, MAX_PAGE + 1)):
|
32 |
browser.execute_script(f'window.scrollTo(0, {i * 500})')
|
33 |
-
|
34 |
page_str = str(browser.page_source)
|
35 |
if "<title>taobao | 淘寶</title>" in page_str:
|
36 |
print('遭遇验证码...')
|
@@ -61,6 +61,23 @@ def read_lines_to_array(file_path):
|
|
61 |
return lines_array
|
62 |
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if __name__ == "__main__":
|
65 |
keywords = read_lines_to_array('./input/keywords.txt')
|
66 |
create_dir('./output')
|
|
|
30 |
# 遍历product页面下的所有item,直至已加载全部商品
|
31 |
for i in tqdm(range(1, MAX_PAGE + 1)):
|
32 |
browser.execute_script(f'window.scrollTo(0, {i * 500})')
|
33 |
+
sleeps(0.5, 1.0)
|
34 |
page_str = str(browser.page_source)
|
35 |
if "<title>taobao | 淘寶</title>" in page_str:
|
36 |
print('遭遇验证码...')
|
|
|
61 |
return lines_array
|
62 |
|
63 |
|
64 |
+
def product_to_items():
    """Collect item ids for every search keyword and append them as JSONL records.

    Reads the keyword list from ./input/keywords.txt, scrapes each keyword's
    item links, stores one {keyword, id} record per link, then drops
    duplicate records from the output file.
    """
    create_dir('./output')
    for key in read_lines_to_array('./input/keywords.txt'):
        links = list(get_second_links(key))
        print(f'Saving url into jsonl for keyword [{key}]')
        for link in tqdm(links):
            # The item id sits between the '//www.taobao.com/list/item/'
            # path segment and the '.htm?spm=' suffix of the link.
            base = link.split('.htm?spm=')[0]
            item_id = base.split('//www.taobao.com/list/item/')[1]
            append_dict_to_jsonl({'keyword': key, 'id': item_id})

    rm_duplicates_by_key()
|
79 |
+
|
80 |
+
|
81 |
if __name__ == "__main__":
|
82 |
keywords = read_lines_to_array('./input/keywords.txt')
|
83 |
create_dir('./output')
|
utils.py
CHANGED
@@ -13,7 +13,7 @@ def skip_captcha():
|
|
13 |
print('爬取链接中...')
|
14 |
|
15 |
|
16 |
-
def
|
17 |
if a > 0 and b > a:
|
18 |
sleep((b - a) * random.random() + a)
|
19 |
|
|
|
13 |
print('爬取链接中...')
|
14 |
|
15 |
|
16 |
+
def sleeps(a, b):
    """Pause for a random duration drawn uniformly from [a, b).

    Degenerate ranges are ignored: nothing happens unless a is positive
    and b is strictly greater than a.
    """
    if a <= 0 or b <= a:
        return
    span = b - a
    sleep(a + span * random.random())
19 |
|