George committed on
Commit
2b8b1bb
1 Parent(s): 712f935
Files changed (5) hide show
  1. gender_age.py +5 -1
  2. item2pic.py +42 -17
  3. main.py +9 -0
  4. product2item.py +18 -1
  5. utils.py +1 -1
gender_age.py CHANGED
@@ -73,7 +73,7 @@ class ValidImgDetector:
73
  return (not has_child) and (has_female) and (not has_male)
74
 
75
 
76
- if __name__ == "__main__":
77
  detector = ValidImgDetector()
78
  create_dir('./output/valid')
79
  create_dir('./output/invalid')
@@ -87,3 +87,7 @@ if __name__ == "__main__":
87
  dst_path = "./output/valid"
88
 
89
  shutil.move(src_path, dst_path)
 
 
 
 
 
73
  return (not has_child) and (has_female) and (not has_male)
74
 
75
 
76
+ def filter_img():
77
  detector = ValidImgDetector()
78
  create_dir('./output/valid')
79
  create_dir('./output/invalid')
 
87
  dst_path = "./output/valid"
88
 
89
  shutil.move(src_path, dst_path)
90
+
91
+
92
+ if __name__ == "__main__":
93
+ filter_img()
item2pic.py CHANGED
@@ -20,25 +20,38 @@ def download_image(img_dir='./images'):
20
  create_dir(img_dir)
21
  image_urls = load_urls()
22
  print('下载图片中...')
23
- for img in tqdm(image_urls):
24
- rand_sleep(1.5, 2)
25
- response = requests.get(img['url'], stream=True)
 
 
 
26
 
27
- if response.status_code == 200:
28
- # 从URL中获取图像文件名
29
- image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
30
 
31
- # 使用二进制写模式打开文件,准备写入图像数据
32
- with open(image_filename, 'wb') as file:
33
- for chunk in response.iter_content(chunk_size=8192):
34
- file.write(chunk)
35
 
36
- if DEBUG_MODE:
37
- print(f"{image_filename} 下载完成!")
38
 
39
- else:
40
- add_to_failist(
41
- f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  def fix_url(link):
@@ -60,7 +73,7 @@ def fix_url(link):
60
 
61
 
62
  def get_pics(id):
63
- rand_sleep(1.0, 1.5)
64
  # selenium
65
  option = webdriver.ChromeOptions()
66
  option.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -136,7 +149,7 @@ def load_urls(images_jsonl_path="./output/images.jsonl"):
136
  return urls
137
 
138
 
139
- if __name__ == "__main__":
140
  create_dir('./images')
141
  ids = load_items()
142
  get_img_urls(ids)
@@ -146,3 +159,15 @@ if __name__ == "__main__":
146
  failist_path='./output/duplicate_img.txt'
147
  )
148
  download_image()
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  create_dir(img_dir)
21
  image_urls = load_urls()
22
  print('下载图片中...')
23
+ trytime = 0
24
+ while len(image_urls) > 0:
25
+ failist = []
26
+ for img in tqdm(image_urls):
27
+ sleeps(0.5 + 0.1 * trytime, 1.0 + 0.1 * trytime)
28
+ response = requests.get(img['url'], stream=True)
29
 
30
+ if response.status_code == 200:
31
+ # 从URL中获取图像文件名
32
+ image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
33
 
34
+ # 使用二进制写模式打开文件,准备写入图像数据
35
+ with open(image_filename, 'wb') as file:
36
+ for chunk in response.iter_content(chunk_size=8192):
37
+ file.write(chunk)
38
 
39
+ if DEBUG_MODE:
40
+ print(f"{image_filename} 下载完成!")
41
 
42
+ elif response.status_code == 420:
43
+ failist.append(img)
44
+
45
+ else:
46
+ add_to_failist(
47
+ f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
48
+
49
+ trytime += 1
50
+ print(
51
+ f'[{len(failist)} / {len(image_urls)}] images failed to download in attempt [{trytime}].')
52
+ image_urls = failist
53
+
54
+ print('下载完成!')
55
 
56
 
57
  def fix_url(link):
 
73
 
74
 
75
  def get_pics(id):
76
+ sleeps(1.0, 1.5)
77
  # selenium
78
  option = webdriver.ChromeOptions()
79
  option.add_experimental_option('excludeSwitches', ['enable-automation'])
 
149
  return urls
150
 
151
 
152
+ def item_to_pic():
153
  create_dir('./images')
154
  ids = load_items()
155
  get_img_urls(ids)
 
159
  failist_path='./output/duplicate_img.txt'
160
  )
161
  download_image()
162
+
163
+
164
+ if __name__ == "__main__":
165
+ # create_dir('./images')
166
+ # ids = load_items()
167
+ # get_img_urls(ids)
168
+ # rm_duplicates_by_key(
169
+ # jsonl_path='./output/images.jsonl',
170
+ # key_to_check='url',
171
+ # failist_path='./output/duplicate_img.txt'
172
+ # )
173
+ download_image()
main.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from product2item import product_to_items
2
+ from item2pic import item_to_pic
3
+ from gender_age import filter_img
4
+
5
+
6
+ if __name__ == "__main__":
7
+ product_to_items()
8
+ item_to_pic()
9
+ filter_img()
product2item.py CHANGED
@@ -30,7 +30,7 @@ def get_second_links(keyword):
30
  # 遍历product页面下的所有item,直至已加载全部商品
31
  for i in tqdm(range(1, MAX_PAGE + 1)):
32
  browser.execute_script(f'window.scrollTo(0, {i * 500})')
33
- rand_sleep(0.5, 1.0)
34
  page_str = str(browser.page_source)
35
  if "<title>taobao | 淘寶</title>" in page_str:
36
  print('遭遇验证码...')
@@ -61,6 +61,23 @@ def read_lines_to_array(file_path):
61
  return lines_array
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  if __name__ == "__main__":
65
  keywords = read_lines_to_array('./input/keywords.txt')
66
  create_dir('./output')
 
30
  # 遍历product页面下的所有item,直至已加载全部商品
31
  for i in tqdm(range(1, MAX_PAGE + 1)):
32
  browser.execute_script(f'window.scrollTo(0, {i * 500})')
33
+ sleeps(0.5, 1.0)
34
  page_str = str(browser.page_source)
35
  if "<title>taobao | 淘寶</title>" in page_str:
36
  print('遭遇验证码...')
 
61
  return lines_array
62
 
63
 
64
+ def product_to_items():
65
+ keywords = read_lines_to_array('./input/keywords.txt')
66
+ create_dir('./output')
67
+
68
+ for key in keywords:
69
+ urls = list(get_second_links(key))
70
+ print(f'Saving url into jsonl for keyword [{key}]')
71
+ for url in tqdm(urls):
72
+ tmp_dict = {
73
+ 'keyword': key,
74
+ 'id': url.split('.htm?spm=')[0].split('//www.taobao.com/list/item/')[1]
75
+ }
76
+ append_dict_to_jsonl(tmp_dict)
77
+
78
+ rm_duplicates_by_key()
79
+
80
+
81
  if __name__ == "__main__":
82
  keywords = read_lines_to_array('./input/keywords.txt')
83
  create_dir('./output')
utils.py CHANGED
@@ -13,7 +13,7 @@ def skip_captcha():
13
  print('爬取链接中...')
14
 
15
 
16
- def rand_sleep(a, b):
17
  if a > 0 and b > a:
18
  sleep((b - a) * random.random() + a)
19
 
 
13
  print('爬取链接中...')
14
 
15
 
16
+ def sleeps(a, b):
17
  if a > 0 and b > a:
18
  sleep((b - a) * random.random() + a)
19