add main

Browse files

Files changed (5) hide show

gender_age.py +5 -1
item2pic.py +42 -17
main.py +9 -0
product2item.py +18 -1
utils.py +1 -1

gender_age.py CHANGED Viewed

@@ -73,7 +73,7 @@ class ValidImgDetector:
         return (not has_child) and (has_female) and (not has_male)
-if __name__ == "__main__":
     detector = ValidImgDetector()
     create_dir('./output/valid')
     create_dir('./output/invalid')
@@ -87,3 +87,7 @@ if __name__ == "__main__":
                     dst_path = "./output/valid"
                 shutil.move(src_path, dst_path)

         return (not has_child) and (has_female) and (not has_male)
+def filter_img():
     detector = ValidImgDetector()
     create_dir('./output/valid')
     create_dir('./output/invalid')
                     dst_path = "./output/valid"
                 shutil.move(src_path, dst_path)
+if __name__ == "__main__":
+    filter_img()

item2pic.py CHANGED Viewed

@@ -20,25 +20,38 @@ def download_image(img_dir='./images'):
     create_dir(img_dir)
     image_urls = load_urls()
     print('下载图片中...')
-    for img in tqdm(image_urls):
-        rand_sleep(1.5, 2)
-        response = requests.get(img['url'], stream=True)
-        if response.status_code == 200:
-            # 从URL中获取图像文件名
-            image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
-            # 使用二进制写模式打开文件，准备写入图像数据
-            with open(image_filename, 'wb') as file:
-                for chunk in response.iter_content(chunk_size=8192):
-                    file.write(chunk)
-            if DEBUG_MODE:
-                print(f"{image_filename} 下载完成！")
-        else:
-            add_to_failist(
-                f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
 def fix_url(link):
@@ -60,7 +73,7 @@ def fix_url(link):
 def get_pics(id):
-    rand_sleep(1.0, 1.5)
     # selenium
     option = webdriver.ChromeOptions()
     option.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -136,7 +149,7 @@ def load_urls(images_jsonl_path="./output/images.jsonl"):
     return urls
-if __name__ == "__main__":
     create_dir('./images')
     ids = load_items()
     get_img_urls(ids)
@@ -146,3 +159,15 @@ if __name__ == "__main__":
         failist_path='./output/duplicate_img.txt'
     )
     download_image()

     create_dir(img_dir)
     image_urls = load_urls()
     print('下载图片中...')
+    trytime = 0
+    while len(image_urls) > 0:
+        failist = []
+        for img in tqdm(image_urls):
+            sleeps(0.5 + 0.1 * trytime, 1.0 + 0.1 * trytime)
+            response = requests.get(img['url'], stream=True)
+            if response.status_code == 200:
+                # 从URL中获取图像文件名
+                image_filename = f'{img_dir}/{img["pid"]}_{img["url"].split("/")[-1]}'
+                # 使用二进制写模式打开文件，准备写入图像数据
+                with open(image_filename, 'wb') as file:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        file.write(chunk)
+                if DEBUG_MODE:
+                    print(f"{image_filename} 下载完成！")
+            elif response.status_code == 420:
+                failist.append(img)
+            else:
+                add_to_failist(
+                    f"下载 {img['url']} 失败: HTTP 错误码 {response.status_code}")
+        trytime += 1
+        print(
+            f'[{len(failist)} / {len(image_urls)}] images failed to download in attempt [{trytime}].')
+        image_urls = failist
+    print('下载完成!')
 def fix_url(link):
 def get_pics(id):
+    sleeps(1.0, 1.5)
     # selenium
     option = webdriver.ChromeOptions()
     option.add_experimental_option('excludeSwitches', ['enable-automation'])
     return urls
+def item_to_pic():
     create_dir('./images')
     ids = load_items()
     get_img_urls(ids)
         failist_path='./output/duplicate_img.txt'
     )
     download_image()
+if __name__ == "__main__":
+    # Running this module directly only downloads the images listed in the
+    # URL file that load_urls() reads. Call item_to_pic() instead to also
+    # collect the image URLs and remove duplicates before downloading.
+    download_image()

main.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from product2item import product_to_items
+from item2pic import item_to_pic
+from gender_age import filter_img
+if __name__ == "__main__":
+    # Run the three stages in sequence: collect item ids per keyword,
+    # fetch the item images, then sort the images into valid / invalid.
+    for stage in (product_to_items, item_to_pic, filter_img):
+        stage()

product2item.py CHANGED Viewed

@@ -30,7 +30,7 @@ def get_second_links(keyword):
     # 遍历product页面下的所有item，直至已加载全部商品
     for i in tqdm(range(1, MAX_PAGE + 1)):
         browser.execute_script(f'window.scrollTo(0, {i * 500})')
-        rand_sleep(0.5, 1.0)
         page_str = str(browser.page_source)
         if "<title>taobao | 淘寶</title>" in page_str:
             print('遭遇验证码...')
@@ -61,6 +61,23 @@ def read_lines_to_array(file_path):
     return lines_array
 if __name__ == "__main__":
     keywords = read_lines_to_array('./input/keywords.txt')
     create_dir('./output')

     # 遍历product页面下的所有item，直至已加载全部商品
     for i in tqdm(range(1, MAX_PAGE + 1)):
         browser.execute_script(f'window.scrollTo(0, {i * 500})')
+        sleeps(0.5, 1.0)
         page_str = str(browser.page_source)
         if "<title>taobao | 淘寶</title>" in page_str:
             print('遭遇验证码...')
     return lines_array
+def _item_id_from_url(url):
+    """Return the item id embedded in an item link, or None.
+
+    The id is the text that follows '//www.taobao.com/list/item/' once the
+    '.htm?spm=' suffix has been cut off. None is returned when the link does
+    not contain that prefix, so the caller can skip it instead of hitting an
+    IndexError.
+    """
+    head = url.split('.htm?spm=')[0]
+    pieces = head.split('//www.taobao.com/list/item/')
+    return pieces[1] if len(pieces) > 1 else None
+
+
+def product_to_items():
+    """Collect item ids for every keyword and store them as JSONL records.
+
+    Reads the keywords from ./input/keywords.txt, fetches the item links of
+    each keyword with get_second_links(), appends one
+    {'keyword': ..., 'id': ...} record per link via append_dict_to_jsonl(),
+    and finally calls rm_duplicates_by_key() with its default arguments.
+    Links whose id cannot be extracted are reported and skipped.
+    """
+    keywords = read_lines_to_array('./input/keywords.txt')
+    create_dir('./output')
+    for key in keywords:
+        urls = list(get_second_links(key))
+        print(f'Saving url into jsonl for keyword [{key}]')
+        for url in tqdm(urls):
+            item_id = _item_id_from_url(url)
+            if item_id is None:
+                # One malformed link must not abort the remaining keywords.
+                print(f'Skipping unrecognized item url: {url}')
+                continue
+            append_dict_to_jsonl({'keyword': key, 'id': item_id})
+    rm_duplicates_by_key()
 if __name__ == "__main__":
     keywords = read_lines_to_array('./input/keywords.txt')
     create_dir('./output')

utils.py CHANGED Viewed

@@ -13,7 +13,7 @@ def skip_captcha():
     print('爬取链接中...')
-def rand_sleep(a, b):
     if a > 0 and b > a:
         sleep((b - a) * random.random() + a)

     print('爬取链接中...')
+def sleeps(a, b):
+    """Sleep for a random duration drawn uniformly from [a, b] seconds.
+
+    A zero lower bound and a fixed delay (a == b) are both valid. The call
+    does nothing when the range is invalid, i.e. when a is negative or b is
+    smaller than a.
+    """
+    if 0 <= a <= b:
+        sleep(random.uniform(a, b))