"""Helpers for collecting free HTTP proxies and checking which ones respond."""

import re
import time

import parsel
import requests
from lxml import etree
from tqdm import tqdm

# Browser-like headers sent with every request made by this module.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

# Seconds to wait for a listing page before giving up on that attempt.
_FETCH_TIMEOUT = 10

# Location of the proxy table rows on a kuaidaili free-proxy listing page.
_KUAIDAILI_ROWS_XPATH = "/html/body/div[1]/div[4]/div[2]/div[2]/div[2]/table/tbody/tr"

# Index of the <script> element on the openproxy.space page that is parsed.
_OPENPROXY_SCRIPT_INDEX = 3

# Patterns used to pull the proxy addresses out of the openproxy.space script.
_OPENPROXY_DATA_RE = re.compile(r'LIST",data:(.+),added:')
_OPENPROXY_ITEMS_RE = re.compile(r'\[(.+)\]')


def _normalize_proxy(proxy):
    """Return a copy of *proxy* whose scheme keys are lower-cased.

    requests selects a proxy by the (lower-case) scheme of the target URL,
    so a key such as ``"HTTP"`` would otherwise be silently ignored.
    """
    return {str(scheme).lower(): address for scheme, address in proxy.items()}


def _check_ips(proxies_list, test_url, timeout=1):
    """Return the proxies from *proxies_list* that can fetch *test_url*.

    Args:
        proxies_list: Iterable of ``{scheme: "ip:port"}`` dicts.
        test_url: URL requested through each proxy.
        timeout: Seconds to wait for each request before treating the
            proxy as unusable.

    Returns:
        A list holding the original dict objects for which the request
        completed with HTTP status 200.
    """
    usable = []
    for proxy in tqdm(proxies_list, desc="Checking ips"):
        try:
            response = requests.get(
                test_url,
                headers=_HEADERS,
                proxies=_normalize_proxy(proxy),
                timeout=timeout,  # slow proxies raise and are skipped
            )
        except (requests.RequestException, ValueError):
            # Unreachable, slow or malformed proxy: treat it as unusable.
            continue
        if response.status_code == 200:
            usable.append(proxy)
    return usable


def check_china_ips(proxies_list):
    """Return the proxies that can fetch http://www.baidu.com within 1 second.

    Args:
        proxies_list: Iterable of ``{scheme: "ip:port"}`` dicts.

    Returns:
        List of the proxy dicts whose request returned HTTP status 200.
    """
    return _check_ips(proxies_list, 'http://www.baidu.com')


def check_us_ips(proxies_list):
    """Return the proxies that can fetch http://www.google.com within 1 second.

    Args:
        proxies_list: Iterable of ``{scheme: "ip:port"}`` dicts.

    Returns:
        List of the proxy dicts whose request returned HTTP status 200.
    """
    return _check_ips(proxies_list, 'http://www.google.com')


def _parse_kuaidaili_rows(html):
    """Extract ``{scheme: "ip:port"}`` dicts from a kuaidaili listing page."""
    try:
        tree = etree.HTML(html)
    except (etree.LxmlError, ValueError):
        return []
    if tree is None:
        return []

    proxies = []
    for row in tree.xpath(_KUAIDAILI_ROWS_XPATH):
        try:
            http_type = row.xpath('./td[4]/text()')[0]
            ip_num = row.xpath('./td[1]/text()')[0]
            port_num = row.xpath('./td[2]/text()')[0]
        except IndexError:
            # A row missing one of the expected cells is skipped instead of
            # discarding the rest of the page.
            continue
        # Lower-case the scheme so requests actually routes through the proxy.
        proxies.append({http_type.strip().lower(): ip_num.strip() + ':' + port_num.strip()})
    return proxies


def _fetch_kuaidaili_page(url, max_retries, retry_delay):
    """Fetch one listing page, retrying a bounded number of times.

    Returns the proxies parsed from the page, or an empty list when every
    attempt failed or produced no rows.
    """
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            response = requests.get(url, headers=_HEADERS, timeout=_FETCH_TIMEOUT)
        except requests.RequestException:
            proxies = []
        else:
            proxies = _parse_kuaidaili_rows(response.text)
        if proxies:
            return proxies
        if attempt < attempts - 1:
            time.sleep(retry_delay)
    return []


def get_china_free_proxy(pages=10, *, max_retries=5, retry_delay=0.01):
    """Scrape free proxies from kuaidaili.com and keep the ones that work.

    Listing pages 1..``pages`` are downloaded, every table row is turned
    into a ``{scheme: "ip:port"}`` dict (scheme lower-cased), and the
    result is filtered with :func:`check_china_ips`. Two summary lines are
    printed.

    Args:
        pages: Number of listing pages to read.
        max_retries: Maximum attempts per page; a page that never yields
            rows is skipped rather than retried forever.
        retry_delay: Seconds to sleep between attempts for the same page.

    Returns:
        List of proxy dicts that passed the check.
    """
    proxies_list = []
    for page in tqdm(range(1, pages + 1), desc="Gathering free ips by pages..."):
        base_url = f'https://www.kuaidaili.com/free/inha/{page}'
        proxies_list.extend(_fetch_kuaidaili_page(base_url, max_retries, retry_delay))

    can_use = check_china_ips(proxies_list)
    print(f'获取到的代理ip数量: {len(proxies_list)} 。Get proxy ips: {len(proxies_list)}.')
    print(f'能用的代理数量: {len(can_use)}。Usable proxy ips: {len(can_use)}.')
    return can_use


def _parse_openproxy_page(html):
    """Extract ``{"http": "ip:port"}`` dicts from the openproxy.space page.

    Raises:
        IndexError: If the expected script element or data pattern is not
            present in *html*.
    """
    tree = etree.HTML(html)
    scripts = tree.xpath("//script")
    content = scripts[_OPENPROXY_SCRIPT_INDEX].xpath(".//text()")
    data = _OPENPROXY_DATA_RE.findall(content[0])
    groups = data[0].strip("[{").strip("}]").split("},{")

    proxies = []
    for group in groups:
        items = _OPENPROXY_ITEMS_RE.findall(group)
        if not items:
            # Group without a bracketed address list: nothing to collect.
            continue
        for address in items[0].split(","):
            # Lower-case scheme key so requests actually uses the proxy.
            proxies.append({"http": address.strip("\"")})
    return proxies


def get_us_free_proxy(pages=10):
    """Scrape HTTP proxies from openproxy.space and keep the ones that work.

    The first ``pages * 15`` scraped proxies are filtered with
    :func:`check_us_ips`. Two summary lines are printed.

    Args:
        pages: Controls how many proxies are checked (15 per "page").

    Returns:
        List of ``{"http": "ip:port"}`` dicts that passed the check, or an
        empty list when the listing page did not return HTTP status 200.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
        IndexError: If the page layout does not match what the parser
            expects.
    """
    url = "https://openproxy.space/list/http"
    response = requests.get(url, headers=_HEADERS, timeout=_FETCH_TIMEOUT)
    if response.status_code != 200:
        print("Connection Error. Please make sure that your computer now have the access to Google.com")
        # There is nothing meaningful to parse in a failed response.
        return []

    total = pages * 15
    proxies_list = _parse_openproxy_page(response.text)[:total]
    can_use = check_us_ips(proxies_list)
    print(f'Get proxy ips: {len(proxies_list)}.')
    print(f'Usable proxy ips: {len(can_use)}.')
    return can_use


class Kuaidaili:
    """Credentials holder that builds a requests proxy mapping for a tunnel."""

    def __init__(self, tunnel, username, password):
        """Store the tunnel address and the credentials used to reach it."""
        self.tunnel = tunnel
        self.username = username
        self.password = password

    def get_kuaidaili_tunnel_proxy(self):
        """Return a ``proxies`` dict routing http and https via the tunnel.

        Both schemes map to the same ``http://user:password@tunnel/`` URL.
        """
        proxy_url = f"http://{self.username}:{self.password}@{self.tunnel}/"
        return {"http": proxy_url, "https": proxy_url}