xjf6b committed on
Commit
94be149
·
verified ·
1 Parent(s): adfd496

Create crawl.py

Browse files
Files changed (1) hide show
  1. crawl.py +85 -0
crawl.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import re
3
+ from bs4 import BeautifulSoup
4
+ import logging
5
+ import os
6
+
7
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
8
+
9
class AirPortCollector:
    """Scrape the public web view of the 'jichang_list' Telegram channel
    and collect airport (proxy-provider) listings from message texts.

    Collected entries accumulate in ``self.airports`` as dicts with any of
    the keys: ``name``, ``websites`` (list), ``channel``, ``group``.
    """

    def __init__(self):
        # Public t.me/s/ web view; 'before=457' pages back from message id 457.
        self.url = "https://t.me/s/jichang_list?before=457"
        # Accumulated results; one dict per message that matched a field.
        self.airports = []
        # Optional proxy URL from the environment, applied to both schemes.
        self.proxy = os.getenv('PROXY')

    def fetch_content(self):
        """Return the page HTML as text, or None on any request failure."""
        try:
            proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
            # NOTE(review): verify=False disables TLS certificate checking;
            # kept for compatibility, but consider verify=True.
            # Fix: a timeout is set so a stalled connection cannot hang the
            # crawler forever (requests has no default timeout).
            response = requests.get(self.url, verify=False, proxies=proxies,
                                    timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching content: {e}")
            return None

    def parse_content(self, content):
        """Parse message blocks out of *content* and append one dict per
        message that yielded at least one recognized field.

        Silently returns when *content* is falsy (e.g. fetch failed).
        """
        if not content:
            return

        soup = BeautifulSoup(content, 'html.parser')
        messages = soup.find_all('div', class_='tgme_widget_message_text')

        for message in messages:
            airport = {}
            text = message.get_text()

            # Each field is optional; labels are Chinese bullets in the
            # message body ("名称"=name, "官网"=website, "频道"=channel,
            # "群组"=group).
            name_match = re.search(r'⦁ 名称:\s*(.*)', text)
            if name_match:
                airport['name'] = name_match.group(1).strip()

            # A message may list several official website URLs.
            website_matches = re.findall(r'⦁ 官网:\s*(https?://\S+)', text)
            if website_matches:
                airport['websites'] = website_matches

            channel_match = re.search(r'⦁ 频道:\s*(@\S+)', text)
            if channel_match:
                airport['channel'] = channel_match.group(1)

            group_match = re.search(r'⦁ 群组:\s*(@\S+)', text)
            if group_match:
                airport['group'] = group_match.group(1)

            # Only messages that matched at least one field are recorded.
            if airport:
                self.airports.append(airport)

    def collect(self):
        """Fetch and parse in one step, logging the outcome."""
        content = self.fetch_content()
        if content:
            self.parse_content(content)
            logging.info(f"Collected {len(self.airports)} airports")
        else:
            logging.warning("Failed to fetch content")

    def get_airports(self):
        """Return the accumulated list of airport dicts."""
        return self.airports
69
+
70
def main():
    """Crawl the channel and write every collected website URL, one per
    line, to /app/subscribes.txt."""
    collector = AirPortCollector()
    collector.collect()
    airports = collector.get_airports()

    # Bug fix: the original logged len(airports) while claiming it was the
    # number of websites written -- entries without a 'websites' key write
    # nothing, and entries may hold several URLs. Count actual lines written.
    written = 0
    with open('/app/subscribes.txt', 'w', encoding='utf-8') as f:
        for airport in airports:
            for website in airport.get('websites', []):
                f.write(f"{website}\n")
                written += 1

    logging.info(f"Wrote {written} airport websites to subscribes.txt")

if __name__ == "__main__":
    main()