Spaces:

xjf6b
/

ceshidddyyy

Running

App Files Files Community

xjf6b committed on Aug 30, 2024

Commit

94be149

verified ·

1 Parent(s): adfd496

Create crawl.py

Browse files

Files changed (1) hide show

crawl.py +85 -0

crawl.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import requests
+import re
+from bs4 import BeautifulSoup
+import logging
+import os
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class AirPortCollector:
    """Collect "airport" listings from a public Telegram channel preview page.

    The page at ``url`` is downloaded, every
    ``div.tgme_widget_message_text`` element is read as plain text, and the
    labelled fields (名称 / 官网 / 频道 / 群组, i.e. name / official website /
    channel / group) are pulled out with regular expressions.

    Attributes:
        url: Page to download.
        airports: List of dicts accumulated by :meth:`parse_content`.  Each
            dict may contain ``name`` (str), ``websites`` (list of str),
            ``channel`` (str) and ``group`` (str); absent fields are omitted.
        proxy: Proxy URL taken from the ``PROXY`` environment variable, or
            ``None`` when it is unset.
        timeout: Seconds to wait for the HTTP request.
        verify: Passed to ``requests.get(verify=...)``.
    """

    DEFAULT_URL = "https://t.me/s/jichang_list?before=457"
    DEFAULT_TIMEOUT = 30

    # Compiled once at class creation; parse_content applies them per message.
    _NAME_RE = re.compile(r'⦁ 名称:\s*(.*)')
    _WEBSITE_RE = re.compile(r'⦁ 官网:\s*(https?://\S+)')
    _CHANNEL_RE = re.compile(r'⦁ 频道:\s*(@\S+)')
    _GROUP_RE = re.compile(r'⦁ 群组:\s*(@\S+)')

    def __init__(self, url=None, timeout=DEFAULT_TIMEOUT, verify=True):
        """Create a collector.

        Args:
            url: Page to scrape; defaults to ``DEFAULT_URL``.
            timeout: Request timeout in seconds.  Without one, ``requests``
                waits indefinitely on a stalled connection.
            verify: TLS certificate verification setting forwarded to
                ``requests``.  Pass ``False`` only when an intercepting proxy
                with an untrusted certificate makes it unavoidable.
        """
        self.url = url or self.DEFAULT_URL
        self.airports = []
        self.proxy = os.getenv('PROXY')
        self.timeout = timeout
        self.verify = verify

    def fetch_content(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The page text, or ``None`` if the request failed (the error is
            logged, not raised).
        """
        proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
        try:
            response = requests.get(
                self.url,
                proxies=proxies,
                timeout=self.timeout,
                verify=self.verify,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error("Error fetching content: %s", e)
            return None
        return response.text

    @classmethod
    def extract_airport(cls, text):
        """Extract the labelled fields from one message's plain text.

        Args:
            text: Message text, one labelled field per line.

        Returns:
            A dict with whichever of ``name``, ``websites``, ``channel`` and
            ``group`` were found; empty when none matched.
        """
        airport = {}

        name_match = cls._NAME_RE.search(text)
        if name_match:
            airport['name'] = name_match.group(1).strip()

        # A listing may give several official-site lines; keep them all.
        websites = cls._WEBSITE_RE.findall(text)
        if websites:
            airport['websites'] = websites

        channel_match = cls._CHANNEL_RE.search(text)
        if channel_match:
            airport['channel'] = channel_match.group(1)

        group_match = cls._GROUP_RE.search(text)
        if group_match:
            airport['group'] = group_match.group(1)

        return airport

    def parse_content(self, content):
        """Parse page HTML and append every recognised listing to ``airports``.

        Args:
            content: HTML text; ``None`` or an empty string is ignored.
        """
        if not content:
            return
        soup = BeautifulSoup(content, 'html.parser')
        for message in soup.find_all('div', class_='tgme_widget_message_text'):
            # get_text() yields nothing for <br>, which would glue adjacent
            # lines together and let the line-oriented patterns over-capture.
            # Turn each <br> into a real newline first.
            for br in message.find_all('br'):
                br.replace_with('\n')
            airport = self.extract_airport(message.get_text())
            if airport:
                self.airports.append(airport)

    def collect(self):
        """Fetch the page and parse it, logging the outcome."""
        content = self.fetch_content()
        if content:
            self.parse_content(content)
            logging.info("Collected %d airports", len(self.airports))
        else:
            logging.warning("Failed to fetch content")

    def get_airports(self):
        """Return the list of collected airport dicts (the live list, not a copy)."""
        return self.airports
def main(output_path='/app/subscribes.txt'):
    """Collect airport listings and write their website URLs to a file.

    Args:
        output_path: Destination file, one URL per line.  The file is
            rewritten on every run, even when nothing was collected.
    """
    collector = AirPortCollector()
    collector.collect()
    airports = collector.get_airports()

    # Count the URLs actually written: an airport may list none or several,
    # so len(airports) is not the number of lines in the file.
    written = 0
    # Explicit encoding keeps the output independent of the process locale.
    with open(output_path, 'w', encoding='utf-8') as f:
        for airport in airports:
            for website in airport.get('websites', ()):
                f.write(f"{website}\n")
                written += 1
    logging.info("Wrote %d airport websites to %s", written, output_path)


if __name__ == "__main__":
    main()