Mokshith Salian committed
Commit 5ff752e · 1 Parent(s): 4db655a

modified app and scraper file

Files changed (2):
  1. app.py +21 -36
  2. secure_scraper.py +62 -87
app.py CHANGED
@@ -6,6 +6,7 @@ import os
 import time
 import sys
 import requests
+import asyncio

 # Import our custom classes
 from secure_scraper import SecureScraper
@@ -94,42 +95,13 @@ def main():
     st.subheader("Privacy Settings")
     use_proxy = st.checkbox("Use Proxy Rotation", value=False)
     use_user_agent = st.checkbox("Use User-Agent Rotation", value=True)
-
-    test_mode = st.sidebar.checkbox("Enable Test Mode", value=False)
-
-    # If in test mode, show a simplified test interface
-    if test_mode:
-        st.header("🔍 Test Mode")
-        st.info("This mode lets you test basic web connectivity without the full pipeline")

-        test_url = st.text_input("Test URL", "https://www.example.com")
-
-        if st.button("Test Connection"):
-            try:
-                with st.spinner("Testing connection..."):
-                    # First try with requests for basic connectivity
-                    basic_response = requests.get(test_url, timeout=10)
-                    st.success(f"Basic HTTP connection successful: Status {basic_response.status_code}")
-
-                    # Then try with our crawler
-                    st.info("Now testing with crawl4ai integration...")
-                    test_scraper = SecureScraper()
-                    result = test_scraper.scrape_url(test_url)
-
-                    if result['status'] == 'success':
-                        st.success(f"crawl4ai connection successful")
-                        st.write("Privacy settings used:")
-                        st.json(result['privacy'])
-
-                        with st.expander("Response Preview"):
-                            st.write(result['data']['title'])
-                            st.write(result['data']['text'][:1000] + "..." if len(result['data']['text']) > 1000 else result['data']['text'])
-                    else:
-                        st.error(f"crawl4ai connection failed: {result['message']}")
-
-            except Exception as e:
-                st.error(f"Connection failed: {str(e)}")
-                st.code(traceback.format_exc())
+    # Add AsyncWebCrawler specific settings
+    st.subheader("Crawler Settings")
+    max_connections = st.slider("Max Connections", min_value=1, max_value=20, value=10)
+    timeout_seconds = st.slider("Request Timeout (seconds)", min_value=5, max_value=60, value=30)
+    max_retries = st.slider("Max Retries", min_value=1, max_value=10, value=5)
+

     # Input section
     st.header("Scraping Target")
@@ -173,9 +145,14 @@ def main():
             "http://example-proxy2.com:8080"
         ]

-    # Initialize the scraper
+    # Initialize the scraper with updated parameters
     scraper = SecureScraper(proxy_list=proxy_list)

+    # Update AsyncWebCrawler settings based on user input
+    scraper.crawler.max_connections = max_connections
+    scraper.crawler.timeout = timeout_seconds
+    scraper.crawler.random_user_agent = use_user_agent
+
     error_placeholder = st.empty()

     # Perform scraping
@@ -231,6 +208,14 @@ def main():
         logging.error(f"LLM processing error: {str(e)}")
         logging.error(traceback.format_exc())

+# Create a utility for running async code in Streamlit
+def run_async_code(coro):
+    """Run an async coroutine in a Streamlit app."""
+    try:
+        loop = asyncio.new_event_loop()
+        return loop.run_until_complete(coro)
+    finally:
+        loop.close()

 if __name__ == "__main__":
     main()
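For context, a minimal sketch of how the new run_async_code helper and the crawler controls above could be wired into a Streamlit page. The demo_scrape function, its widget labels, and the example URL are illustrative only and not part of this commit; it assumes the async_scrape_url coroutine added in secure_scraper.py below and the {'status', 'data', 'message'} result shape that app.py reads.

# Hypothetical illustration only -- not part of this commit.
import asyncio
import streamlit as st
from secure_scraper import SecureScraper

def run_async_code(coro):
    """Run an async coroutine from Streamlit (same pattern as the helper above)."""
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()

def demo_scrape():
    url = st.text_input("URL to scrape", "https://www.example.com")
    if st.button("Scrape"):
        scraper = SecureScraper()
        # async_scrape_url is the coroutine added in secure_scraper.py below
        result = run_async_code(scraper.async_scrape_url(url))
        if result.get('status') == 'success':
            st.json(result.get('data', {}))
        else:
            st.error(result.get('message', 'Unknown error'))

Streamlit reruns the script on every interaction, so the helper creates and closes a fresh event loop per call rather than relying on a long-lived one.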
secure_scraper.py CHANGED
@@ -2,26 +2,29 @@ import random
 import logging
 import time
 import json
-import crawl4ai
+import asyncio
+from crawl4ai import AsyncWebCrawler

 class SecureScraper:
     def __init__(self, proxy_list=None):
-        # Initialize with crawl4ai - adjust parameters according to the actual API
+        # Initialize with AsyncWebCrawler from crawl4ai
         self.use_proxies = bool(proxy_list)
         self.proxy_list = proxy_list

-        # Initialize crawler based on actual crawl4ai structure
-        self.crawler = crawl4ai.Crawler() if hasattr(crawl4ai, 'Crawler') else None
-
-        # If the direct Crawler class doesn't exist, try to use the package's main functionality
-        if self.crawler is None:
-            self.crawler = crawl4ai
+        # Initialize async crawler
+        self.crawler = AsyncWebCrawler(
+            max_connections=10,
+            timeout=30,
+            proxies=self.proxy_list if self.use_proxies and self.proxy_list else None,
+            follow_redirects=True,
+            random_user_agent=True  # Enable random user agent rotation
+        )

         logging.basicConfig(level=logging.INFO)

-    def scrape_url(self, url, css_selectors=None):
+    async def async_scrape_url(self, url, css_selectors=None):
         """
-        Scrape a URL with privacy protection measures
+        Asynchronously scrape a URL with privacy protection measures

         Args:
             url: URL to scrape
@@ -36,89 +39,49 @@ class SecureScraper:
                 proxy_status = "using proxy" if self.use_proxies else "without proxy"
                 logging.info(f"Scraping {url} (Attempt {current_retry+1}/{max_retries}) {proxy_status}")

-                # Use the crawl4ai functionality to scrape the URL
-                # Adapt this based on the actual crawl4ai API
-                if hasattr(self.crawler, 'fetch'):
-                    page_data = self.crawler.fetch(url, proxy=self.proxy_list[0] if self.use_proxies and self.proxy_list else None)
-                elif hasattr(self.crawler, 'scrape'):
-                    page_data = self.crawler.scrape(url, proxy=self.proxy_list[0] if self.use_proxies and self.proxy_list else None)
-                elif hasattr(self.crawler, 'get'):
-                    page_data = self.crawler.get(url, proxy=self.proxy_list[0] if self.use_proxies and self.proxy_list else None)
-                else:
-                    # Fallback to crawl4ai's default call pattern
-                    page_data = self.crawler(url, proxy=self.proxy_list[0] if self.use_proxies and self.proxy_list else None)
-
-                # Extract content based on the returned data structure
-                # This will need to be adjusted based on what crawl4ai actually returns
+                # Use AsyncWebCrawler to fetch the page
+                response = await self.crawler.arun(url)

-                # Create a basic result structure
-                if not css_selectors:
-                    # Default extraction if no selectors provided
-                    if hasattr(page_data, 'title'):
-                        title = page_data.title
-                    elif hasattr(page_data, 'get_title'):
-                        title = page_data.get_title()
-                    else:
-                        title = "Title extraction not supported"
-
-                    if hasattr(page_data, 'text'):
-                        text = page_data.text[:10000]  # Limit text size
-                    elif hasattr(page_data, 'get_text'):
-                        text = page_data.get_text()[:10000]
-                    else:
-                        text = "Text extraction not supported"
-
-                    if hasattr(page_data, 'links'):
-                        links = page_data.links[:20]  # Limit links
-                    elif hasattr(page_data, 'get_links'):
-                        links = page_data.get_links()[:20]
+                # Process the response based on content type
+                if response.is_html:
+                    page_data = await response.parse_html()
+
+                    # Create a basic result structure
+                    if not css_selectors:
+                        # Default extraction if no selectors provided
+                        title = page_data.title or "Title extraction not supported"
+                        text = page_data.text[:10000] if hasattr(page_data, 'text') else "Text extraction not supported"
+                        links = page_data.links[:20] if hasattr(page_data, 'links') else []
+
+                        result = {
+                            'title': title,
+                            'text': text,
+                            'links': links
+                        }
                     else:
-                        links = []
-
-                    result = {
-                        'title': title,
-                        'text': text,
-                        'links': links
-                    }
-                else:
-                    # Extract requested elements using CSS selectors
-                    result = {}
-                    for key, selector in css_selectors.items():
-                        if hasattr(page_data, 'select'):
+                        # Extract requested elements using CSS selectors
+                        result = {}
+                        for key, selector in css_selectors.items():
                             elements = page_data.select(selector)
-                        elif hasattr(page_data, 'query'):
-                            elements = page_data.query(selector)
-                        else:
-                            result[key] = f"Selector functionality not supported: {selector}"
-                            continue

-                        if elements:
-                            # If multiple elements match, create a list
-                            if isinstance(elements, list) and len(elements) > 1:
-                                if hasattr(elements[0], 'text'):
+                            if elements:
+                                # If multiple elements match, create a list
+                                if len(elements) > 1:
                                     result[key] = [elem.text for elem in elements]
-                                elif hasattr(elements[0], 'get_text'):
-                                    result[key] = [elem.get_text() for elem in elements]
                                 else:
-                                    result[key] = elements
-                            else:
-                                if hasattr(elements[0], 'text'):
                                     result[key] = elements[0].text
-                                elif hasattr(elements[0], 'get_text'):
-                                    result[key] = elements[0].get_text()
-                                else:
-                                    result[key] = str(elements[0])
-                        else:
-                            result[key] = f"No match for selector: {selector}"
+                            else:
+                                result[key] = f"No match for selector: {selector}"
+                else:
+                    # Handle non-HTML responses
+                    result = {
+                        'content_type': response.content_type,
+                        'content_length': len(response.content),
+                        'summary': 'Non-HTML content'
+                    }

-                # Get user agent info if available
-                user_agent = "Unknown"
-                if hasattr(self.crawler, 'current_user_agent'):
-                    user_agent = self.crawler.current_user_agent
-                elif hasattr(self.crawler, 'user_agent'):
-                    user_agent = self.crawler.user_agent
-                elif hasattr(page_data, 'user_agent'):
-                    user_agent = page_data.user_agent
+                # Get user agent info
+                user_agent = self.crawler.current_user_agent or "Unknown"

                 # Truncate for privacy
                 user_agent = user_agent[:30] + '...' if len(str(user_agent)) > 30 else user_agent
@@ -135,11 +98,23 @@
             except Exception as e:
                 logging.error(f"Request failed: {str(e)}")
                 current_retry += 1
-                time.sleep(random.uniform(2, 5))  # Incremental backoff
+                await asyncio.sleep(random.uniform(2, 5))  # Async sleep for backoff

                 # Try to rotate proxy if available
                 if self.use_proxies and self.proxy_list and len(self.proxy_list) > 1:
                     self.proxy_list = self.proxy_list[1:] + [self.proxy_list[0]]  # Rotate proxies
+                    # Update crawler's proxies
+                    await self.crawler.update_proxies(self.proxy_list)

         # If we've exhausted retries
-        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
+        return {'status': 'error', 'message': f"Failed after {max_retries} attempts"}
+
+    def scrape_url(self, url, css_selectors=None):
+        """
+        Synchronous wrapper for async_scrape_url
+
+        Args:
+            url: URL to scrape
+            css_selectors: Dict of elements to extract
+        """
+        return asyncio.run(self.async_scrape_url(url, css_selectors))
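A minimal usage sketch for the reworked SecureScraper, assuming only the methods added in this commit; the proxy URLs and CSS selectors are placeholders, and the success payload is assumed to keep the {'status', 'data', 'privacy'} shape that app.py reads.

# Hypothetical illustration only -- not part of this commit.
from secure_scraper import SecureScraper

proxies = [
    "http://example-proxy1.com:8080",
    "http://example-proxy2.com:8080",
]
scraper = SecureScraper(proxy_list=proxies)

# scrape_url is the synchronous wrapper; it drives async_scrape_url via asyncio.run()
result = scraper.scrape_url(
    "https://www.example.com",
    css_selectors={"headline": "h1", "paragraphs": "p"},
)

if result.get('status') == 'success':
    print(result['data'])           # selector keys -> extracted text
    print(result.get('privacy'))    # user agent / proxy info, truncated
else:
    print("Scrape failed:", result.get('message'))

Because scrape_url calls asyncio.run, it should only be used where no event loop is already running; from async code, await async_scrape_url directly (or go through a helper like run_async_code in app.py).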