Chris4K committed on
Commit
dc8a06c
·
verified ·
1 Parent(s): f5fb5bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -2
app.py CHANGED
@@ -236,7 +236,8 @@ def get_links_from_page(url, visited_urls, all_links):
236
  return
237
 
238
  visited_urls.add(url)
239
- print(url)
 
240
  response = requests.get(url)
241
 
242
  if response.status_code == 200:
@@ -254,10 +255,40 @@ def get_links_from_page(url, visited_urls, all_links):
254
  print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
255
 
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  def get_all_links_from_domain(domain_url):
258
  visited_urls = set()
259
  domain_links = set()
260
- get_links_from_page(domain_url, visited_urls, domain_links)
261
  return domain_links
262
 
263
 
 
236
  return
237
 
238
  visited_urls.add(url)
239
+ print("Visitied" + visited_urls.lenght)
240
+ print("Getting next" + url)
241
  response = requests.get(url)
242
 
243
  if response.status_code == 200:
 
255
  print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
256
 
257
 
def get_links_from_page(url, visited_urls, all_links, base_domain, max_visited=25):
    """Recursively collect links on the same host, starting from ``url``.

    The page at ``url`` is fetched, every ``<a href=...>`` target is resolved
    to an absolute URL, and targets whose host matches ``base_domain`` are
    recorded and crawled in turn. Results are accumulated in the sets supplied
    by the caller; nothing is returned.

    Args:
        url: Absolute URL of the page to fetch.
        visited_urls: Set of URLs already fetched; updated in place.
        all_links: Set receiving every same-host link found; updated in place.
        base_domain: Host to stay on. May be a bare host ("example.com") or a
            full URL, in which case its host part is used.
        max_visited: The crawl stops fetching once ``visited_urls`` holds more
            than this many entries.

    Returns:
        None.
    """
    if url in visited_urls or len(visited_urls) > max_visited:
        return

    # Links are matched on their netloc, so reduce a full URL to its host.
    # A bare host has an empty netloc when parsed and is kept unchanged.
    domain = urlparse(base_domain).netloc or base_domain

    visited_urls.add(url)
    # Sets have no length attribute; len() is required, and the count must be
    # formatted rather than concatenated to a str.
    print(f"Visited {len(visited_urls)}")
    print("Getting next" + url)

    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # One unreachable link should not abort the rest of the crawl.
        print(f"Failed to retrieve content from {url}. Error: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', href=True):
        # Resolve against the page URL (not just scheme://host) so relative
        # hrefs such as "next.html" keep the current directory.
        absolute_url = urljoin(url, link['href'])
        if urlparse(absolute_url).netloc == domain:
            all_links.add(absolute_url)
            get_links_from_page(absolute_url, visited_urls, all_links, domain, max_visited)
286
+
287
+
def get_all_links_from_domain(domain_url):
    """Crawl ``domain_url`` and return the same-host links that were found.

    Args:
        domain_url: Absolute URL (including scheme) at which the crawl starts.

    Returns:
        A set of absolute URLs whose host matches the host of ``domain_url``.
    """
    visited_urls = set()
    domain_links = set()
    # get_links_from_page compares each link's netloc with its last argument,
    # so pass the host part; the full URL would never match any link.
    base_domain = urlparse(domain_url).netloc
    get_links_from_page(domain_url, visited_urls, domain_links, base_domain)
    return domain_links
293
 
294