Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -236,7 +236,8 @@ def get_links_from_page(url, visited_urls, all_links):
|
|
236 |
return
|
237 |
|
238 |
visited_urls.add(url)
|
239 |
-
print(
|
|
|
240 |
response = requests.get(url)
|
241 |
|
242 |
if response.status_code == 200:
|
@@ -254,10 +255,40 @@ def get_links_from_page(url, visited_urls, all_links):
|
|
254 |
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
255 |
|
256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
def get_all_links_from_domain(domain_url):
    """Crawl starting at ``domain_url`` and return the links gathered.

    Two fresh sets are created -- one tracking pages already visited, one
    accumulating links -- and handed to ``get_links_from_page``, which fills
    them in place. Only the set of accumulated links is returned.
    """
    seen_pages = set()
    collected = set()
    get_links_from_page(domain_url, seen_pages, collected)
    return collected
|
262 |
|
263 |
|
|
|
236 |
return
|
237 |
|
238 |
visited_urls.add(url)
|
239 |
+
print("Visitied" + visited_urls.lenght)
|
240 |
+
print("Getting next" + url)
|
241 |
response = requests.get(url)
|
242 |
|
243 |
if response.status_code == 200:
|
|
|
255 |
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
256 |
|
257 |
|
258 |
+
def get_links_from_page(url, visited_urls, all_links, base_domain):
    """Recursively collect same-domain links reachable from ``url``.

    Fetches ``url``, looks at every ``<a href=...>`` in the response, adds
    each link whose host matches ``base_domain`` to ``all_links`` and then
    crawls that link in turn.

    Args:
        url: Absolute URL of the page to fetch.
        visited_urls: Set of URLs already fetched; mutated in place.
        all_links: Set receiving every same-domain link found; mutated in
            place.
        base_domain: Host the crawl must stay on. Accepts either a bare host
            ("example.com") or a full URL ("https://example.com/"), from
            which the host part is extracted.

    Returns:
        None. Results are accumulated in ``visited_urls`` and ``all_links``.
    """
    if url in visited_urls:
        return

    # Crawl budget: no new page is fetched once more than 25 were visited.
    if len(visited_urls) > 25:
        return

    visited_urls.add(url)
    # Sets expose their size through len(), and an int must be converted
    # before it can be concatenated to a str.
    print("Visitied" + str(len(visited_urls)))
    print("Getting next" + url)

    # Callers may pass a full URL; links are compared by host, so reduce the
    # argument to its netloc (a bare host has no netloc and is used as-is).
    target_host = urlparse(base_domain).netloc or base_domain

    try:
        # Without a timeout a single unresponsive page blocks the crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # One unreachable page should not abort the rest of the crawl.
        print(f"Failed to retrieve content from {url}. Error: {exc}")
        return

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):
            # Resolve against the page URL (not just scheme://host) so that
            # relative links on pages in sub-directories keep their path.
            absolute_url = urljoin(url, link.get('href'))

            if urlparse(absolute_url).netloc == target_host:
                all_links.add(absolute_url)
                get_links_from_page(absolute_url, visited_urls, all_links, base_domain)
    else:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
286 |
+
|
287 |
+
|
288 |
def get_all_links_from_domain(domain_url):
    """Crawl ``domain_url`` and return the same-domain links that were found.

    Args:
        domain_url: URL at which the crawl starts, e.g. "https://example.com".

    Returns:
        The set of links accumulated by ``get_links_from_page``.
    """
    # Local import so this block does not depend on the file's import list.
    from urllib.parse import urlparse

    visited_urls = set()
    domain_links = set()
    # get_links_from_page compares each link's netloc with base_domain, so it
    # must receive the host ("example.com"), not the full start URL; passing
    # the URL makes every comparison fail and the result empty.
    base_domain = urlparse(domain_url).netloc or domain_url
    get_links_from_page(domain_url, visited_urls, domain_links, base_domain)
    return domain_links
|
293 |
|
294 |
|