Chris4K committed on
Commit
f5fb5bf
·
verified ·
1 Parent(s): fdb5b3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -7
app.py CHANGED
@@ -227,7 +227,8 @@ import requests
227
  from bs4 import BeautifulSoup
228
  from urllib.parse import urlparse, urljoin
229
 
230
- def get_links_from_page(url, visited_urls, domain_links):
 
231
  if url in visited_urls:
232
  return
233
 
@@ -246,15 +247,13 @@ def get_links_from_page(url, visited_urls, domain_links):
246
  for link in links:
247
  href = link.get('href')
248
  absolute_url = urljoin(base_url, href)
249
- parsed_url = urlparse(absolute_url)
250
-
251
- if parsed_url.netloc == urlparse(url).netloc:
252
- domain_links.add(absolute_url)
253
- get_links_from_page(absolute_url, visited_urls, domain_links)
254
 
255
  else:
256
  print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
257
 
 
258
  def get_all_links_from_domain(domain_url):
259
  visited_urls = set()
260
  domain_links = set()
@@ -288,7 +287,7 @@ fe_app = gr.ChatInterface(
288
  # load the model asynchronously on startup and save it into memory
289
  @app.on_event("startup")
290
  async def startup():
291
- domain_url = 'https://globl.contact/'
292
  links = get_all_links_from_domain(domain_url)
293
  print("Links from the domain:", links)
294
 
 
227
  from bs4 import BeautifulSoup
228
  from urllib.parse import urlparse, urljoin
229
 
230
+
231
+ def get_links_from_page(url, visited_urls, all_links):
232
  if url in visited_urls:
233
  return
234
 
 
247
  for link in links:
248
  href = link.get('href')
249
  absolute_url = urljoin(base_url, href)
250
+ all_links.add(absolute_url)
251
+ get_links_from_page(absolute_url, visited_urls, all_links)
 
 
 
252
 
253
  else:
254
  print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
255
 
256
+
257
  def get_all_links_from_domain(domain_url):
258
  visited_urls = set()
259
  domain_links = set()
 
287
  # load the model asynchronously on startup and save it into memory
288
  @app.on_event("startup")
289
  async def startup():
290
+ domain_url = 'https://www.bofrost.de/faq/'
291
  links = get_all_links_from_domain(domain_url)
292
  print("Links from the domain:", links)
293