Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -227,7 +227,8 @@ import requests
|
|
227 |
from bs4 import BeautifulSoup
|
228 |
from urllib.parse import urlparse, urljoin
|
229 |
|
230 |
-
|
|
|
231 |
if url in visited_urls:
|
232 |
return
|
233 |
|
@@ -246,15 +247,13 @@ def get_links_from_page(url, visited_urls, domain_links):
|
|
246 |
for link in links:
|
247 |
href = link.get('href')
|
248 |
absolute_url = urljoin(base_url, href)
|
249 |
-
|
250 |
-
|
251 |
-
if parsed_url.netloc == urlparse(url).netloc:
|
252 |
-
domain_links.add(absolute_url)
|
253 |
-
get_links_from_page(absolute_url, visited_urls, domain_links)
|
254 |
|
255 |
else:
|
256 |
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
257 |
|
|
|
258 |
def get_all_links_from_domain(domain_url):
|
259 |
visited_urls = set()
|
260 |
domain_links = set()
|
@@ -288,7 +287,7 @@ fe_app = gr.ChatInterface(
|
|
288 |
# load the model asynchronously on startup and save it into memory
|
289 |
@app.on_event("startup")
|
290 |
async def startup():
|
291 |
-
domain_url = 'https://
|
292 |
links = get_all_links_from_domain(domain_url)
|
293 |
print("Links from the domain:", links)
|
294 |
|
|
|
227 |
from bs4 import BeautifulSoup
|
228 |
from urllib.parse import urlparse, urljoin
|
229 |
|
230 |
+
|
231 |
+
def get_links_from_page(url, visited_urls, all_links):
|
232 |
if url in visited_urls:
|
233 |
return
|
234 |
|
|
|
247 |
for link in links:
|
248 |
href = link.get('href')
|
249 |
absolute_url = urljoin(base_url, href)
|
250 |
+
all_links.add(absolute_url)
|
251 |
+
get_links_from_page(absolute_url, visited_urls, all_links)
|
|
|
|
|
|
|
252 |
|
253 |
else:
|
254 |
print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
|
255 |
|
256 |
+
|
257 |
def get_all_links_from_domain(domain_url):
|
258 |
visited_urls = set()
|
259 |
domain_links = set()
|
|
|
287 |
# load the model asynchronously on startup and save it into memory
|
288 |
@app.on_event("startup")
|
289 |
async def startup():
|
290 |
+
domain_url = 'https://www.bofrost.de/faq/'
|
291 |
links = get_all_links_from_domain(domain_url)
|
292 |
print("Links from the domain:", links)
|
293 |
|