Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -33,7 +33,7 @@ os.makedirs("/home/user/app/data2", exist_ok=True)
|
|
33 |
base_url_1 = os.environ.get("base_url_1")
|
34 |
visited_urls = []
|
35 |
counter = 0
|
36 |
-
limit =
|
37 |
|
38 |
def scrape_page(url):
|
39 |
global counter
|
@@ -62,7 +62,7 @@ def scrape_page(url):
|
|
62 |
links = soup.find_all("a", href=True)
|
63 |
for link in links:
|
64 |
absolute_url = urljoin(url, link["href"])
|
65 |
-
if absolute_url not in visited_urls and absolute_url.startswith(base_url_1) and counter <= limit:
|
66 |
content += "\n" + scrape_page(absolute_url)
|
67 |
|
68 |
return ""
|
@@ -76,7 +76,7 @@ base_url_2 = os.environ.get("base_url_2")
|
|
76 |
date_urls = [base_url_2+str(year)+"/" for year in range(2023,2010,-1)]
|
77 |
visited_urls = []
|
78 |
counter = 0
|
79 |
-
limit =
|
80 |
|
81 |
def scrape_page(url):
|
82 |
global counter
|
|
|
33 |
base_url_1 = os.environ.get("base_url_1")
|
34 |
visited_urls = []
|
35 |
counter = 0
|
36 |
+
limit = 10000
|
37 |
|
38 |
def scrape_page(url):
|
39 |
global counter
|
|
|
62 |
links = soup.find_all("a", href=True)
|
63 |
for link in links:
|
64 |
absolute_url = urljoin(url, link["href"])
|
65 |
+
if absolute_url not in visited_urls and absolute_url.startswith(base_url_1) and 'tel' not in absolute_url and counter <= limit:
|
66 |
content += "\n" + scrape_page(absolute_url)
|
67 |
|
68 |
return ""
|
|
|
76 |
date_urls = [base_url_2+str(year)+"/" for year in range(2023,2010,-1)]
|
77 |
visited_urls = []
|
78 |
counter = 0
|
79 |
+
limit = 10000
|
80 |
|
81 |
def scrape_page(url):
|
82 |
global counter
|