for876543 committed on
Commit bfc96df
Parent(s): 4470219

Update app.py

Files changed (1)
  1. app.py +3 -3
app.py CHANGED
@@ -33,7 +33,7 @@ os.makedirs("/home/user/app/data2", exist_ok=True)
 base_url_1 = os.environ.get("base_url_1")
 visited_urls = []
 counter = 0
-limit = 1000
+limit = 10000
 
 def scrape_page(url):
     global counter
@@ -62,7 +62,7 @@ def scrape_page(url):
     links = soup.find_all("a", href=True)
     for link in links:
         absolute_url = urljoin(url, link["href"])
-        if absolute_url not in visited_urls and absolute_url.startswith(base_url_1) and counter <= limit:
+        if absolute_url not in visited_urls and absolute_url.startswith(base_url_1) and 'tel' not in absolute_url and counter <= limit:
             content += "\n" + scrape_page(absolute_url)
 
     return ""
@@ -76,7 +76,7 @@ base_url_2 = os.environ.get("base_url_2")
 date_urls = [base_url_2+str(year)+"/" for year in range(2023,2010,-1)]
 visited_urls = []
 counter = 0
-limit = 1000
+limit = 10000
 
 def scrape_page(url):
     global counter
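
For context, below is a minimal sketch of the recursive crawler these hunks modify. Only the names visible in the diff (base_url_1, visited_urls, counter, limit, scrape_page, and the if condition) come from the commit; the requests fetch, the BeautifulSoup parse, the get_text() extraction, and the example.com fallback are assumptions filled in purely for illustration.

# Minimal sketch of the crawler pattern this commit touches, assuming the
# fetch/parse steps that fall outside the shown context lines.
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url_1 = os.environ.get("base_url_1", "https://example.com/")  # fallback is illustrative only
visited_urls = []
counter = 0
limit = 10000  # raised from 1000 in this commit

def scrape_page(url):
    global counter
    visited_urls.append(url)          # assumed bookkeeping; not visible in the hunks
    counter += 1

    response = requests.get(url)      # assumed fetch step
    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.get_text()         # placeholder for the real extraction logic

    links = soup.find_all("a", href=True)
    for link in links:
        absolute_url = urljoin(url, link["href"])
        # The 'tel' check added in this commit keeps tel: anchors (phone-number
        # links) from being followed as if they were pages.
        if (absolute_url not in visited_urls
                and absolute_url.startswith(base_url_1)
                and "tel" not in absolute_url
                and counter <= limit):
            content += "\n" + scrape_page(absolute_url)

    return ""  # the hunk ends with `return ""`; whatever persists `content` lies outside the diff

Read together, the two visible changes widen the crawl budget from 1000 to 10000 pages and keep tel: anchors out of the recursion; note that the substring test 'tel' not in absolute_url also skips any URL that merely contains "tel", such as a /hotels/ path.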