vincentclaes committed on
Commit
0125da1
1 Parent(s): 5505694

implement a cut off

Browse files
Files changed (1) hide show
  1. scrape_website.py +4 -3
scrape_website.py CHANGED
@@ -1,6 +1,7 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
 
 
4
 
5
  def process_webpage(url:str):
6
  # A set to keep track of visited pages
@@ -40,11 +41,11 @@ def process_webpage(url:str):
40
 
41
  # make main page as first item
42
  text_list.reverse()
43
-
44
- page_content = "\n".join(text_list)
45
  # Print the text content of the landing page and all child pages
46
  print(page_content)
47
- return page_content
48
 
49
 
50
  if __name__ == '__main__':
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
 
4
+ TOKEN_CUT_OFF = 2500
5
 
6
  def process_webpage(url:str):
7
  # A set to keep track of visited pages
 
41
 
42
  # make main page as first item
43
  text_list.reverse()
44
+ text_list_cut_off = text_list[:TOKEN_CUT_OFF]
45
+ page_content = "\n".join(text_list_cut_off)
46
  # Print the text content of the landing page and all child pages
47
  print(page_content)
48
+ return
49
 
50
 
51
  if __name__ == '__main__':