vincentclaes committed on
Commit
1fdb555
1 Parent(s): 679ff32

process 1 page only

Browse files
Files changed (1) hide show
  1. scrape_website.py +22 -8
scrape_website.py CHANGED
@@ -37,16 +37,30 @@ def process_webpage(url:str):
37
  text_list.append(text_content)
38
 
39
  # Get the text content of the landing page
40
- get_child_pages(url)
41
 
42
- # make main page as first item
43
- text_list.reverse()
44
- text_list_cut_off = text_list[:TOKEN_CUT_OFF]
45
- page_content = "\n".join(text_list_cut_off)
46
- # Print the text content of the landing page and all child pages
47
- print(page_content)
48
- return page_content
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  if __name__ == '__main__':
52
  process_webpage(url="https://www.meet-drift.ai/")
 
37
  text_list.append(text_content)
38
 
39
  # Get the text content of the landing page
40
+ # get_child_pages(url)
41
 
42
+ # Make a GET request to the page and get the HTML content
43
+ response = requests.get(url)
44
+ html_content = response.content
 
 
 
 
45
 
46
+ # Parse the HTML content using BeautifulSoup
47
+ soup = BeautifulSoup(html_content, "html.parser")
48
+
49
+ # Get all the text content from the relevant HTML tags
50
+ text_content = ""
51
+ for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
52
+ for element in soup.find_all(tag):
53
+ text_content += element.get_text() + " "
54
+
55
+ # # make main page as first item
56
+ # text_list.reverse()
57
+ # text_list_cut_off = text_list[:TOKEN_CUT_OFF]
58
+ # page_content = "\n".join(text_list_cut_off)
59
+ # # Print the text content of the landing page and all child pages
60
+ # print(page_content)
61
+ # return page_content
62
+ print(text_content)
63
+ return text_content
64
 
65
  if __name__ == '__main__':
66
  process_webpage(url="https://www.meet-drift.ai/")