Spaces:

drift-ai
/

faq-website

Runtime error

App Files Files Community

faq-website / scrape_website.py

vincentclaes

process 1 page only

1fdb555 over 1 year ago

raw

history blame

2.2 kB

	import requests
	from bs4 import BeautifulSoup

	# Upper bound intended for truncating scraped content.
	# NOTE(review): no executable statement in the visible part of this module
	# reads this constant, and its unit (tokens, characters, pages) is not
	# established by live code here — confirm before changing or removing it.
	TOKEN_CUT_OFF = 2500

	def process_webpage(url: str, *, timeout: float = 30.0) -> str:
	    """Fetch a single web page and return the text of its content tags.

	    Only ``url`` itself is downloaded; links found on the page are not
	    followed. Text is gathered one tag type at a time (every ``<p>``
	    element first, then every ``<h1>``, and so on through ``<li>``), so the
	    result is grouped by tag type rather than being in document order.
	    Each element's text is followed by a single space.

	    The collected text is printed to stdout before it is returned.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the server before giving up. Passed
	            straight to ``requests.get`` so a stalled server cannot block
	            the caller forever.

	    Returns:
	        The concatenated element text, or an empty string when the page
	        contains none of the inspected tags.

	    Raises:
	        requests.exceptions.RequestException: If the request fails or
	            exceeds ``timeout``.
	    """
	    content_tags = ("p", "h1", "h2", "h3", "h4", "h5", "h6", "li")

	    response = requests.get(url, timeout=timeout)

	    # Parse the raw response bytes with the built-in "html.parser" backend.
	    soup = BeautifulSoup(response.content, "html.parser")

	    # Collect tag type by tag type to keep the established output order.
	    pieces = []
	    for tag in content_tags:
	        pieces.extend(element.get_text() for element in soup.find_all(tag))

	    # One trailing space per element, joined once instead of repeated "+=".
	    text_content = "".join(f"{piece} " for piece in pieces)

	    print(text_content)
	    return text_content

	if __name__ == "__main__":
	    # Manual smoke run: scrape the landing page when executed as a script.
	    landing_page = "https://www.meet-drift.ai/"
	    process_webpage(url=landing_page)