# faq-website/scrape_website.py
import requests
from bs4 import BeautifulSoup
from typing import List

# Maximum number of characters of aggregated text returned to the caller,
# presumably to keep the scraped content within a downstream model's limits.
CHARACTER_CUT_OFF = 20000


def remove_tags(soup: BeautifulSoup) -> str:
    # Remove style and script tags, whose contents are not visible text
    for data in soup(["style", "script"]):
        data.decompose()
    # Return the remaining text content, whitespace-normalized
    return " ".join(soup.stripped_strings)


def read_webpage(url: str) -> str:
    print(f"Getting the response from url: {url}")
    response = requests.get(url)
    html_content = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Get all the text content from the relevant HTML tags
    text_content = remove_tags(soup)
    # Alternative approach: collect text from selected tags only
    # for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]:
    #     for element in soup.find_all(tag):
    #         text_content += element.get_text() + " "
    print(text_content)
    return text_content
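

# A hedged, more defensive variant of read_webpage: the 10-second timeout and
# the choice to return an empty string on failure are illustrative assumptions,
# not behavior of the original script.
def read_webpage_safe(url: str, timeout: float = 10.0) -> str:
    try:
        response = requests.get(url, timeout=timeout)  # Assumed timeout value
        response.raise_for_status()  # Raise on 4xx/5xx status codes
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return ""
    soup = BeautifulSoup(response.content, "html.parser")
    return remove_tags(soup)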


def process_webpages(urls: List[str]) -> str:
    # A set to keep track of visited pages, so each URL is fetched only once
    visited_pages = set()
    aggregated_text = ""
    for url in urls:
        if url in visited_pages:
            continue
        visited_pages.add(url)
        aggregated_text += f"\nGetting the content of {url}:\n"
        aggregated_text += read_webpage(url)
    # Truncate the aggregated text to the configured character limit
    return aggregated_text[:CHARACTER_CUT_OFF]


if __name__ == "__main__":
    print(
        process_webpages(
            urls=[
                "https://www.example.org",
                "https://www.example.com",
                "https://www.imperial.ac.uk/stories/climate-action/",
                "https://support.worldwildlife.org/site/SPageNavigator/ActionsToFightClimateChange.html",
            ]
        )
    )