import requests
from bs4 import BeautifulSoup
from typing import List

CHARACTER_CUT_OFF = 20000


def remove_tags(soup: BeautifulSoup) -> str:
    # Remove style and script tags, which carry no visible text
    for data in soup(["style", "script"]):
        data.decompose()
    # Return the remaining visible text, whitespace-normalized
    return " ".join(soup.stripped_strings)


def read_webpage(url: str) -> str:
    print(f"Getting the response from url: {url}")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_content = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Extract all the visible text content from the page
    text_content = remove_tags(soup)
    print(text_content)
    return text_content


def process_webpages(urls: List[str]) -> str:
    # A set to keep track of visited pages, so each URL is fetched only once
    visited_pages = set()
    aggregated_text = ""
    for url in urls:
        if url in visited_pages:
            continue
        visited_pages.add(url)
        aggregated_text += f"\nGetting the content of {url}:\n"
        aggregated_text += read_webpage(url)
    # Truncate so downstream consumers receive a bounded amount of text
    return aggregated_text[:CHARACTER_CUT_OFF]


if __name__ == "__main__":
    print(
        process_webpages(
            urls=[
                "https://www.example.org",
                "https://www.example.com",
                "https://www.imperial.ac.uk/stories/climate-action/",
                "https://support.worldwildlife.org/site/SPageNavigator/ActionsToFightClimateChange.html",
            ]
        )
    )