import requests
from bs4 import BeautifulSoup

TOKEN_CUT_OFF = 2500

# HTML tags whose text content is worth extracting
TEXT_TAGS = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]


def extract_text(soup: BeautifulSoup) -> str:
    """Concatenate the text of all relevant tags in a parsed page."""
    return " ".join(
        element.get_text()
        for tag in TEXT_TAGS
        for element in soup.find_all(tag)
    )


def process_webpage(url: str) -> str:
    # A set to keep track of visited pages
    visited_pages = set()
    text_list = []

    # A function to recursively collect the text of all child pages.
    # Only hrefs that contain the parent URL are followed, which keeps the
    # crawl on the same site but skips relative links.
    def get_child_pages(page_url):
        # Mark the page as visited before fetching so it is never re-queued
        visited_pages.add(page_url)

        # Make a GET request to the page and parse the HTML content
        response = requests.get(page_url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

        # Get all the text content from the relevant HTML tags
        text_content = f"page {page_url} contains: " + extract_text(soup)

        # Find all the child links and recursively get their text content
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and href not in visited_pages and page_url in href:
                get_child_pages(href)

        text_list.append(text_content)

    # Recursive crawl of child pages (currently disabled):
    # get_child_pages(url)

    # Get the text content of the landing page only
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    text_content = extract_text(soup)

    # Aggregation path used with the recursive crawl (currently disabled).
    # Note: the slice below caps the number of *pages*, not tokens, despite
    # the constant's name.
    # text_list.reverse()  # make the main page the first item
    # page_content = "\n".join(text_list[:TOKEN_CUT_OFF])
    # # Print the text content of the landing page and all child pages
    # print(page_content)
    # return page_content

    print(text_content)
    return text_content


if __name__ == "__main__":
    process_webpage(url="https://www.meet-drift.ai/")
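

# A minimal sketch (not part of the original script) of how the collected
# text could be capped by an approximate token budget rather than a page
# count, which seems closer to what the name TOKEN_CUT_OFF suggests. The
# ~4-characters-per-token ratio is an assumed rough heuristic for English
# text, and `truncate_to_token_budget` is a hypothetical helper name.
def truncate_to_token_budget(text: str, max_tokens: int = TOKEN_CUT_OFF) -> str:
    """Return a prefix of `text` that fits roughly within `max_tokens`."""
    max_chars = max_tokens * 4  # assumed ~4 chars per token
    if len(text) <= max_chars:
        return text
    # Cut at the last whole word inside the budget
    return text[:max_chars].rsplit(" ", 1)[0]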