Spaces:
Runtime error
Runtime error
import requests | |
from bs4 import BeautifulSoup | |
# Numeric cut-off limit. Not referenced by any active code in the visible part
# of this file; presumably meant to cap how much scraped content is kept
# -- TODO confirm the intended unit (tokens vs. pages) before relying on it.
TOKEN_CUT_OFF = 2500
# Tags whose text is harvested. The output is grouped in this order (all <p>
# elements first, then all <h1>, ...), not in document order.
_TEXT_TAGS = ("p", "h1", "h2", "h3", "h4", "h5", "h6", "li")


def _extract_text(soup) -> str:
    """Return the text of every ``_TEXT_TAGS`` element, each followed by a space."""
    # A single join avoids repeated string concatenation inside the nested loop
    # while producing exactly the same string (including the trailing space).
    return "".join(
        element.get_text() + " "
        for tag in _TEXT_TAGS
        for element in soup.find_all(tag)
    )


def process_webpage(url: str, *, timeout: float = 30.0) -> str:
    """Fetch a web page and return the text of its paragraphs, headings and list items.

    Only the page at ``url`` is processed; links on the page are not followed.
    The extracted text is also printed to standard output.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The text of all ``p``, ``h1``-``h6`` and ``li`` elements, grouped by
        tag type in that order, with a single space appended after each
        element's text.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (for example on connection failure or timeout).
            The HTTP status code is not checked, so the body of an error page
            is parsed like any other response.
    """
    response = requests.get(url, timeout=timeout)

    # Pass the raw bytes so BeautifulSoup performs the encoding detection.
    soup = BeautifulSoup(response.content, "html.parser")

    text_content = _extract_text(soup)
    print(text_content)
    return text_content
if __name__ == '__main__':
    # Manual run against a sample site: performs a live HTTP request and
    # prints the text that process_webpage extracts from the landing page.
    process_webpage(url="https://www.meet-drift.ai/")