faq-website / scrape_website.py
vincentclaes's picture
remove get child pages
4f7d130
raw
history blame
688 Bytes
import requests
from bs4 import BeautifulSoup
def process_webpage(url: str, timeout: float = 10.0) -> str:
    """Download a web page and return the text of its main content tags.

    The page is fetched with an HTTP GET, parsed with BeautifulSoup's
    built-in ``html.parser``, and the text of every ``<p>``, ``<h1>``-``<h6>``
    and ``<li>`` element is collected. Each element's text is followed by a
    single space. The result is printed to stdout and returned.

    Text is grouped by tag type in the order listed below (all paragraphs
    first, then all ``h1`` headings, and so on), not in document order.
    NOTE: an element nested inside another collected element (for example a
    ``<p>`` inside an ``<li>``) contributes its text once per match.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The concatenated text, or an empty string when no tag matches.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses rather than scraping an error page and
    # returning it as if it were the site's real content.
    response.raise_for_status()

    # Pass raw bytes so BeautifulSoup performs its own encoding detection.
    soup = BeautifulSoup(response.content, "html.parser")

    tags = ("p", "h1", "h2", "h3", "h4", "h5", "h6", "li")
    text_content = "".join(
        element.get_text() + " "
        for tag in tags
        for element in soup.find_all(tag)
    )

    print(text_content)
    return text_content
if __name__ == "__main__":
    # Manual smoke run: scrape a single hard-coded page when executed as a script.
    target_url = "https://www.meet-drift.ai/"
    process_webpage(url=target_url)