import requests
from bs4 import BeautifulSoup
from typing import List

CHARACTER_CUT_OFF = 20000


def remove_tags(soup: BeautifulSoup) -> str:
    # Remove style and script tags, which carry no visible text
    for data in soup(["style", "script"]):
        data.decompose()
    # Return the remaining visible text, whitespace-normalized
    return " ".join(soup.stripped_strings)


def read_webpage(url: str) -> str:
    print(f"Getting the response from url: {url}")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_content = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Extract all the visible text content from the page
    text_content = remove_tags(soup)
    print(text_content)
    return text_content


def process_webpages(urls: List[str]) -> str:
    # A set to keep track of visited pages, so each URL is fetched only once
    visited_pages = set()
    aggregated_text = ""
    for url in urls:
        if url in visited_pages:
            continue
        visited_pages.add(url)
        aggregated_text += f"\nGetting the content of {url}:\n"
        aggregated_text += read_webpage(url)
    # Truncate so downstream consumers receive a bounded amount of text
    return aggregated_text[:CHARACTER_CUT_OFF]


if __name__ == "__main__":
    print(
        process_webpages(
            urls=[
                "https://www.example.org",
                "https://www.example.com",
                "https://www.imperial.ac.uk/stories/climate-action/",
                "https://support.worldwildlife.org/site/SPageNavigator/ActionsToFightClimateChange.html",
            ]
        )
    )