# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.skill_definition import sk_function
from bs4 import BeautifulSoup
import re, aiohttp


class WebPagesPlugin:
    """
    A plugin to interact with web pages, e.g. download the text content of a page.
    """

    # A run of three or more CR/LF characters; replaced by exactly one blank line.
    _BLANK_LINE_RUN = re.compile(r"[\r\n][\r\n]{2,}")

    @sk_function(
        description="Fetch the text content of a webpage. The return is a string containing all the text.",
        name="fetch_webpage",
        input_description="URL of the page to fetch.",
    )
    async def fetch_webpage(self, input: str) -> str:
        """
        A native function that fetches the text content of a webpage.
        HTML tags are removed, and empty lines are compacted.

        Args:
            input: URL of the page to fetch. The name is kept as `input`
                because it is part of the function's public signature.

        Returns:
            The text of the page with script, style, iframe, img, video and
            audio elements removed and runs of three or more newline
            characters collapsed to "\\n\\n".

        Raises:
            ValueError: If `input` is `None` or an empty string.
            aiohttp.ClientResponseError: If the server answers with an HTTP
                error status (the request is made with `raise_for_status=True`).
        """
        if not input:
            raise ValueError("url cannot be `None` or empty")

        # Only the download needs the network: read the whole body, then leave
        # both context managers so the connection and session are released
        # before the (CPU-bound) HTML parsing starts.
        async with aiohttp.ClientSession() as session:
            async with session.get(input, raise_for_status=True) as response:
                html = await response.text()

        return self._html_to_text(html)

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip non-text elements from `html` and return its compacted text."""
        soup = BeautifulSoup(html, features="html.parser")

        # Drop elements that carry no readable page text.
        for el in soup(["script", "style", "iframe", "img", "video", "audio"]):
            el.extract()

        # Get the text and compact runs of empty lines.
        text = soup.get_text()
        return WebPagesPlugin._BLANK_LINE_RUN.sub("\n\n", text)