"""
Textractor module
"""

import requests

from bs4 import BeautifulSoup

from txtai.pipeline.segmentation import Segmentation

class Textractor(Segmentation):
    """
    Extracts text from HTML documents retrieved over HTTP.

    The text() method treats its input as a url, downloads it and returns the
    document text with all markup removed. Constructor arguments are forwarded
    unchanged to Segmentation.
    """

    # Seconds to wait for the remote server before giving up. Without a timeout,
    # requests waits indefinitely on an unresponsive server. Override on an
    # instance or subclass to change the limit.
    timeout = 30

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        All arguments are passed through, in order, to Segmentation.__init__,
        which defines their meaning.

        Args:
            sentences: forwarded to Segmentation
            lines: forwarded to Segmentation
            paragraphs: forwarded to Segmentation
            minlength: forwarded to Segmentation
            join: forwarded to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Downloads a url and returns its text content with HTML markup removed.

        The HTTP status code is not checked, the body of any response received
        is parsed and returned.

        Args:
            text: url to retrieve

        Returns:
            text extracted from the downloaded document

        Raises:
            requests.exceptions.RequestException: if the request fails or times out
        """

        # text is a url, bound the wait so an unresponsive server cannot block forever
        response = requests.get(text, timeout=self.timeout)
        html = response.text

        # Parse with the standard library backed parser and drop all tags
        soup = BeautifulSoup(html, features="html.parser")
        return soup.get_text()