"""
Textractor module
"""

import os
from urllib.request import urlopen

from bs4 import BeautifulSoup

from txtai.pipeline.segmentation import Segmentation


class Textractor(Segmentation):
    """
    Extracts text from files.

    Content is read from a local file path or a URL, parsed as HTML with
    BeautifulSoup's "html.parser" backend and reduced to its text content.
    """

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        All arguments are forwarded unchanged to Segmentation.__init__; see that
        class for their meaning.

        Args:
            sentences: forwarded to Segmentation
            lines: forwarded to Segmentation
            paragraphs: forwarded to Segmentation
            minlength: forwarded to Segmentation
            join: forwarded to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Reads the content referenced by text and returns the extracted text.

        Args:
            text: local file path or URL (anything accepted by urllib.request.urlopen)

        Returns:
            text content as returned by BeautifulSoup.get_text()
        """

        # urlopen rejects plain filesystem paths (no scheme) with a ValueError, so
        # existing local files are read directly. The isinstance guard keeps
        # non-string inputs that urlopen accepts (such as Request objects) working.
        if isinstance(text, str) and os.path.isfile(text):
            with open(text, "rb") as handle:
                html = handle.read()
        else:
            # Context manager ensures the response is closed once read
            with urlopen(text) as response:
                html = response.read()

        soup = BeautifulSoup(html, features="html.parser")
        return soup.get_text()