""" Textractor module """ from urllib.request import urlopen from bs4 import BeautifulSoup from txtai.pipeline.segmentation import Segmentation class Textractor(Segmentation): """ Extracts text from files. """ def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False): super().__init__(sentences, lines, paragraphs, minlength, join) def text(self, text): # text is a path to a file html = urlopen(text).read() soup = BeautifulSoup(html, features="html.parser") return soup.get_text()