""" Textractor module """ import requests from bs4 import BeautifulSoup from txtai.pipeline.segmentation import Segmentation class Textractor(Segmentation): """ Extracts text from files. """ def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False): super().__init__(sentences, lines, paragraphs, minlength, join) def text(self, text): # text is a url response = requests.get(text) html = response.text soup = BeautifulSoup(html, features="html.parser") return soup.get_text()