""" | |
Textractor module
""" | |
import requests | |
from bs4 import BeautifulSoup | |
from txtai.pipeline.segmentation import Segmentation | |
class Textractor(Segmentation):
    """
    Extracts text from web pages.

    The value handed to text() is treated as a URL: the page is downloaded and its
    HTML markup is stripped, leaving only the text content.
    """

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, timeout=30):
        """
        Creates a new Textractor.

        Args:
            sentences: forwarded unchanged to Segmentation
            lines: forwarded unchanged to Segmentation
            paragraphs: forwarded unchanged to Segmentation
            minlength: forwarded unchanged to Segmentation
            join: forwarded unchanged to Segmentation
            timeout: seconds to wait on the server before giving up, None disables the limit
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

        # Without a timeout requests waits forever on an unresponsive server
        self.timeout = timeout

    def text(self, text):
        """
        Downloads a URL and returns the text content of the page.

        Args:
            text: URL to retrieve

        Returns:
            page text with HTML markup removed

        Raises:
            requests.RequestException: if the request fails, times out or the server
                                       answers with a 4xx/5xx status
        """

        response = requests.get(text, timeout=self.timeout)

        # Fail loudly rather than return the body of an error page as document text
        response.raise_for_status()

        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()