txtai / textractor.py
davidmezzetti's picture
Update textractor.py
daa1a9a
raw
history blame
589 Bytes
"""
Textractor module
"""
import requests
from bs4 import BeautifulSoup
from txtai.pipeline.segmentation import Segmentation
class Textractor(Segmentation):
    """
    Extracts text from web pages.

    The value handed to text() is treated as a URL: the page is downloaded, parsed as HTML
    and reduced to its plain text content.
    """

    # Seconds to wait on the server before giving up. requests applies no timeout by default,
    # so without this a stalled connection would block the caller indefinitely.
    TIMEOUT = 30

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        Every argument is forwarded unchanged, in the same order, to Segmentation.__init__.
        NOTE(review): the meaning of each option is defined by Segmentation, which is not
        visible from this file - confirm there.

        Args:
            sentences: forwarded to Segmentation
            lines: forwarded to Segmentation
            paragraphs: forwarded to Segmentation
            minlength: forwarded to Segmentation
            join: forwarded to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Downloads a url and returns the text content of the page.

        NOTE(review): presumably overrides a method of the same name on Segmentation - confirm.

        Args:
            text: url to retrieve

        Returns:
            text of the page with the HTML markup removed

        Raises:
            requests.exceptions.RequestException: if the request fails, including when the server
                does not respond within TIMEOUT seconds
        """

        # text is a url
        response = requests.get(text, timeout=self.TIMEOUT)

        # The HTTP status is not checked: the body of an error response is parsed like any other page
        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()