File size: 589 Bytes
9b24dfb daa1a9a 9b24dfb daa1a9a 9b24dfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
"""
Textractor module
"""
import requests
from bs4 import BeautifulSoup
from txtai.pipeline.segmentation import Segmentation
class Textractor(Segmentation):
    """
    Extracts text from web pages.

    Downloads the HTML document at a url and returns its text content with the markup removed.
    All constructor options are forwarded unchanged to Segmentation.
    """

    # Seconds to wait on the server (connecting, and between bytes received) before giving up.
    # requests applies no timeout by default, so without this a stalled server would block
    # the caller indefinitely. Subclasses or callers can override this attribute.
    REQUEST_TIMEOUT = 30

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False):
        """
        Creates a new Textractor.

        Args:
            sentences: forwarded to Segmentation
            lines: forwarded to Segmentation
            paragraphs: forwarded to Segmentation
            minlength: forwarded to Segmentation
            join: forwarded to Segmentation
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

    def text(self, text):
        """
        Downloads a url and extracts the text of the returned HTML.

        NOTE(review): presumably overrides a hook that Segmentation calls per input - confirm
        against the parent class.

        Args:
            text: url to download

        Returns:
            text content of the page, as produced by BeautifulSoup.get_text()

        Raises:
            requests.exceptions.RequestException: if the request fails or times out
        """

        # text is a url; bound the wait so an unresponsive host cannot hang the pipeline
        response = requests.get(text, timeout=self.REQUEST_TIMEOUT)

        # Parse with the built-in html.parser and drop all markup
        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()
|