File size: 589 Bytes
9b24dfb
 
 
 
daa1a9a
 
9b24dfb
 
 
 
 
 
 
 
 
 
 
 
 
daa1a9a
 
 
 
9b24dfb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""
Textractor module
"""

import requests

from bs4 import BeautifulSoup

from txtai.pipeline.segmentation import Segmentation

class Textractor(Segmentation):
    """
    Extracts text from web pages.

    Overrides text() so that the input is treated as a URL: the page is downloaded with
    requests and its markup is stripped with BeautifulSoup, leaving only the text content.
    """

    def __init__(self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, timeout=30):
        """
        Creates a new Textractor.

        Args:
            sentences: forwarded unchanged to Segmentation.__init__
            lines: forwarded unchanged to Segmentation.__init__
            paragraphs: forwarded unchanged to Segmentation.__init__
            minlength: forwarded unchanged to Segmentation.__init__
            join: forwarded unchanged to Segmentation.__init__
            timeout: value passed as the timeout argument of requests.get (seconds),
                     None waits without limit
        """

        super().__init__(sentences, lines, paragraphs, minlength, join)

        # Stored on the instance so that text() can bound how long a download may block
        self.timeout = timeout

    def text(self, text):
        """
        Downloads the page at a URL and returns its text content.

        Args:
            text: URL of the page to download

        Returns:
            text of the page with all markup removed

        Raises:
            requests.exceptions.RequestException: if the request fails or times out
        """

        # Without a timeout, requests waits forever on a server that never answers
        response = requests.get(text, timeout=self.timeout)

        # NOTE: the HTTP status is not checked, the body of an error page is parsed like any other page
        soup = BeautifulSoup(response.text, features="html.parser")
        return soup.get_text()