File size: 1,931 Bytes
17af92c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b35800
17af92c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import sys
from time import sleep
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
import spacy


# spaCy pipeline used by line_correction() to split over-long lines into
# sentences. Loaded once at import time.
nlp = spacy.load("en_core_web_lg")


# Sets a MAX_FILE_SIZE attribute on the trafilatura config object that is
# passed to fetch_url() and extract() below.
# NOTE(review): if DEFAULT_CONFIG is a ConfigParser, plain attribute assignment
# probably does not change the "MAX_FILE_SIZE" option trafilatura reads —
# confirm whether DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", ...) was meant.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
# Lines shorter than this many characters are dropped by line_correction().
MIN_CHAR = 50
# Only the first MAX_CHAR characters of a long line are sent through spaCy.
MAX_CHAR = 5000


def get_page(url, retries=3, delay=3):
    """Download *url* with trafilatura, retrying when the fetch fails.

    Args:
        url: Address of the page to download.
        retries: Maximum number of fetch attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The value returned by ``trafilatura.fetch_url`` on the first attempt
        that yields something other than ``None``; ``None`` when every
        attempt raised or returned ``None``.
    """
    for attempt in range(retries):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort download: any fetch error counts as a failed attempt.
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C and
            # SystemExit working while a retry loop is running.
            page = None

        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page

        # No point in waiting once the last attempt has already failed.
        if attempt < retries - 1:
            sleep(delay)

    return None


def url2lines(url):
    """Fetch *url* and return its extracted text as a list of lines.

    Returns an empty list when the page could not be fetched.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)


def line_correction(lines, max_size=100):
    """Drop very short lines and break over-long lines into sentence groups.

    Args:
        lines: Iterable of text lines.
        max_size: Lines up to this many characters are kept as they are;
            longer lines are re-chunked along sentence boundaries.

    Returns:
        A new list of lines. Lines shorter than ``MIN_CHAR`` are removed.
        For a line longer than ``max_size``, its sentences (as segmented by
        the module-level spaCy pipeline ``nlp``) are joined with single
        spaces, and a chunk is emitted as soon as it grows past ``max_size``.
        The leftover sentences at the end of such a line are kept only if
        they reach ``MIN_CHAR`` characters.
    """
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        # Sentence splitting is expensive, so only the first MAX_CHAR
        # characters of the line are analysed; the rest is discarded.
        doc = nlp(line[:MAX_CHAR])
        stack = ""
        for sent in doc.sents:
            if stack:
                stack += " "
            stack += str(sent).strip()
            if len(stack) > max_size:
                out_lines.append(stack)
                stack = ""

        # Apply the same threshold as for whole lines above (>= MIN_CHAR), so
        # a remainder of exactly MIN_CHAR characters is not silently dropped.
        if len(stack) >= MIN_CHAR:
            out_lines.append(stack)

    return out_lines


def html2lines(page):
    """Extract the main text of an HTML page and split it into lines.

    Args:
        page: HTML content to extract text from; may be ``None``.

    Returns:
        The text produced by ``trafilatura.extract`` split on newlines, or an
        empty list when *page* is ``None``, blank, or yields no text.
    """
    # Test for None first: calling .strip() on None would raise
    # AttributeError before the None check could ever run.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Caches are cleared after every extraction.
    # NOTE(review): presumably to bound memory across many pages — confirm.
    reset_caches()

    if text is None:
        return []

    # The whole page is returned line by line; callers reformat it later.
    return text.split("\n")