|
import sys |
|
from time import sleep |
|
import trafilatura |
|
from trafilatura.meta import reset_caches |
|
from trafilatura.settings import DEFAULT_CONFIG |
|
import spacy |
|
|
|
|
|
# spaCy pipeline loaded once at import time; line_correction() runs it on long
# lines and iterates over the resulting sentences.
nlp = spacy.load("en_core_web_lg")

# NOTE(review): this assigns a plain Python attribute on trafilatura's shared
# config object. If DEFAULT_CONFIG is a configparser.ConfigParser, options are
# normally changed with DEFAULT_CONFIG.set("DEFAULT", "MAX_FILE_SIZE", "..."),
# so this line may have no effect on trafilatura -- confirm before relying on it.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000

# Lines shorter than MIN_CHAR characters are discarded by line_correction().
MIN_CHAR = 50
# Only the first MAX_CHAR characters of a long line are passed to spaCy.
MAX_CHAR = 5000
|
|
|
|
|
def get_page(url, retries=3, delay=3):
    """Download *url* with trafilatura, retrying when an attempt fails.

    An attempt counts as failed when ``trafilatura.fetch_url`` raises or
    returns ``None``.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of download attempts (default 3).
        delay: Seconds to wait between two consecutive attempts (default 3).

    Returns:
        The value returned by ``trafilatura.fetch_url`` on the first
        successful attempt, or ``None`` if every attempt failed.
    """
    for attempt in range(retries):
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best effort: a download error is just a failed attempt. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            page = None

        # Explicit check instead of `assert`, which is stripped under `-O`.
        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page

        # Waiting after the final attempt would only delay the caller.
        if attempt < retries - 1:
            sleep(delay)

    return None
|
|
|
|
|
def url2lines(url):
    """Fetch *url* and return the lines of text extracted from the page.

    Args:
        url: Address of the page to download via ``get_page``.

    Returns:
        The list produced by ``html2lines`` for the downloaded page, or an
        empty list when ``get_page`` returned ``None``.
    """
    html = get_page(url)
    return [] if html is None else html2lines(html)
|
|
|
|
|
def _sentence_chunks(text, max_size):
    """Yield groups of consecutive sentences from *text*.

    Sentences found by the module-level ``nlp`` pipeline are stripped and
    joined with single spaces; a group is emitted as soon as it grows longer
    than *max_size* characters. The final, unfinished group is emitted only
    if it is strictly longer than ``MIN_CHAR`` characters.
    """
    buffer = ""
    for sentence in nlp(text).sents:
        piece = str(sentence).strip()
        buffer = buffer + " " + piece if buffer else piece
        if len(buffer) > max_size:
            yield buffer
            buffer = ""

    # Strict comparison here, whereas whole lines of exactly MIN_CHAR
    # characters are kept by line_correction().
    if len(buffer) > MIN_CHAR:
        yield buffer


def line_correction(lines, max_size=100):
    """Drop very short lines and break very long ones into sentence groups.

    Args:
        lines: Iterable of text lines (strings).
        max_size: Lines longer than this many characters are re-segmented
            into sentence groups; shorter ones are kept unchanged.

    Returns:
        A new list of lines. Lines shorter than ``MIN_CHAR`` are omitted.
        For a line longer than *max_size*, only its first ``MAX_CHAR``
        characters are sentence-split; anything beyond that is discarded.
    """
    result = []
    for text in lines:
        if len(text) < MIN_CHAR:
            continue

        if len(text) <= max_size:
            result.append(text)
        else:
            # Truncate before running spaCy so only MAX_CHAR characters are parsed.
            result.extend(_sentence_chunks(text[:MAX_CHAR], max_size))

    return result
|
|
|
|
|
def html2lines(page):
    """Extract the text of an HTML document as a list of lines.

    Args:
        page: HTML source as a string, or ``None``.

    Returns:
        The text returned by ``trafilatura.extract`` split on newline
        characters. An empty list is returned when *page* is ``None``, empty
        or whitespace-only, or when ``trafilatura.extract`` returns ``None``.
    """
    # Test for None first: calling .strip() on None raises AttributeError.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    # Reset trafilatura's caches after each extraction (presumably to limit
    # memory growth when many pages are processed -- confirm).
    reset_caches()

    if text is None:
        return []

    return text.split("\n")
|
|