ZhouChuYue
init
a579dd2
# -*- coding:utf-8 -*-
import re
from ultradata_math_parser.utils import *
from ultradata_math_parser.parsers.base_parser import BaseParser
from ultradata_math_parser.parsers.title_parser import TitleParser
class CustomParser(BaseParser):
    """Parser that extracts a page's title and body using caller-supplied XPath rules.

    The ``rule`` mapping passed to :meth:`extract` may contain:

    * ``"clean"``   -- iterable of XPath expressions; every matched node is removed.
    * ``"title"``   -- mapping with a ``"value"`` XPath used to extract the title
      (optional; ``TitleParser`` is used when absent).
    * ``"content"`` -- mapping with a ``"value"`` XPath selecting the body node
      (required).
    """

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by any XPath in ``clean_rules``.

        Nodes are removed through ``self.remove_node``; the same ``tree``
        object is returned.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate ``extract_rule["value"]`` against ``tree``.

        When the XPath contains ``/text()`` the matched results are joined
        into a single stripped string. Otherwise the first match is returned.

        Raises:
            IndexError: if a non-text XPath matches nothing.
            KeyError: if ``extract_rule`` has no ``"value"`` entry.
        """
        xpath_expr = extract_rule["value"]
        if "/text()" in xpath_expr:
            return "".join(tree.xpath(xpath_expr)).strip()
        return tree.xpath(xpath_expr)[0]

    def extract(self, html="", base_url="", rule=None, **kwargs) -> dict:
        """Extract the title and body HTML from ``html`` according to ``rule``.

        Args:
            html: Raw HTML source.
            base_url: Fallback base URL; replaced by the document's
                ``<base href>`` when that value contains ``"http"``.
            rule: Extraction rule mapping (see the class docstring).
                ``None`` is treated as an empty mapping.
            **kwargs: ``include_images`` (default ``False``) is stored on
                ``self.include_images``; when falsy,
                ``self._remove_images_from_tree`` is applied to the body node.

        Returns:
            A dict with the keys ``xp_num``, ``drop_list``, ``html``,
            ``title``, ``base_url`` and ``text_length``.

        Raises:
            ValueError: if the HTML cannot be loaded, or the content rule is
                missing or fails to select a body node.
        """
        # Avoid a shared mutable default: build a fresh mapping per call.
        if rule is None:
            rule = {}

        self.include_images = kwargs.get("include_images", False)

        tree = load_html(html)
        if tree is None:
            raise ValueError("could not load html into a tree")

        # base_url: prefer the document's own <base href> when it looks absolute.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        # Extract the title.
        if "title" not in rule:
            title = TitleParser().process(tree)
        else:
            title = self.use_extract_rule(tree, rule["title"])

        # Article body region.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as exc:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit propagate; keep the original cause for debugging.
            raise ValueError("content rule is missing or matched nothing") from exc

        if not self.include_images:
            self._remove_images_from_tree(body_tree)

        body_html = tostring(body_tree, encoding=str)
        # NOTE(review): this call is unconditional in the original source;
        # confirm that _strip_images_from_html honours self.include_images.
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }