| | |
| |
|
| | from ultradata_math_parser.utils import * |
| | from ultradata_math_parser.config import * |
| |
|
| |
|
class TitleParser:
    """Pick a title for a parsed HTML document.

    Candidates are gathered from three places: the XPath expressions listed
    in ``METAS``, the ``<title>`` element, and the ``<h1>``-``<h3>``
    headings. ``process`` is the entry point that combines them.

    ``METAS``, ``similarity2``, ``lcs_of_2`` and ``HtmlElement`` are not
    defined in this class; they are expected to be in module scope (the
    file star-imports the package's ``utils`` and ``config`` modules).
    """

    def extract_by_meta(self, element: HtmlElement):
        """Return the first non-blank title produced by the ``METAS`` XPaths.

        Each expression in ``METAS`` is evaluated in order against
        ``element``; the matched strings are concatenated and stripped.

        :param element: tree to query.
        :return: the stripped title, or ``None`` when no expression yields
            non-blank text.
        """
        for xpath in METAS:
            title = "".join(element.xpath(xpath)).strip()
            # A match that is empty or whitespace-only is not a usable
            # title; keep trying the remaining expressions.
            if title:
                return title
        return None

    def extract_by_title(self, element: HtmlElement):
        """Return the stripped text content of the ``<title>`` element(s).

        :param element: tree to query.
        :return: concatenated title text, or ``""`` when there is none.
        """
        return "".join(element.xpath("//title//text()")).strip()

    def extract_by_hs(self, element: HtmlElement):
        """Return every text node found under any ``<h1>``/``<h2>``/``<h3>``.

        :param element: tree to query.
        :return: list of text nodes (unstripped); empty list when there are
            no headings.
        """
        hs = element.xpath("//h1//text()|//h2//text()|//h3//text()")
        return hs or []

    def extract_by_h(self, element: HtmlElement):
        """Return the text of the most prominent heading that has text.

        Heading levels are tried in order ``h1``, ``h2``, ``h3``. For each
        level only the first matching element is inspected, and only its
        direct text nodes (``./text()``) are considered.

        :param element: tree to query.
        :return: the first non-blank direct text node, stripped, or ``None``
            when no inspected heading has one.
        """
        for xpath in ("//h1", "//h2", "//h3"):
            headings = element.xpath(xpath)
            if not headings:
                continue
            for text in headings[0].xpath("./text()"):
                stripped = text.strip()
                # Skip whitespace-only nodes (e.g. indentation before a
                # nested tag) instead of returning an empty title.
                if stripped:
                    return stripped
        return None

    def process(self, element: HtmlElement):
        """Return the best available title for ``element``.

        Order of preference:

        1. A title found through ``METAS``.
        2. ``lcs_of_2`` of the ``<title>`` text and the heading text that
           scores highest under ``similarity2`` against it (presumably the
           part the two strings have in common; confirm against the helper
           definitions).
        3. The ``<title>`` text itself.
        4. The heading text from ``extract_by_h``.

        :param element: tree to query.
        :return: the chosen title; may be ``None`` if nothing was found.
        """
        meta_title = self.extract_by_meta(element)
        if meta_title:
            return meta_title

        page_title = self.extract_by_title(element)
        heading_texts = self.extract_by_hs(element)
        if heading_texts:
            # max() yields the first element with the highest score, which
            # is the same element a stable descending sort would put first.
            best_heading = max(
                heading_texts,
                key=lambda text: similarity2(text, page_title),
            )
            common = lcs_of_2(best_heading, page_title)
            # Only accept a non-empty overlap; otherwise fall through to
            # the remaining candidates instead of returning an empty title.
            if common:
                return common

        if page_title:
            return page_title

        # Last resort, evaluated only when every other candidate is empty.
        return self.extract_by_h(element)
| |
|