Spaces:
Runtime error
Runtime error
"""HTML parser. | |
Contains parser for html files. | |
""" | |
import re | |
from pathlib import Path | |
from typing import Dict, Union | |
from abc import abstractmethod | |
from pathlib import Path | |
from typing import Dict, List, Optional, Union | |
class BaseParser:
    """Common scaffolding shared by the file parsers.

    Keeps an optional configuration dict and declares the two hooks,
    ``_init_parser`` and ``parse_file``, that concrete parsers override.
    """

    def __init__(self, parser_config: Optional[Dict] = None):
        """Remember the (optional) parser configuration."""
        self._parser_config = parser_config

    def init_parser(self) -> None:
        """Build the configuration via ``_init_parser`` and store it."""
        self._parser_config = self._init_parser()

    def parser_config_set(self) -> bool:
        """Return True when a configuration is currently stored."""
        return not (self._parser_config is None)

    def parser_config(self) -> Dict:
        """Return the stored configuration.

        Raises:
            ValueError: If no configuration is stored.
        """
        config = self._parser_config
        if config is not None:
            return config
        raise ValueError("Parser config not set.")

    def _init_parser(self) -> Dict:
        """Hook for subclasses: build and return the parser configuration.

        The base implementation has no body, so it returns None.
        """

    def parse_file(
        self, file: Path, errors: str = "ignore"
    ) -> Union[str, List[str]]:
        """Hook for subclasses: parse ``file`` and return its text.

        The base implementation has no body, so it returns None.
        """
class HTMLParser(BaseParser):
    """HTML parser.

    Splits an HTML file into elements with the ``unstructured`` package,
    normalises the text of every element and groups the elements into
    chunks: each ``Title`` element starts a new chunk that also holds all
    elements up to the next ``Title``.
    """

    # Chunks whose strings add up to fewer characters than this are dropped
    # from the result of ``parse_file``. Override on a subclass or instance
    # to tune the filter.
    min_chunk_length: int = 25

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(
        self, file: Path, errors: str = "ignore"
    ) -> Union[str, List[str]]:
        """Parse an HTML file into a list of text chunks.

        Args:
            file: Path of the HTML file; it is decoded as UTF-8.
            errors: Decoding error handler forwarded to ``open``.

        Returns:
            Union[str, List[str]]: a list holding one string per retained
            chunk (the chunk's element texts joined by single spaces).

        Raises:
            ValueError: If the ``unstructured`` package cannot be imported.
        """
        try:
            from unstructured.cleaners.core import clean
            from unstructured.partition.html import partition_html
            from unstructured.staging.base import convert_to_isd
        except ImportError as err:
            raise ValueError(
                "unstructured package is required to parse HTML files."
            ) from err

        # Using the unstructured library to convert the html to isd format
        # isd sample : isd = [
        #     {"text": "My Title", "type": "Title"},
        #     {"text": "My Narrative", "type": "NarrativeText"}
        # ]
        with open(file, "r", encoding="utf-8", errors=errors) as fp:
            elements = partition_html(file=fp)
        isd = convert_to_isd(elements)

        for isd_el in isd:
            text = self._normalize_text(isd_el["text"])
            # clean() returns the cleaned string, so the result has to be
            # stored for the cleaning to have any effect.
            isd_el["text"] = clean(
                text,
                extra_whitespace=True,
                dashes=True,
                bullets=True,
                trailing_punctuation=True,
            )

        return self._build_chunks(isd, self.min_chunk_length)

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Drop non-ASCII characters, newlines and runs of whitespace."""
        text = text.encode("ascii", "ignore").decode()
        text = text.replace("\n", " ")
        return re.sub(r"\s{2,}", " ", text)

    @staticmethod
    def _build_chunks(isd: List[Dict], min_length: int) -> List[str]:
        """Group element texts by title and drop groups that are too short.

        Every element whose ``type`` is ``"Title"`` opens a new group; the
        elements before the first title form a leading group of their own.
        A group is kept when the lengths of its strings sum to at least
        ``min_length``.
        """
        groups: List[List[str]] = [[]]
        for isd_el in isd:
            if isd_el["type"] == "Title":
                groups.append([])
            groups[-1].append(isd_el["text"])

        # Filter into a new list instead of removing from the list being
        # iterated: removing in place makes the loop skip the group that
        # follows every removed one.
        return [
            " ".join(str(text) for text in group)
            for group in groups
            if sum(len(str(text)) for text in group) >= min_length
        ]