Spaces:

QAI
/

Chat_QnA_v2

Runtime error

App Files Files Community

Chat_QnA_v2 / html_parser.py

binh99

update new

d037cdf over 1 year ago

raw

history blame

4.34 kB

	"""HTML parser.

	Contains parser for html files.

	"""
	import re
	from pathlib import Path
	from typing import Dict, Union
	from abc import abstractmethod
	from pathlib import Path
	from typing import Dict, List, Optional, Union


	class BaseParser:
	    """Base class for all parsers.

	    A parser optionally carries a config dict. The config is either handed
	    to the constructor or built later by ``init_parser``, which delegates to
	    the subclass hook ``_init_parser``.

	    NOTE: this class does not use ``abc.ABCMeta``, so the ``@abstractmethod``
	    markers below are not enforced at instantiation time; a hook that is not
	    overridden simply returns ``None``.
	    """

	    def __init__(self, parser_config: Optional[Dict] = None):
	        """Store the parser config, which may be ``None`` until initialised.

	        Args:
	            parser_config: a ready-made config, or ``None`` to build it
	                later with ``init_parser``.
	        """
	        self._parser_config = parser_config

	    def init_parser(self) -> None:
	        """Build the config with ``_init_parser`` and store it."""
	        self._parser_config = self._init_parser()

	    @property
	    def parser_config_set(self) -> bool:
	        """Return ``True`` when a parser config is currently stored."""
	        return self._parser_config is not None

	    @property
	    def parser_config(self) -> Dict:
	        """Return the stored parser config.

	        Raises:
	            ValueError: if no config has been set yet.
	        """
	        config = self._parser_config
	        if config is None:
	            raise ValueError("Parser config not set.")
	        return config

	    @abstractmethod
	    def _init_parser(self) -> Dict:
	        """Initialize the parser and return its config (subclass hook)."""

	    @abstractmethod
	    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
	        """Parse ``file`` into a string or a list of strings (subclass hook).

	        Args:
	            file: path of the file to parse.
	            errors: decoding error policy for implementations that read text.
	        """

	class HTMLParser(BaseParser):
	    """HTML parser that groups page text into title-led chunks.

	    The file is partitioned with the ``unstructured`` package. Every element
	    of type ``Title`` opens a new chunk, and the elements that follow it (up
	    to the next title) are appended to that chunk.
	    """

	    # Chunks whose combined text length is below this are discarded.
	    # TODO: This value can be an user defined variable
	    _MIN_CHUNK_LENGTH = 25

	    # Runs of two or more whitespace characters, collapsed to one space.
	    _EXTRA_WHITESPACE = re.compile(r"\s{2,}")

	    def _init_parser(self) -> Dict:
	        """Init parser.

	        Returns:
	            Dict: an empty config.
	        """
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
	        """Parse an HTML file into a list of text chunks.

	        Args:
	            file: path of the HTML file; it is read as UTF-8.
	            errors: decoding error policy passed to ``open`` (for example
	                ``"ignore"`` or ``"strict"``).

	        Returns:
	            Union[str, List[str]]: a list of strings, one per retained chunk.
	            Each string is the space-joined text of a title and the elements
	            under it.

	        Raises:
	            ValueError: if the ``unstructured`` package is not installed.
	        """
	        try:
	            from unstructured.cleaners.core import clean
	            from unstructured.partition.html import partition_html
	            from unstructured.staging.base import convert_to_isd
	        except ImportError as err:
	            raise ValueError(
	                "unstructured package is required to parse HTML files."
	            ) from err

	        # Using the unstructured library to convert the html to isd format
	        # isd sample : isd = [
	        #   {"text": "My Title", "type": "Title"},
	        #   {"text": "My Narrative", "type": "NarrativeText"}
	        # ]
	        # `errors` is forwarded so undecodable bytes follow the caller's policy
	        # instead of always raising.
	        with open(file, "r", encoding="utf-8", errors=errors) as fp:
	            elements = partition_html(file=fp)
	        isd = convert_to_isd(elements)

	        # Build the chunks: a list of lists of strings. The first list holds
	        # anything that appears before the first title.
	        chunks: List[List[str]] = [[]]
	        for element in isd:
	            # Drop non-ASCII characters.
	            text = element["text"].encode("ascii", "ignore").decode()
	            # Newlines become spaces, then whitespace runs collapse to one space.
	            text = text.replace("\n", " ")
	            text = self._EXTRA_WHITESPACE.sub(" ", text)
	            # clean() returns the cleaned string; it does not modify in place.
	            text = clean(
	                text,
	                extra_whitespace=True,
	                dashes=True,
	                bullets=True,
	                trailing_punctuation=True,
	            )
	            if element["type"] == "Title":
	                chunks.append([])
	            chunks[-1].append(text)

	        # Keep only chunks whose strings add up to at least the minimum length.
	        # A new list is built rather than removing from `chunks` while
	        # iterating it, which would skip the chunk after each removal.
	        return [
	            " ".join(part for part in chunk if part)
	            for chunk in chunks
	            if sum(len(part) for part in chunk) >= self._MIN_CHUNK_LENGTH
	        ]