# Step 1: create the document reader.
import re
from typing import List, Tuple, Union

from bs4 import BeautifulSoup
from langchain.docstore.document import Document as LangchainDocument
def read_wiki_html(
    filename: str,
) -> Union[Tuple[List["LangchainDocument"], List["LangchainDocument"]], str]:
    """Read an HTML file and split it into text and figure documents.

    The text of the ``<body>`` tag becomes a single document, with every run
    of consecutive newlines collapsed into one. Each ``<figure>`` tag inside
    the body becomes one document whose content is the text of its
    ``<figcaption>`` and whose ``url`` metadata is the ``href`` of the first
    ``<a>`` tag found inside the figure.

    Args:
        filename: Path to the HTML file. The file is read as UTF-8.

    Returns:
        On success, a ``(text_kb, fig_kb)`` tuple:
            text_kb: list holding one document with the body text.
            fig_kb: list with one document per ``<figure>``. A figure without
                a link gets the url ``'No href'``; a figure without a caption
                gets the content ``'No figcaption'``.
        On failure, a string starting with ``"Error: "`` that describes the
        problem (missing file, missing ``<body>``, or any other exception).
        No exception is raised, so callers must check for a string before
        unpacking the result.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Only the <body> is of interest; <head> content is ignored.
        body = soup.body
        if body is None:
            return "Error: No <body> tag found in the HTML file."

        # get_text() emits a newline between every text node, which produces
        # long runs of blank lines; squeeze each run down to a single newline.
        raw_text = body.get_text(separator="\n").strip()
        body_text = re.sub(r'\n+', '\n', raw_text)
        text_kb = [LangchainDocument(page_content=body_text)]

        fig_kb = []
        for figure in body.find_all('figure'):
            # Look each child tag up once and reuse the result.
            link = figure.find('a')
            caption = figure.find('figcaption')

            href = link.get('href', 'No href') if link is not None else 'No href'
            if caption is not None:
                figcaption = caption.get_text(strip=True)
            else:
                figcaption = 'No figcaption'

            fig_kb.append(
                LangchainDocument(page_content=figcaption, metadata={"url": href})
            )

        return (text_kb, fig_kb)
    except FileNotFoundError:
        return f"Error: File '{filename}' not found."
    except Exception as e:
        # Deliberately broad: existing callers expect an error string rather
        # than an exception for any read or parse failure.
        return f"Error: {str(e)}"
if __name__ == "__main__":
    # Manual smoke run against a locally saved Wikipedia page. The result is
    # either a (text documents, figure documents) tuple or an error string.
    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
    # read_pdf()