# First creating Document reader
import re
from typing import List, Tuple, Union

from bs4 import BeautifulSoup
from langchain.docstore.document import Document as LangchainDocument


def read_wiki_html(
    filename: str,
) -> Union[Tuple[List[LangchainDocument], List[LangchainDocument]], str]:
    """Read a saved Wikipedia HTML page and build two small knowledge bases.

    Parses the file with BeautifulSoup, takes the text content of the
    ``<body>`` tag as one document, and turns every ``<figure>`` tag into a
    caption document whose first ``<a>`` href is kept as metadata.

    Args:
        filename: Path to the HTML file on disk.

    Returns:
        On success, a tuple ``(TEXT_KB, FIG_KB)`` where ``TEXT_KB`` is a
        one-element list holding the body text as a LangchainDocument and
        ``FIG_KB`` is a list of figure-caption LangchainDocuments (each with
        ``metadata["url"]`` set to the figure's link, or ``'No href'``).
        On failure, an ``"Error: ..."`` string — kept instead of raising for
        backward compatibility with existing callers.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        # Parse the HTML content
        soup = BeautifulSoup(content, 'html.parser')

        # Focus only on the <body> tag
        body = soup.body
        if body is None:
            return "Error: No <body> tag found in the HTML file."

        # Collapse the blank-line runs that get_text(separator="\n") leaves behind.
        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
        TEXT_KB = [LangchainDocument(page_content=body_text)]

        # Extract all <figure> tags with their href and figcaption
        FIG_KB = []
        for figure in body.find_all('figure'):
            # Query each child tag once instead of twice per figure.
            anchor = figure.find('a')
            href = anchor.get('href', 'No href') if anchor else 'No href'
            caption = figure.find('figcaption')
            figcaption = caption.get_text(strip=True) if caption else 'No figcaption'
            FIG_KB.append(
                LangchainDocument(page_content=figcaption, metadata={"url": href})
            )

        return (TEXT_KB, FIG_KB)
    except FileNotFoundError:
        # Bug fix: the f-string previously contained no placeholder, so the
        # offending filename was never reported to the caller.
        return f"Error: File '{filename}' not found."
    except Exception as e:
        # Deliberate best-effort: callers expect an error string, not a raise.
        return f"Error: {str(e)}"


if __name__ == "__main__":
    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")