import requests
from bs4 import BeautifulSoup, Tag
from smolagents import Tool


class WikipediaParser(Tool):
    name: str = "wikipedia_parser_tool"
    description: str = (
        "This tool parses a Wikipedia page into a clean, readable text format."
    )
    inputs: dict[str, dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The Wikipedia page URL.",
        }
    }
    output_type: str = "string"

    def get_wikipedia_page(self, url: str) -> str:
        """
        Fetches a Wikipedia page and extracts its readable text content.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: The extracted text content of the page.
        """
        # A browser-like User-Agent avoids being rejected as an anonymous script.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
            )
        }
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        content_div = soup.find("div", id="mw-content-text")
        if not content_div:
            return "Content not found."

        elements: list[str] = []
        h_tags: list[str] = [f"h{i}" for i in range(1, 6)]
        extra_tags: list[str] = ["p", "ul", "ol"]
        # "table" must be included in the search list, otherwise the table
        # branch below is unreachable.
        html_tags: list[str] = h_tags + extra_tags + ["table"]

        for elem in content_div.find_all(html_tags):
            if elem.name in h_tags:
                elements.append("\n\n" + elem.get_text(strip=True) + "\n")
            elif elem.name in extra_tags:
                elements.append(elem.get_text(strip=True))
            elif elem.name == "table":
                elements.append(self.parse_wikipedia_table(elem))

        return "\n\n".join(elements)

    def parse_wikipedia_table(self, table: Tag) -> str:
        """
        Parses a Wikipedia table into a clean, readable text format.

        Args:
            table (Tag): BeautifulSoup Tag for the table.

        Returns:
            str: Formatted table as readable text.
        """
        rows: list[str] = []
        headers: list[str] = []

        # Emit the header cells first when the table declares an explicit <thead>.
        thead = table.find("thead")
        if thead:
            for th in thead.find_all("th"):
                header_text = th.get_text(separator=" ", strip=True)
                headers.append(header_text)
            if headers:
                rows.append(" | ".join(headers))

        # Most Wikipedia tables keep their rows in <tbody>; fall back to the
        # table itself if it is absent.
        tbody = table.find("tbody")
        if not tbody:
            tbody = table

        for tr in tbody.find_all("tr"):
            cells = tr.find_all(["th", "td"])
            cell_texts = []
            for cell in cells:
                # Drop citation superscripts such as "[1]" before extracting text.
                for sup in cell.find_all("sup", class_="reference"):
                    sup.decompose()

                text = cell.get_text(separator=" ", strip=True)
                cell_texts.append(text)

            if cell_texts:
                row_text = " | ".join(cell_texts)
                rows.append(row_text)

        return "\n".join(rows)

    def forward(self, url: str) -> str:
        """
        Parses the Wikipedia page and returns the content as a string.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: The parsed content of the page.
        """
        return self.get_wikipedia_page(url)
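

# A minimal usage sketch, not part of the tool itself: it assumes network
# access, and the article URL is an arbitrary example.
if __name__ == "__main__":
    tool = WikipediaParser()
    text = tool.forward("https://en.wikipedia.org/wiki/Web_scraping")
    print(text[:500])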