# Final_Assignment_Template/tools/parse_wikipedia_table.py
from smolagents import Tool
import requests
from bs4 import BeautifulSoup, Tag
class WikipediaParser(Tool):
    """smolagents Tool that fetches a Wikipedia page and renders it as clean text.

    Headings, paragraphs, lists, and tables from the main content area
    (``div#mw-content-text``) are flattened into a readable plain-text string.
    """

    name: str = "wikipedia_parser_tool"
    description: str = (
        "This tool parses a Wikipedia page into a clean, readable text format."
    )
    inputs: dict[str, dict[str, str]] = {
        "url": {
            "type": "string",
            "description": "The Wikipedia page url.",
        }
    }
    output_type: str = "string"

    def get_wikipedia_page(self, url: str) -> str:
        """
        Fetches a Wikipedia page and converts its main content to readable text.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: Readable text extracted from the page, or "Content not found."
                 when the main content div is missing.

        Raises:
            requests.HTTPError: If the HTTP request fails.
        """
        # A browser-like User-Agent avoids Wikipedia rejecting the request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
        }
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # The article body lives in div#mw-content-text on Wikipedia pages.
        content_div = soup.find("div", id="mw-content-text")
        if not content_div:
            return "Content not found."

        elements: list[str] = []
        # h1-h6: range upper bound is exclusive, so range(1, 7) covers all levels.
        h_tags: list[str] = [f"h{i}" for i in range(1, 7)]
        extra_tags: list[str] = ["p", "ul", "ol"]
        # BUGFIX: "table" must be in the search list, otherwise the table
        # branch below is unreachable and tables are silently dropped.
        html_tags: list[str] = h_tags + extra_tags + ["table"]
        for elem in content_div.find_all(html_tags):
            if elem.name in h_tags:
                # Surround headings with blank lines so sections stand out.
                elements.append("\n\n" + elem.get_text(strip=True) + "\n")
            elif elem.name in extra_tags:
                elements.append(elem.get_text(strip=True))
            elif elem.name == "table":
                elements.append(self.parse_wikipedia_table(elem))
        return "\n\n".join(elements)

    def parse_wikipedia_table(self, table: Tag) -> str:
        """
        Parses a Wikipedia table into a clean, readable text format.

        Args:
            table (Tag): BeautifulSoup Tag for the table.

        Returns:
            str: Formatted table as readable text, one " | "-joined row per line.
        """
        # BUGFIX: the method was missing `self`, so the bound call
        # `self.parse_wikipedia_table(elem)` would raise a TypeError.
        rows: list[str] = []
        headers: list[str] = []

        # Try to get headers from an explicit <thead>, if present.
        thead = table.find("thead")
        if thead:
            for th in thead.find_all("th"):
                header_text = th.get_text(separator=" ", strip=True)
                headers.append(header_text)
        if headers:
            rows.append(" | ".join(headers))

        # Parse table body rows.
        tbody = table.find("tbody")
        if not tbody:
            tbody = table  # fallback: some tables have no tbody explicitly
        for tr in tbody.find_all("tr"):
            cells = tr.find_all(["th", "td"])
            cell_texts = []
            for cell in cells:
                # Clean references like [7], [note 1], etc.
                for sup in cell.find_all("sup", class_="reference"):
                    sup.decompose()
                text = cell.get_text(separator=" ", strip=True)
                cell_texts.append(text)
            if cell_texts:
                rows.append(" | ".join(cell_texts))
        return "\n".join(rows)

    def forward(self, url: str) -> str:
        """
        Tool entry point: parses the Wikipedia page at `url`.

        Args:
            url (str): The URL of the Wikipedia page.

        Returns:
            str: The parsed content of the page.
        """
        return self.get_wikipedia_page(url)