Final_Assignment_Template / wikipedia_tables_parser.py
Markiian Tsalyk
LlamaIndex agent
12c47a4
import pandas as pd
import requests
from bs4 import BeautifulSoup
def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
) -> list[pd.DataFrame]:
    """
    Download every table found at a Wikipedia URL, never raising to the caller.

    Parameters:
    -----------
    url : str
        Address of the Wikipedia page to read tables from.
    handle_special_chars : bool, default True
        When true, each extracted table is passed through ``_clean_table``
        before being returned.

    Returns:
    --------
    list of pd.DataFrame
        One DataFrame per extracted table. An empty list is returned when no
        table was found or when any exception occurred (a message is printed
        in both cases).
    """
    try:
        frames = _fetch_tables_with_bs4(url)
        if handle_special_chars:
            # Normalise special characters and formatting in every table
            frames = [_clean_table(frame) for frame in frames]
        if not frames:
            print(f"No tables found at {url}")
            return []
        return frames
    except Exception as e:
        # Best-effort API: report the problem and fall back to "no tables"
        print(f"Error fetching tables: {e}")
        return []
# Upper bound (seconds) for the HTTP request so a stalled connection cannot
# block the caller indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _wikitable_to_frame(table) -> "pd.DataFrame | None":
    """Convert one parsed ``<table>`` element into a DataFrame, or None if it has no data rows."""
    rows = table.find_all("tr")
    if not rows:
        return None

    # Headers come from the first row only. Collecting every <th> in the
    # table would also pick up row-header cells from body rows and make the
    # header count disagree with the data width.
    headers = [cell.text.strip() for cell in rows[0].find_all(["th", "td"])]

    data = []
    for row in rows[1:]:
        row_data = [cell.text.strip() for cell in row.find_all(["td", "th"])]
        if row_data:  # Skip empty rows
            data.append(row_data)
    if not data:
        return None

    # Compare against the widest row, not just the first one: a single row
    # wider than the header list would make the DataFrame constructor raise.
    width = max(len(row_data) for row_data in data)
    if headers and len(headers) == width:
        return pd.DataFrame(data, columns=headers)
    # Header/data mismatch (e.g. merged cells): fall back to positional columns.
    return pd.DataFrame(data)


def _fetch_tables_with_bs4(url: str) -> list[pd.DataFrame]:
    """
    Fetch a page and extract its ``wikitable`` tables using BeautifulSoup.

    The page is downloaded with ``requests`` (bounded by
    ``_REQUEST_TIMEOUT_SECONDS``), parsed with the ``html.parser`` backend, and
    every ``<table class="wikitable">`` is converted to a DataFrame of stripped
    cell text. The first row of each table supplies the column names when its
    cell count matches the widest data row; otherwise default integer column
    labels are used. Tables without any data row are skipped.

    Parameters:
    -----------
    url : str
        The URL to download.

    Returns:
    --------
    list of pd.DataFrame
        The extracted tables. An empty list is returned (and a message
        printed) if the request or the parsing raises.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        tables = []
        for table in soup.find_all("table", {"class": "wikitable"}):
            frame = _wikitable_to_frame(table)
            if frame is not None:
                tables.append(frame)
        return tables
    except Exception as e:
        # Deliberate best-effort: report and return no tables rather than raise.
        print(f"Error in BeautifulSoup fallback: {e}")
        return []
def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
"""Clean a table by handling special characters and formatting issues."""
# Make a copy to avoid modifying the original
df = df.copy()
# Handle all string columns
for col in df.columns:
if df[col].dtype == "object":
# Replace common problematic characters
df[col] = df[col].astype(str).str.replace(";", "", regex=False)
df[col] = df[col].str.replace("−", "-", regex=False) # Replace minus sign
df[col] = df[col].str.replace(
"\xa0", " ", regex=False
) # Replace non-breaking space
df[col] = df[col].str.replace("\n", " ", regex=False) # Replace newlines
df[col] = df[col].str.strip() # Strip whitespace
# Remove reference tags like [1], [2], etc.
df[col] = df[col].str.replace(r"\[\d+\]", "", regex=True)
return df