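"""Utilities for scraping "wikitable" tables from Wikipedia pages into pandas DataFrames."""
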
import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
) -> list[pd.DataFrame]:
    """
    Fetch tables from a Wikipedia URL with robust error handling.

    Parameters:
    -----------
    url : str
        The Wikipedia URL to fetch tables from.
    handle_special_chars : bool, default True
        Whether to clean special characters in data before parsing.

    Returns:
    --------
    list of pd.DataFrame
        A list of pandas DataFrames containing the tables found on the page.
    """
    try:
        all_tables = _fetch_tables_with_bs4(url)

        if handle_special_chars:
            # Clean tables to handle special characters and formatting issues
            all_tables = [_clean_table(table) for table in all_tables]

        if all_tables:
            return all_tables
        else:
            print(f"No tables found at {url}")
            return []
    except Exception as e:
        print(f"Error fetching tables: {e}")
        return []


def _fetch_tables_with_bs4(url: str) -> list[pd.DataFrame]:
    """Fetch "wikitable" tables from a page using requests and BeautifulSoup."""
    try:
        # Wikipedia asks clients to send a descriptive User-Agent, and a
        # timeout keeps a stalled connection from hanging the caller
        response = requests.get(
            url,
            headers={"User-Agent": "wikipedia-table-fetcher/0.1"},
            timeout=30,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        tables = []

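        # Wikipedia marks its data tables with the "wikitable" CSS class,
        # so this loop skips layout tables such as infoboxes and navboxes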
        for table in soup.find_all("table", {"class": "wikitable"}):
            data = []
            headers = []

            # Extract headers from the first row only; scanning every <th>
            # in the table would also pick up row-header cells from the body
            first_row = table.find("tr")
            if first_row:
                for cell in first_row.find_all(["th", "td"]):
                    headers.append(cell.text.strip())

            # Extract data rows, skipping the header row when one was found
            body_rows = table.find_all("tr")[1:] if headers else table.find_all("tr")
            for row in body_rows:
                row_data = []
                for cell in row.find_all(["td", "th"]):
                    row_data.append(cell.text.strip())
                if row_data:  # Skip rows with no cells
                    data.append(row_data)

            # Create DataFrame
            if data:
                if headers and len(headers) == len(data[0]):
                    df = pd.DataFrame(data, columns=headers)
                else:
                    df = pd.DataFrame(data)
                tables.append(df)

        return tables
    except Exception as e:
        print(f"Error in BeautifulSoup fallback: {e}")
        return []


def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a table by handling special characters and formatting issues."""
    # Make a copy to avoid modifying the original
    df = df.copy()

    # Handle all string columns
    for col in df.columns:
        if df[col].dtype == "object":
            # Replace common problematic characters
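            # astype(str) keeps the .str accessor safe on mixed-type columns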
            df[col] = df[col].astype(str).str.replace(";", "", regex=False)
            df[col] = df[col].str.replace("−", "-", regex=False)  # Replace minus sign
            df[col] = df[col].str.replace(
                "\xa0", " ", regex=False
            )  # Replace non-breaking space
            df[col] = df[col].str.replace("\n", " ", regex=False)  # Replace newlines
            df[col] = df[col].str.strip()  # Strip whitespace

            # Remove reference tags like [1], [2], etc.
            df[col] = df[col].str.replace(r"\[\d+\]", "", regex=True)

    return df
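

# A minimal usage sketch. The URL below is only an illustrative example of a
# page containing "wikitable" tables; substitute any Wikipedia article URL.
if __name__ == "__main__":
    example_url = (
        "https://en.wikipedia.org/wiki/"
        "List_of_countries_by_population_(United_Nations)"
    )
    tables = fetch_wikipedia_tables(example_url)
    print(f"Fetched {len(tables)} table(s)")
    for i, df in enumerate(tables):
        print(f"Table {i}: {df.shape[0]} rows x {df.shape[1]} columns")
        print(df.head())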