import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from typing import Tuple, List


def load_data(filepath: str) -> pd.DataFrame:
    """Load a dataset from a CSV file.

    Args:
        filepath: Path to the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed contents as a pandas DataFrame.
    """
    return pd.read_csv(filepath)


def extract_features(df: pd.DataFrame, feature_cols: List[str]) -> np.ndarray:
    """Select the given columns and return them as a NumPy array.

    No dtype conversion or validation is performed here; the resulting
    array has whatever dtype ``DataFrame.to_numpy`` picks for the
    selected columns.

    Args:
        df: Input DataFrame.
        feature_cols: Column names to use as features, in output order.

    Returns:
        2D NumPy array with one row per DataFrame row and one column per
        entry of ``feature_cols``.
    """
    return df[feature_cols].to_numpy()


def fit_kmeans(
    X: np.ndarray,
    n_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray]:
    """Fit KMeans on ``X`` and return the labels and centroids.

    Args:
        X: Feature matrix.
        n_clusters: Number of clusters.
        random_state: Random seed for reproducibility.

    Returns:
        Tuple of ``(labels, centers)``: the cluster index assigned to each
        row of ``X`` and the fitted ``cluster_centers_`` array.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(X)
    return labels, kmeans.cluster_centers_


def calculate_wcss(
    X: np.ndarray,
    max_clusters: int = 10,
    random_state: int = 42,
) -> List[float]:
    """Compute the within-cluster sum of squares for k = 1..max_clusters.

    A separate KMeans model is fitted for every ``k`` and its ``inertia_``
    is recorded.

    Args:
        X: Feature matrix.
        max_clusters: Largest number of clusters to evaluate (inclusive).
            Values below 1 produce an empty list.
        random_state: Random seed used for every fit. Defaults to 42, the
            value this function previously hard-coded, and mirrors the
            ``random_state`` parameter of ``fit_kmeans``.

    Returns:
        List of inertia values, where element ``i`` belongs to
        ``k = i + 1``.
    """
    # KMeans.fit returns the fitted estimator, so inertia_ can be read
    # directly off the call result.
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(X).inertia_
        for k in range(1, max_clusters + 1)
    ]