# Customer-Segmentation/src/clustering.py
# Uploaded by SURESHBEEKHANI ("Upload 6 files", commit 874b2d8, verified).
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from typing import Tuple, List
def load_data(filepath: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    The file is parsed with ``pandas.read_csv`` using its default options.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        The parsed contents as a pandas DataFrame.
    """
    frame = pd.read_csv(filepath)
    return frame
def extract_features(df: pd.DataFrame, feature_cols: List[str]) -> np.ndarray:
    """Return the selected DataFrame columns as a NumPy array.

    Args:
        df: DataFrame holding the data.
        feature_cols: Names of the columns to keep, in the order they
            should appear in the result.

    Returns:
        A 2D array with one row per DataFrame row and one column per
        entry in ``feature_cols``.
    """
    selected = df[feature_cols]
    matrix = selected.to_numpy()
    return matrix
def fit_kmeans(
    X: np.ndarray,
    n_clusters: int,
    random_state: int = 42
) -> Tuple[np.ndarray, np.ndarray]:
    """Cluster ``X`` with KMeans and report the assignments and centers.

    Args:
        X: Feature matrix to cluster.
        n_clusters: Number of clusters to form.
        random_state: Seed passed to KMeans so repeated runs agree.

    Returns:
        A ``(labels, centers)`` tuple: the cluster label assigned to each
        row of ``X`` and the fitted model's ``cluster_centers_``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    assignments = model.fit_predict(X)
    centroids = model.cluster_centers_
    return assignments, centroids
def calculate_wcss(
    X: np.ndarray,
    max_clusters: int = 10,
    random_state: int = 42
) -> List[float]:
    """Compute within-cluster sum of squares for k = 1..max_clusters.

    One KMeans model is fitted per candidate ``k`` and its ``inertia_`` is
    recorded; the resulting curve is what an elbow plot is drawn from.

    Args:
        X: Feature matrix.
        max_clusters: Largest number of clusters to evaluate (inclusive).
            Values below 1 produce an empty list because no model is fitted.
        random_state: Seed passed to every KMeans fit. Defaults to 42, the
            value that was previously hard-coded, so existing callers get
            the same results; it mirrors the parameter of ``fit_kmeans``.

    Returns:
        List of inertia values, where index ``i`` holds the inertia for
        ``k = i + 1``.
    """
    # NOTE(review): scikit-learn is expected to raise if a candidate k
    # exceeds the number of rows in X -- confirm callers cap max_clusters.
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(X).inertia_
        for k in range(1, max_clusters + 1)
    ]