# Transcendental-Programmer
# fix : minor fixes
# fc5fa78
"""metrics.py module."""
from typing import Dict, List
import numpy as np
from scipy.stats import wasserstein_distance, ks_2samp
from sklearn.metrics import mutual_info_score, silhouette_score
from sklearn.neighbors import NearestNeighbors
def calculate_model_similarity(model_a, model_b):
    """Return a stub similarity score for two models.

    Neither argument is inspected yet: the function is a placeholder and
    reports perfect similarity for every pair it is given.

    Args:
        model_a: First model (currently ignored).
        model_b: Second model (currently ignored).

    Returns:
        The constant ``1.0``.
    """
    # Fixed placeholder value until a real comparison is implemented.
    placeholder_score = 1.0
    return placeholder_score
class MetricsCalculator:
    """Static helpers that score synthetic data against real data.

    The class holds no state; every method is a ``staticmethod``.
    """

    @staticmethod
    def calculate_distribution_similarity(real_data: np.ndarray,
                                          synthetic_data: np.ndarray) -> Dict[str, float]:
        """Calculate statistical similarity metrics between real and synthetic data.

        Both arrays are flattened, so all values are pooled into a single
        one-dimensional sample regardless of the original column layout.

        Args:
            real_data: Array of real observations.
            synthetic_data: Array of generated observations.

        Returns:
            Dict with the keys ``wasserstein``, ``mutual_info``,
            ``ks_statistic``, ``ks_p_value``, ``mean_diff``, ``std_diff`` and
            ``percentile_diff``; every value is a builtin ``float``.
        """
        real_flat = real_data.flatten()
        synthetic_flat = synthetic_data.flatten()
        metrics = {}

        # Wasserstein distance
        metrics['wasserstein'] = float(
            wasserstein_distance(real_flat, synthetic_flat)
        )

        # Mutual information between position-paired values (this is not a KL
        # divergence). mutual_info_score needs equally long inputs, so only the
        # common prefix is paired; equal-sized inputs are unaffected.
        # NOTE(review): mutual_info_score treats each distinct value as a
        # label, so on continuous data this is of limited meaning -- consider
        # binning the values first.
        paired = min(real_flat.size, synthetic_flat.size)
        metrics['mutual_info'] = float(
            mutual_info_score(real_flat[:paired], synthetic_flat[:paired])
        )

        # Kolmogorov-Smirnov test
        ks_statistic, p_value = ks_2samp(real_flat, synthetic_flat)
        metrics['ks_statistic'] = float(ks_statistic)
        metrics['ks_p_value'] = float(p_value)

        # Basic statistical measures
        metrics['mean_diff'] = float(abs(np.mean(real_data) - np.mean(synthetic_data)))
        metrics['std_diff'] = float(abs(np.std(real_data) - np.std(synthetic_data)))
        metrics['percentile_diff'] = float(np.mean([
            abs(np.percentile(real_data, p) - np.percentile(synthetic_data, p))
            for p in [25, 50, 75]
        ]))
        return metrics

    @staticmethod
    def evaluate_privacy_metrics(model, test_data: np.ndarray,
                                 synthetic_data: np.ndarray) -> Dict[str, float]:
        """Evaluate privacy-related metrics.

        Args:
            model: Not used by the current implementation; kept so existing
                callers keep working.
            test_data: Real records, one row per record.
            synthetic_data: Generated records, one row per record.

        Returns:
            Dict with the keys ``membership_inference_risk``,
            ``attribute_inference_risk``, ``k_anonymity_score`` and
            ``uniqueness_score``.
        """
        metrics = {}
        # Membership inference risk
        metrics['membership_inference_risk'] = MetricsCalculator._calculate_membership_inference_risk(
            test_data, synthetic_data
        )
        # Attribute inference risk
        metrics['attribute_inference_risk'] = MetricsCalculator._calculate_attribute_inference_risk(
            test_data, synthetic_data
        )
        # k-anonymity approximation
        metrics['k_anonymity_score'] = MetricsCalculator._calculate_k_anonymity(synthetic_data)
        # Uniqueness score
        metrics['uniqueness_score'] = MetricsCalculator._calculate_uniqueness(synthetic_data)
        return metrics

    @staticmethod
    def _calculate_membership_inference_risk(test_data: np.ndarray,
                                             synthetic_data: np.ndarray,
                                             k: int = 3) -> float:
        """Calculate a membership inference score from nearest-neighbour distances.

        For every test record the distances to its ``k`` closest synthetic
        records are averaged; the mean ``d`` over all test records is mapped
        to ``1 - 1 / (1 + d)``, which lies in ``[0, 1)``.

        NOTE(review): the result *grows* with distance, i.e. it is larger when
        synthetic records are farther from the test records. Despite the
        "risk" name it therefore behaves like a privacy score -- confirm how
        callers interpret it before changing the formula.

        Args:
            test_data: Real records, one row per record.
            synthetic_data: Generated records, one row per record.
            k: Number of neighbours to consider (default 3).

        Returns:
            Score in ``[0, 1)``.
        """
        # Asking for more neighbours than fitted samples makes kneighbors
        # raise, so fall back to every available synthetic record.
        n_neighbors = min(k, len(synthetic_data))
        nn = NearestNeighbors(n_neighbors=n_neighbors)
        nn.fit(synthetic_data)
        distances, _ = nn.kneighbors(test_data)
        avg_min_distances = distances.mean(axis=1)
        risk_score = 1.0 - (1.0 / (1.0 + np.mean(avg_min_distances)))
        return float(risk_score)

    @staticmethod
    def _calculate_attribute_inference_risk(test_data: np.ndarray,
                                            synthetic_data: np.ndarray) -> float:
        """Calculate attribute inference risk using correlation analysis.

        The feature-by-feature correlation matrices of both datasets are
        compared; the mean absolute difference ``m`` is mapped to
        ``1 - exp(-m)``.

        Args:
            test_data: Real records, one row per record.
            synthetic_data: Generated records with the same number of columns.

        Returns:
            Score in ``[0, 1)``; ``0.0`` when both correlation matrices match.
        """
        # A zero-variance column gives 0/0 inside corrcoef; silence that
        # warning here and handle the resulting NaNs explicitly below.
        with np.errstate(divide='ignore', invalid='ignore'):
            real_corr = np.corrcoef(test_data.T)
            synth_corr = np.corrcoef(synthetic_data.T)
        # Undefined correlations are treated as "no linear relationship" so a
        # single constant column no longer turns the whole score into NaN.
        real_corr = np.nan_to_num(real_corr, nan=0.0)
        synth_corr = np.nan_to_num(synth_corr, nan=0.0)
        # Compare correlation matrices
        correlation_diff = np.abs(real_corr - synth_corr).mean()
        # Convert to risk score (0 to 1, where lower is better)
        return float(1.0 - np.exp(-correlation_diff))

    @staticmethod
    def _calculate_k_anonymity(data: np.ndarray, k: int = 5,
                               distance_threshold: float = 0.1) -> float:
        """Calculate approximate k-anonymity score.

        Returns the fraction of records whose ``k``-th nearest neighbour
        within ``data`` is farther away than ``distance_threshold``.

        NOTE(review): the query set is the fitted set, so each record's
        closest neighbour is normally itself; the last column is then the
        distance to the ``(k - 1)``-th *other* record. Confirm whether
        ``k + 1`` neighbours were intended.

        Args:
            data: Records, one row per record.
            k: Number of neighbours to look at (default 5).
            distance_threshold: Distance above which a record counts towards
                the score (default 0.1).

        Returns:
            Fraction in ``[0, 1]``.
        """
        # Clamp so datasets with fewer than k records do not make kneighbors raise.
        n_neighbors = min(k, len(data))
        nn = NearestNeighbors(n_neighbors=n_neighbors)
        nn.fit(data)
        distances, _ = nn.kneighbors(data)
        kth_distances = distances[:, -1]  # Distance to the farthest requested neighbour
        return float(np.mean(kth_distances > distance_threshold))

    @staticmethod
    def _calculate_uniqueness(data: np.ndarray) -> float:
        """Calculate uniqueness score of the dataset.

        Returns the fraction of records whose distance to the nearest
        non-self neighbour is strictly greater than the median of those
        distances.

        NOTE(review): because the threshold is the median of the same values,
        the result can never exceed roughly 0.5 -- verify this is the intended
        scale.

        Args:
            data: Records, one row per record; needs at least two rows for the
                two-neighbour query.

        Returns:
            Fraction in ``[0, 1]``.
        """
        nn = NearestNeighbors(n_neighbors=2)  # 2 because first neighbor is self
        nn.fit(data)
        distances, _ = nn.kneighbors(data)
        uniqueness_scores = distances[:, 1]  # Distance to nearest non-self neighbor
        return float(np.mean(uniqueness_scores > np.median(uniqueness_scores)))