|
"""metrics.py module.""" |
|
|
|
from typing import Dict, List |
|
import numpy as np |
|
from scipy.stats import wasserstein_distance, ks_2samp |
|
from sklearn.metrics import mutual_info_score, silhouette_score |
|
from sklearn.neighbors import NearestNeighbors |
|
|
|
def calculate_model_similarity(model_a, model_b):
    """Stub similarity metric between two models.

    The real comparison logic is not implemented yet; every pair of
    models is reported as perfectly similar.

    Returns:
        float: Dummy similarity score, always ``1.0``.
    """
    # TODO: replace this stub with an actual model-comparison metric.
    dummy_score = 1.0
    return dummy_score
|
|
|
class MetricsCalculator:
    """Static helpers for scoring synthetic data against real data.

    Two metric families:
      * fidelity -- how closely the synthetic distribution matches the real one
        (:meth:`calculate_distribution_similarity`);
      * privacy  -- heuristic estimates of re-identification / inference risk
        (:meth:`evaluate_privacy_metrics`).

    All methods are stateless static methods operating on ``np.ndarray`` inputs.
    """

    @staticmethod
    def calculate_distribution_similarity(real_data: np.ndarray,
                                          synthetic_data: np.ndarray) -> Dict[str, float]:
        """Calculate statistical similarity metrics between real and synthetic data.

        Args:
            real_data: Real observations; flattened for the 1-D statistics.
            synthetic_data: Synthetic observations; flattened likewise.

        Returns:
            Dict with keys ``wasserstein``, ``mutual_info``, ``ks_statistic``,
            ``ks_p_value``, ``mean_diff``, ``std_diff`` and ``percentile_diff``.
        """
        metrics = {}

        # Earth-mover's distance between the flattened marginal distributions.
        metrics['wasserstein'] = wasserstein_distance(
            real_data.flatten(),
            synthetic_data.flatten()
        )

        # NOTE(review): mutual_info_score expects discrete labels; on continuous
        # floats every unique value becomes its own "label", which can inflate
        # the score. Consider binning first -- left unchanged to preserve the
        # current metric values.
        metrics['mutual_info'] = mutual_info_score(
            real_data.flatten(),
            synthetic_data.flatten()
        )

        # Two-sample Kolmogorov-Smirnov test on the flattened samples.
        ks_statistic, p_value = ks_2samp(real_data.flatten(), synthetic_data.flatten())
        metrics['ks_statistic'] = ks_statistic
        metrics['ks_p_value'] = p_value

        # Simple moment and quantile gaps, computed over all elements.
        metrics['mean_diff'] = abs(np.mean(real_data) - np.mean(synthetic_data))
        metrics['std_diff'] = abs(np.std(real_data) - np.std(synthetic_data))
        metrics['percentile_diff'] = np.mean([
            abs(np.percentile(real_data, p) - np.percentile(synthetic_data, p))
            for p in (25, 50, 75)
        ])

        return metrics

    @staticmethod
    def evaluate_privacy_metrics(model, test_data: np.ndarray,
                                 synthetic_data: np.ndarray) -> Dict[str, float]:
        """Evaluate privacy-related metrics.

        Args:
            model: Unused; kept for interface compatibility with callers.
            test_data: Held-out real records, shape ``(n_samples, n_features)``.
            synthetic_data: Generated records with the same feature layout.

        Returns:
            Dict with keys ``membership_inference_risk``,
            ``attribute_inference_risk``, ``k_anonymity_score`` and
            ``uniqueness_score`` (all in roughly [0, 1]).
        """
        metrics = {}

        metrics['membership_inference_risk'] = MetricsCalculator._calculate_membership_inference_risk(
            test_data, synthetic_data
        )

        metrics['attribute_inference_risk'] = MetricsCalculator._calculate_attribute_inference_risk(
            test_data, synthetic_data
        )

        metrics['k_anonymity_score'] = MetricsCalculator._calculate_k_anonymity(synthetic_data)

        metrics['uniqueness_score'] = MetricsCalculator._calculate_uniqueness(synthetic_data)

        return metrics

    @staticmethod
    def _calculate_membership_inference_risk(test_data: np.ndarray,
                                             synthetic_data: np.ndarray) -> float:
        """Calculate membership inference risk from nearest-neighbor distances.

        For each real test record, measure the mean distance to its ``k``
        nearest synthetic neighbors. Real records that sit *close* to
        synthetic records are the ones an attacker can link back, so risk
        must DECREASE as the average distance grows.

        Returns:
            float in (0, 1]: ~1 when synthetic records nearly duplicate real
            ones, approaching 0 as they move far away.
        """
        k = 3
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(synthetic_data)

        distances, _ = nn.kneighbors(test_data)
        avg_min_distances = distances.mean(axis=1)

        # BUGFIX: the previous formula, 1 - 1/(1 + d), *grew* with distance,
        # reporting maximal risk for synthetic data far from any real record.
        # Inverted so that small distances (memorization) yield high risk.
        risk_score = 1.0 / (1.0 + float(np.mean(avg_min_distances)))
        return risk_score

    @staticmethod
    def _calculate_attribute_inference_risk(test_data: np.ndarray,
                                            synthetic_data: np.ndarray) -> float:
        """Calculate attribute inference risk via feature-correlation analysis.

        Compares the feature-correlation matrices of real and synthetic data;
        the larger the discrepancy, the more an attacker could learn wrong —
        or the synthetic data leaks different — attribute relationships.

        Returns:
            float in [0, 1): 0 when the correlation structures match exactly.
        """
        real_corr = np.corrcoef(test_data.T)
        synth_corr = np.corrcoef(synthetic_data.T)

        # np.corrcoef emits NaN rows for zero-variance features; nanmean
        # ignores those cells instead of propagating NaN into the score.
        # (Identical to a plain mean when no NaNs are present.)
        correlation_diff = float(np.nanmean(np.abs(real_corr - synth_corr)))

        # Map the unbounded difference into [0, 1).
        risk_score = 1.0 - np.exp(-correlation_diff)
        return risk_score

    @staticmethod
    def _calculate_k_anonymity(data: np.ndarray, k: int = 5) -> float:
        """Calculate approximate k-anonymity score.

        Args:
            data: Records to assess, shape ``(n_samples, n_features)``.
            k: Group size used for the neighborhood check.

        Returns:
            Fraction of records whose neighborhood of size ``k`` is spread
            out (farthest group member beyond the 0.1 threshold).
        """
        nn = NearestNeighbors(n_neighbors=k)
        nn.fit(data)

        # Querying the fitted data includes each point itself (distance 0) as
        # its first neighbor, so the last column is the distance to the
        # (k-1)-th *other* record, i.e. the spread of a candidate group of k.
        distances, _ = nn.kneighbors(data)
        k_anonymity_scores = distances[:, -1]

        # NOTE(review): a HIGH value means records are relatively isolated /
        # unique within their k-group -- confirm callers interpret the
        # direction this way; left unchanged to preserve behavior.
        return float(np.mean(k_anonymity_scores > 0.1))

    @staticmethod
    def _calculate_uniqueness(data: np.ndarray) -> float:
        """Calculate uniqueness score of the dataset.

        Uses each record's distance to its single nearest *other* record
        (column 0 of the k-NN result is the point itself).

        Returns:
            Fraction of records whose nearest-neighbor distance exceeds the
            dataset median (by construction roughly 0.5 for distinct rows).
        """
        nn = NearestNeighbors(n_neighbors=2)
        nn.fit(data)

        distances, _ = nn.kneighbors(data)
        uniqueness_scores = distances[:, 1]

        return float(np.mean(uniqueness_scores > np.median(uniqueness_scores)))
|
|
|
|