import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import distributions

# Distribution used by compute_curve when the caller does not supply one.
binary_dist = distributions.BinaryYFDistribution(
    p_f=0.5, p_y_given_f1=0.8, p_y_given_f0=0.2
)


def get_mse(*, n, mu_f, mu_y, var_f, var_y, cov_fy, cov_f2_y, cov_f_y2, cov_f2_y2):
    """Return the closed-form cross-fit MSE expression for sample size ``n``.

    All arguments are keyword-only. Apart from ``n`` they are population
    moments of the pair (f, y); the names mirror the accessors used in
    ``compute_curve`` (``mu_*`` means, ``var_*`` variances, ``cov_*``
    covariances).
    NOTE(review): presumably ``cov_f2_y`` is Cov(f^2, y), ``cov_f_y2`` is
    Cov(f, y^2) and ``cov_f2_y2`` is Cov(f^2, y^2) -- confirm against the
    ``distributions`` module and the derivation this formula comes from.

    The formula divides by ``n / 2``, ``n / 2 - 1`` and ``var_f``, so it is
    only meaningful for ``n > 2`` and ``var_f != 0``.

    Returns:
        The value of the ``mse_cross_fit`` expression: half of the
        single-split expression plus a squared correction term scaled by
        ``1 / (n**2 * var_f**2)``.
    """
    # Every per-split quantity below is expressed in terms of n / 2.
    # NOTE(review): presumably the size of one fold -- confirm.
    cov_n = n / 2

    cov_err = (
        (1 / cov_n) * cov_f2_y2
        + (1 / (cov_n - 1)) * var_f * var_y
        - (cov_n - 2) / (cov_n * (cov_n - 1)) * cov_fy**2
        - (2 / cov_n)
        * (cov_f_y2 * mu_f + cov_f2_y * mu_y - 2 * cov_fy * mu_f * mu_y)
    )

    mse_single_split = (
        var_y / cov_n
        - cov_fy**2 / (cov_n * var_f)
        + (cov_err / (cov_n * var_f))
    )

    # The correction is divided by n**2 * var_f**2. The previous spelling
    # `1 / n**2 * var_f**2` parsed as `var_f**2 / n**2`, which multiplied by
    # var_f**2 and was inconsistent with the other terms (all of which divide
    # by var_f) and with the units of an MSE.
    correction = (1 / (n**2 * var_f**2)) * (cov_f2_y - 2 * cov_fy * mu_f) ** 2

    mse_cross_fit = (1 / 2) * mse_single_split + correction
    return mse_cross_fit


def compute_curve(dist=binary_dist, n_range=np.arange(4, 50, 2)):
    """Tabulate variance and standard-deviation figures over sample sizes.

    Args:
        dist: Object exposing ``mu_f``, ``mu_y``, ``variance_f``,
            ``variance_y``, ``covariance_f_y``, ``covariance_f2_y`` and
            ``covariance_f_y2`` as zero-argument methods. Defaults to the
            module-level ``binary_dist``.
        n_range: Iterable of sample sizes; defaults to 4, 6, ..., 48. The
            default array is shared between calls and is only read here.

    Returns:
        A list with one dict per ``n`` containing:
        ``"n"``; ``"cross-fit_var"`` (the ``get_mse`` value);
        ``"classical_var"`` (``var_y / n``); ``"ppi_var"``
        (``(var_y + var_f - 2 * cov_fy) / n``); ``"cross-fit_std"`` and
        ``"classical_std"`` (square roots of the corresponding variances);
        ``"relative_var_ppi"`` and ``"relative_var_cf"`` (the PPI and
        cross-fit values relative to the classical one).
    """
    mu_f = dist.mu_f()
    mu_y = dist.mu_y()
    var_f = dist.variance_f()
    var_y = dist.variance_y()
    cov_fy = dist.covariance_f_y()
    # Variance of the difference y - f, expanded from its components.
    var_y_minus_f = var_y + var_f - 2 * cov_fy

    cov_f2_y = dist.covariance_f2_y()
    cov_f_y2 = dist.covariance_f_y2()
    # NOTE(review): this reuses covariance_f_y2(), the same accessor as the
    # line above, which looks like a copy-paste slip for a dedicated
    # covariance_f2_y2() accessor. The distributions module is not visible
    # from here, so the call is left unchanged -- confirm and fix there.
    cov_f2_y2 = dist.covariance_f_y2()

    res = []
    for n in n_range:
        cf_mse = get_mse(
            n=n,
            mu_f=mu_f,
            mu_y=mu_y,
            var_f=var_f,
            var_y=var_y,
            cov_fy=cov_fy,
            cov_f2_y=cov_f2_y,
            cov_f_y2=cov_f_y2,
            cov_f2_y2=cov_f2_y2,
        )
        # Variance of the plain sample mean of y; reused for three entries.
        classical_var = var_y / n
        res.append(
            {
                "n": n,
                "cross-fit_var": cf_mse,
                "classical_var": classical_var,
                "ppi_var": var_y_minus_f / n,
                "cross-fit_std": np.sqrt(cf_mse),
                "classical_std": np.sqrt(classical_var),
                "relative_var_ppi": var_y_minus_f / var_y,
                "relative_var_cf": cf_mse / classical_var,
            }
        )
    return res