### Kernel Density Estimation 
Given $n$ data points $X \in \mathbb{R}^{n\times m}$, estimate the probability density function of the data, i.e. $P(x)$.

In KDE, the pdf is given by $P(x) = \frac{1}{nh^m}\sum_{i=1}^{n}K\left(\frac{X_i-x}{h}\right)$,
where $K$ is the kernel function, $X_i$ is the $i$-th data point, and $h$ is the smoothing bandwidth (a small $h$ undersmooths, a large $h$ oversmooths).

In [None]:
import sklearn
import fnmatch
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

#### Load the real data and select samples for a specific race and sex

In [None]:
# Read the complete istaging table from disk into a DataFrame.
istaging_csv = 'istaging_all.csv'
df = pd.read_csv(istaging_csv)

In [None]:
# Keep only the rows whose Race is 'Black' and whose Sex is 'F',
# then renumber the remaining rows from 0.
is_black = df['Race'] == 'Black'
is_female = df['Sex'] == 'F'
df = df.loc[is_black & is_female].reset_index(drop=True)

In [None]:
# Baseline = the earliest-dated row of every subject (PTID).
# Parse the dates first so that "earliest" is a chronological comparison.
df['Date'] = pd.to_datetime(df['Date'])
first_visit_idx = df.groupby('PTID')['Date'].idxmin()  # row label of each subject's earliest Date
df_tp1 = df.loc[first_visit_idx].reset_index(drop=True)

In [None]:
# Split the baseline rows into a train set (70%) and a test set (30%).
# The train set is used to learn the probability distribution of the real data;
# the fixed random_state keeps the split reproducible.
split_options = {'test_size': 0.3, 'random_state': 40}
df_train, df_test = sklearn.model_selection.train_test_split(df_tp1, **split_options)

#### Fit a KDE model to estimate the joint probability density of Age and ROI volumes.

In [None]:
## standardized ROI grid search
# Build the feature matrix (Age + ROI volumes), z-score every column and
# choose the KDE bandwidth with a 5-fold cross-validated grid search.
cols = ['Age']
roi_cols = []  # fill in with the ROI column names (exact names or shell-style wildcard patterns)
# fnmatch.filter() takes ONE pattern string, not a list, so match each entry
# separately; skip columns that an earlier pattern has already selected.
for roi_pattern in roi_cols:
    for matched_col in fnmatch.filter(df_train.columns, roi_pattern):
        if matched_col not in cols:
            cols.append(matched_col)
data = df_train.loc[:, cols].to_numpy()

# standardize the data: zero mean and unit (sample, ddof=1) standard deviation per column
features = df_train.loc[:, cols]
data_standard = ((features - features.mean()) / features.std()).to_numpy()

# Use a Gaussian kernel.
# The bandwidth must be strictly positive, so the 0 at the start of the grid is
# dropped; otherwise every fit for that candidate fails inside the grid search.
bandwidth_grid = np.linspace(0, 3, 100)[1:]
grid_search = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidth_grid}, cv=5)
grid_search.fit(data_standard)
kde = grid_search.best_estimator_  # KDE refit on all of data_standard with the best bandwidth
print(f'optimal bandwidth of kernel estimated via grid search is {kde.bandwidth_} ')

#### Generate synthetic data using a KDE model for the specified category of race and sex

In [None]:
# sample 3000 data points
n_synth = 3000
covariate_cols = ['PTID', 'Sex', 'Race']
# Columns the KDE was fitted on (Age followed by the ROI volumes). The covariate
# names are filtered out so this cell can be re-run after `cols` is reassigned below.
feature_cols = [c for c in cols if c not in covariate_cols]

sample = kde.sample(n_synth, random_state=0)
# Undo the z-scoring with the training-set statistics to return to the original units.
sample = sample * df_train[feature_cols].std().to_numpy() + df_train[feature_cols].mean().to_numpy()

# Build the frame from the numeric samples first so Age and the ROI volumes stay
# floats; concatenating them with the string covariates in one NumPy array would
# turn every value into a string.
df_kde_synth = pd.DataFrame(sample, columns=feature_cols)
df_kde_synth.insert(0, 'Race', 'Black')
df_kde_synth.insert(0, 'Sex', 'F')
df_kde_synth.insert(0, 'PTID', [f'Synth_{i+1}' for i in range(n_synth)])

# Final column order: PTID, Sex, Race, Age, then the ROI volumes.
cols = covariate_cols + feature_cols