# Stack the train and test frames so that the frequency encodings below are
# computed over both of them together.
frames = [train, test]
df = pd.concat(frames)

# Columns that are replaced by frequency-rank labels further down.
list_frequency_encoding = [
    'AppVersion',
    'AvSigVersion',
    'Census_OSVersion',
    'EngineVersion',
    'OsBuildLab',
]
def frequency_encoding(feature, frame=None):
    """Build a value -> frequency-rank mapping for one column.

    Distinct values are ranked by descending number of occurrences (the most
    frequent value gets label 0, the next one 1, ...).  Every value that
    occurs exactly once shares a single "rare" label, which is one past the
    last rank handed out to a value occurring more than once.

    Args:
        feature: Name of the column to encode.
        frame: DataFrame to read the column from.  Defaults to the
            module-level ``df`` so existing one-argument calls keep working.

    Returns:
        dict mapping each distinct non-missing value of the column to its
        integer label.  Missing values are not present in the mapping because
        ``value_counts`` excludes them by default.
    """
    source = df if frame is None else frame

    # value_counts() sorts by descending count, so values that occur only
    # once form the tail of the result.
    counts = source[feature].value_counts()

    # Work from positions instead of the column names that reset_index()
    # generates: those names changed in pandas 2.0, which broke lookups by
    # 'index' / 'level_0'.
    n_frequent = int((counts > 1).sum())

    # Positions below n_frequent keep their rank; the singleton tail is
    # clamped to the shared label n_frequent.  When every value is a
    # singleton this yields 0 for all of them instead of NaN.
    labels = np.minimum(np.arange(len(counts)), n_frequent)
    return dict(zip(counts.index.tolist(), labels.tolist()))
# Replace every listed column of ``df`` by its frequency-rank label, in place.
for feature in tqdm(list_frequency_encoding):
    freq_enc_dict = frequency_encoding(feature)
    # Series.map accepts the dict directly: values without a key become NaN,
    # exactly what ``freq_enc_dict.get(x, np.nan)`` produced per element.
    # NOTE: astype('int64') raises if any NaN is left, e.g. for missing
    # values, which the mapping does not contain.
    df[feature] = df[feature].map(freq_enc_dict).astype('int64')