Maria Castellanos committed
Commit · 5c22f32
1 Parent(s): d23ae67

fix evaluation
about.py CHANGED
@@ -2,7 +2,7 @@ import os
 from huggingface_hub import HfApi
 
 ENDPOINTS = ["LogD",
-             "
+             "KSOL",
              "MLM CLint",
              "HLM CLint",
              "Caco-2 Permeability Efflux",
@@ -11,13 +11,24 @@ ENDPOINTS = ["LogD",
              "MBPB",
              "MGMB"]
 
-STANDARD_COLS = ["
+STANDARD_COLS = ["Endpoint", "user", "submission_time", "model_report"]
 METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
 # Final columns
 LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
 LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
+# Dictionary with unit conversion multipliers for each endpoint
+multiplier_dict = {"LogD": 1,
+                   "KSOL": 1e-6,
+                   "MLM CLint": 1,
+                   "HLM CLint": 1,
+                   "Caco-2 Permeability Efflux": 1e-6,
+                   "Caco-2 Permeability Papp A>B": 1,
+                   "MPPB": 1,
+                   "MBPB": 1,
+                   "MGMB": 1}
+
 TOKEN = os.environ.get("HF_TOKEN")
 CACHE_PATH=os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)
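For context, a minimal sketch (not part of this commit) of how the new multiplier_dict is meant to be used during evaluation: the multiplier rescales raw predictions before they are put on a log scale, and endpoints missing from the dict fall back to 1. The helper name to_log_scale and the KSOL values below are invented for illustration; the actual helper added in this commit is utils.convert_to_log.

import numpy as np

multiplier_dict = {"LogD": 1, "KSOL": 1e-6}

def to_log_scale(values, multiplier):
    # Mirrors utils.convert_to_log from this commit: clip to avoid -inf,
    # rescale the units, then take log10.
    return np.log10(np.clip(values, 0.01, None) * multiplier)

raw_ksol = np.array([250.0, 12.5])                              # hypothetical predictions
print(to_log_scale(raw_ksol, multiplier_dict.get("KSOL", 1)))   # approx. [-3.60, -4.90]
print(multiplier_dict.get("Some other endpoint", 1))            # unlisted endpoints default to 1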
app.py CHANGED
@@ -16,8 +16,7 @@ from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(
-    df_results = df_results0.rename(columns={"endpoint": "Endpoint"})
+def build_leaderboard(df_results):
     per_ep = {}
     for ep in ALL_EPS:
         df = df_results[df_results["Endpoint"] == ep].copy()
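A toy illustration (data invented, not from the benchmark) of the per-endpoint split that build_leaderboard now performs: the incoming results frame is expected to already carry an "Endpoint" column, including an "Average" row, instead of being renamed inside the function.

import pandas as pd

ENDPOINTS = ["LogD", "KSOL"]                  # shortened for the example
ALL_EPS = ["Average"] + ENDPOINTS

df_results = pd.DataFrame({
    "Endpoint": ["LogD", "KSOL", "Average"],
    "user": ["user_a"] * 3,                   # made-up submission
    "mean_MAE": [0.45, 0.62, 0.53],
})

# Same filtering pattern as in app.py's build_leaderboard
per_ep = {ep: df_results[df_results["Endpoint"] == ep].copy() for ep in ALL_EPS}
print(per_ep["Average"][["user", "mean_MAE"]])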
evaluate.py CHANGED
@@ -2,8 +2,14 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
 from typing import Optional
-from about import
-
+from about import (
+    ENDPOINTS, API,
+    submissions_repo,
+    results_repo,
+    test_repo,
+    multiplier_dict,
+)
+from utils import bootstrap_metrics, convert_to_log
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -188,7 +194,7 @@ def evaluate_data(filename: str) -> None:
         test_path = hf_hub_download(
             repo_id=test_repo,
             repo_type="dataset",
-            filename="data/
+            filename="data/expansion_data_test.csv",
         )
     except Exception as e:
         raise gr.Error(f"Failed to download test file: {e}")
@@ -255,12 +261,10 @@ def calculate_metrics(
     # Do some checks
 
     # 1) Check all columns are present
-
-
-    _check_required_columns(results_dataframe, "Results file", ["Name"] + ENDPOINTS)
-    _check_required_columns(test_dataframe, "Test file", ["Name"] + ENDPOINTS)
+    _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
+    _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
     # 2) Check all Molecules in the test set are present in the predictions
-    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Name'], how='left', indicator=True)
+    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
     if not (merged_df['_merge'] == 'both').all():
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
     # TODO: What to do when a molecule is duplicated in the Predictions file?
@@ -269,11 +273,8 @@ def calculate_metrics(
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
     for i, measurement in enumerate(ENDPOINTS):
-        df_pred = results_dataframe[['Name', measurement]].copy()
-
-        mask = test_dataframe[f"op_{measurement}"] != '='
-        test_dataframe.loc[mask, measurement] = np.nan
-        df_true = test_dataframe[['Name', measurement]].copy()
+        df_pred = results_dataframe[['Molecule Name', measurement]].copy()
+        df_true = test_dataframe[['Molecule Name', measurement]].copy()
         # coerce numeric columns
         df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
         df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
@@ -287,47 +288,47 @@ def calculate_metrics(
             df_pred.rename(columns={measurement: f"{measurement}_pred"})
             .merge(
                 df_true.rename(columns={measurement: f"{measurement}_true"}),
-                on="Name",
+                on="Molecule Name",
                 how="inner",
             )
             .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
         )
-
-
-
-        merged = merged.sort_values("Name", kind="stable")
+        merged = merged.sort_values("Molecule Name", kind="stable")
+        pred_col = f"{measurement}_pred"
+        true_col = f"{measurement}_true"
 
-
-
-
-
-        y_pred = np.log10(y_pred)
-        y_true = np.log10(y_true)
+        if measurement not in ['logD']:
+            # Force log scale for all endpoints except LogD (for outliers)
+            merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
+            merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
 
+        y_pred = merged[pred_col].to_numpy()
+        y_true = merged[true_col].to_numpy()
         # Calculate dataframe with the metrics for 1000 bootstraps
         bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
         df_endpoint = bootstrap_df.pivot_table(
             index=["Endpoint"],
-            columns=
+            columns="Metric",
             values="Value",
             aggfunc=["mean", "std"]
         ).reset_index()
         # Get a df with columns 'mean_MAE', 'std_MAE', ...
         df_endpoint.columns = [
-            f'{
+            f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
         ]
-        df_endpoint.rename(columns={'
+        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
         all_endpoint_results.append(df_endpoint)
 
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
-    mean_cols = [f'{m}
-    std_cols = [f'{m}
+    mean_cols = [f'mean_{m}' for m in final_cols]
+    std_cols = [f'std_{m}' for m in final_cols]
     # Average results
     macro_means = df_results[mean_cols].mean()
    macro_stds = df_results[std_cols].mean()
-    avg_row = {"
+    avg_row = {"Endpoint": "Average"}
     avg_row.update(macro_means.to_dict())
     avg_row.update(macro_stds.to_dict())
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
-
-
+    # Fix order of columns
+    df_with_average = df_with_average[["Endpoint"]+mean_cols+std_cols]
+    return df_with_average
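The pivot-and-flatten step above is the least obvious part of the new aggregation, so here is a standalone sketch on a tiny fabricated bootstrap table (two samples, two metrics, values invented); it reproduces the 'mean_MAE'/'std_MAE' column naming and the 'Endpoint_' rename used in calculate_metrics.

import pandas as pd

# Fabricated bootstrap output with the same columns as utils.bootstrap_metrics
bootstrap_df = pd.DataFrame({
    "Sample":   [0, 0, 1, 1],
    "Endpoint": ["KSOL"] * 4,
    "Metric":   ["MAE", "R2", "MAE", "R2"],
    "Value":    [0.50, 0.80, 0.54, 0.78],
})

df_endpoint = bootstrap_df.pivot_table(
    index=["Endpoint"],
    columns="Metric",
    values="Value",
    aggfunc=["mean", "std"],
).reset_index()

# Flatten the (aggfunc, metric) MultiIndex into flat names: 'mean_MAE', 'std_MAE', ...
df_endpoint.columns = [f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns]
# reset_index() names the index column ('Endpoint', ''), i.e. 'Endpoint_' after flattening
df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)

print(df_endpoint.columns.tolist())
# ['Endpoint', 'mean_MAE', 'mean_R2', 'std_MAE', 'std_R2']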
utils.py CHANGED
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from typing import Tuple
-from datasets import load_dataset
+from datasets import load_dataset, Features, Value
 from about import results_repo
 from about import METRICS, STANDARD_COLS
 
@@ -13,7 +13,27 @@ def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
 def fetch_dataset_df():
-
+
+    # Specify feature types to load results dataset
+    metric_features = {
+        f'mean_{m}': Value('float64') for m in METRICS
+    }
+    metric_features.update({
+        f'std_{m}': Value('float64') for m in METRICS
+    })
+    other_features = {
+        'user': Value('string'),
+        'Endpoint': Value('string'),
+        'submission_time': Value('string'),
+        'model_report': Value('string'),
+        'anonymous': Value('bool'),
+    }
+    feature_schema = Features(metric_features | other_features)
+
+    dset = load_dataset(results_repo,
+                        split='train',
+                        features=feature_schema,
+                        download_mode="force_redownload")
     full_df = dset.to_pandas()
     expected_mean_cols = [f"mean_{col}" for col in METRICS]
     expected_std_cols = [f"std_{col}" for col in METRICS]
@@ -30,13 +50,19 @@ def fetch_dataset_df():
     # Get the most recent submission per user & endpoint
     latest = (
         df.sort_values("submission_time")
-        .drop_duplicates(subset=["
-        .sort_values(["
+        .drop_duplicates(subset=["Endpoint", "user"], keep="last")
+        .sort_values(["Endpoint", "user"])
         .reset_index(drop=True)
     )
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
+def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
+    # Add 0.01 to avoid inf
+    values = np.clip(data, a_min=0.01, a_max=None)
+    values = values * multiplier # Adjust units
+    return np.log10(values)
+
 def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
     """
     Generate bootstrap samples for a given size and number of samples.
@@ -111,7 +137,7 @@ def bootstrap_metrics(pred: np.ndarray,
     Dataframe with estimated metric per bootstrap sample for the given endpoint
     """
     cols = ["Sample", "Endpoint", "Metric", "Value"]
-    bootstrap_results = pd.DataFrame(columns=cols
+    bootstrap_results = pd.DataFrame(columns=cols)
     for i, indx in enumerate(
         bootstrap_sampling(true.shape[0], n_bootstrap_samples)
     ):
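As a quick sanity check (not part of the commit), the clip in convert_to_log is what keeps zero or negative raw values from turning into -inf or NaN once the log is taken; the input values below are invented.

import numpy as np
import pandas as pd

def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
    # Copied from the new utils.convert_to_log above
    values = np.clip(data, a_min=0.01, a_max=None)
    values = values * multiplier  # Adjust units
    return np.log10(values)

raw = pd.Series([0.0, -5.0, 3.2])          # made-up predictions, incl. non-positive values
print(convert_to_log(raw, 1).tolist())     # [-2.0, -2.0, 0.505...] — no -inf or NaN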