Maria Castellanos committed
Commit 5c22f32 · Parent: d23ae67

fix evaluation

Files changed (4):
  1. about.py +13 -2
  2. app.py +1 -2
  3. evaluate.py +33 -32
  4. utils.py +31 -5

about.py CHANGED

@@ -2,7 +2,7 @@ import os
 from huggingface_hub import HfApi
 
 ENDPOINTS = ["LogD",
-             "KSol",
+             "KSOL",
              "MLM CLint",
              "HLM CLint",
              "Caco-2 Permeability Efflux",
@@ -11,13 +11,24 @@ ENDPOINTS = ["LogD",
              "MBPB",
              "MGMB"]
 
-STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
+STANDARD_COLS = ["Endpoint", "user", "submission_time", "model_report"]
 METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
 # Final columns
 LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
 LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
+# Dictionary with unit conversion multipliers for each endpoint
+multiplier_dict = {"LogD": 1,
+                   "KSOL": 1e-6,
+                   "MLM CLint": 1,
+                   "HLM CLint": 1,
+                   "Caco-2 Permeability Efflux": 1e-6,
+                   "Caco-2 Permeability Papp A>B": 1,
+                   "MPPB": 1,
+                   "MBPB": 1,
+                   "MGMB": 1}
+
 TOKEN = os.environ.get("HF_TOKEN")
 CACHE_PATH=os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)
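
Note: the multipliers are picked up by the evaluation when it moves endpoint values onto a log scale (see convert_to_log in utils.py below): KSOL and Caco-2 efflux values get a 1e-6 unit adjustment before log10, the rest are left unchanged. A minimal sketch of that pairing with made-up values (the unit interpretation here is an assumption, not taken from the code):

import numpy as np
import pandas as pd

multiplier_dict = {"LogD": 1, "KSOL": 1e-6, "Caco-2 Permeability Efflux": 1e-6}

def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
    # mirrors the helper added to utils.py in this commit:
    # clip to 0.01 so zeros stay finite, adjust units, then take log10
    values = np.clip(data, a_min=0.01, a_max=None)
    return np.log10(values * multiplier)

ksol = pd.Series([250.0, 40.0, 0.0])   # hypothetical raw KSOL predictions
print(convert_to_log(ksol, multiplier_dict["KSOL"]))
# 0.0 is clipped to 0.01 before scaling, so the last entry is -8.0 rather than -inf
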
app.py CHANGED

@@ -16,8 +16,7 @@ from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(df_results0):
-    df_results = df_results0.rename(columns={"endpoint": "Endpoint"})
+def build_leaderboard(df_results):
     per_ep = {}
     for ep in ALL_EPS:
         df = df_results[df_results["Endpoint"] == ep].copy()
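
With the in-function rename gone, build_leaderboard now expects the results frame to arrive with an "Endpoint" column already in place (fetch_dataset_df loads it under that name). A rough sketch of the expected input, with hypothetical rows and only one metric column shown:

import pandas as pd

df_results = pd.DataFrame({
    "Endpoint": ["Average", "LogD", "KSOL"],   # already capitalised, no rename needed
    "user": ["alice", "alice", "alice"],
    "mean_MAE": [0.48, 0.41, 0.55],
})
for ep in ["Average", "LogD", "KSOL"]:
    per_ep_df = df_results[df_results["Endpoint"] == ep].copy()
    print(ep, len(per_ep_df))
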
evaluate.py CHANGED

@@ -2,8 +2,14 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
 from typing import Optional
-from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
-from utils import bootstrap_metrics
+from about import (
+    ENDPOINTS, API,
+    submissions_repo,
+    results_repo,
+    test_repo,
+    multiplier_dict,
+)
+from utils import bootstrap_metrics, convert_to_log
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -188,7 +194,7 @@ def evaluate_data(filename: str) -> None:
         test_path = hf_hub_download(
             repo_id=test_repo,
             repo_type="dataset",
-            filename="data/challenge_mock_test_set.csv", #Replace later with "test_dataset.csv",
+            filename="data/expansion_data_test.csv",
         )
     except Exception as e:
         raise gr.Error(f"Failed to download test file: {e}")
@@ -255,12 +261,10 @@ def calculate_metrics(
     # Do some checks
 
     # 1) Check all columns are present
-    if "Molecule Name" in results_dataframe.columns: # Temporary check so old version of results doesn't fail
-        results_dataframe.rename({"Molecule Name": "Name"}, inplace=True)
-    _check_required_columns(results_dataframe, "Results file", ["Name"] + ENDPOINTS)
-    _check_required_columns(test_dataframe, "Test file", ["Name"] + ENDPOINTS)
+    _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
+    _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
     # 2) Check all Molecules in the test set are present in the predictions
-    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Name'], how='left', indicator=True)
+    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
     if not (merged_df['_merge'] == 'both').all():
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
     # TODO: What to do when a molecule is duplicated in the Predictions file?
@@ -269,11 +273,8 @@ def calculate_metrics(
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
     for i, measurement in enumerate(ENDPOINTS):
-        df_pred = results_dataframe[['Name', measurement]].copy()
-        # Only use data with operator "="
-        mask = test_dataframe[f"op_{measurement}"] != '='
-        test_dataframe.loc[mask, measurement] = np.nan
-        df_true = test_dataframe[['Name', measurement]].copy()
+        df_pred = results_dataframe[['Molecule Name', measurement]].copy()
+        df_true = test_dataframe[['Molecule Name', measurement]].copy()
         # coerce numeric columns
         df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
         df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
@@ -287,47 +288,47 @@ def calculate_metrics(
             df_pred.rename(columns={measurement: f"{measurement}_pred"})
             .merge(
                 df_true.rename(columns={measurement: f"{measurement}_true"}),
-                on="Name",
+                on="Molecule Name",
                 how="inner",
             )
             .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
         )
-        n_total = merged[f"{measurement}_true"].notna().sum() # Valid test set points
-        n_pairs = len(merged) # actual pairs with predictions
-        coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
-        merged = merged.sort_values("Name", kind="stable")
+        merged = merged.sort_values("Molecule Name", kind="stable")
+        pred_col = f"{measurement}_pred"
+        true_col = f"{measurement}_true"
 
-        y_pred = merged[f"{measurement}_pred"].to_numpy()
-        y_true = merged[f"{measurement}_true"].to_numpy()
-        # Force log scale for all endpoints except LogD (for outliers)
-        if measurement != "LogD":
-            y_pred = np.log10(y_pred)
-            y_true = np.log10(y_true)
+        if measurement not in ['logD']:
+            # Force log scale for all endpoints except LogD (for outliers)
+            merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
+            merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
 
+        y_pred = merged[pred_col].to_numpy()
+        y_true = merged[true_col].to_numpy()
         # Calculate dataframe with the metrics for 1000 bootstraps
         bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
         df_endpoint = bootstrap_df.pivot_table(
             index=["Endpoint"],
-            columns=final_cols,
+            columns="Metric",
            values="Value",
             aggfunc=["mean", "std"]
         ).reset_index()
         # Get a df with columns 'mean_MAE', 'std_MAE', ...
         df_endpoint.columns = [
-            f'{j}_{i}' if i != '' else j for i, j in df_endpoint.columns
+            f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
         ]
-        df_endpoint.rename(columns={'_Endpoint': 'Endpoint'}, inplace=True)
+        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
         all_endpoint_results.append(df_endpoint)
 
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
-    mean_cols = [f'{m}_mean' for m in final_cols]
-    std_cols = [f'{m}_std' for m in final_cols]
+    mean_cols = [f'mean_{m}' for m in final_cols]
+    std_cols = [f'std_{m}' for m in final_cols]
     # Average results
     macro_means = df_results[mean_cols].mean()
     macro_stds = df_results[std_cols].mean()
-    avg_row = {"endpoint": "Average"}
+    avg_row = {"Endpoint": "Average"}
     avg_row.update(macro_means.to_dict())
     avg_row.update(macro_stds.to_dict())
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
-
-    return df_with_average
+    # Fix order of columns
+    df_with_average = df_with_average[["Endpoint"]+mean_cols+std_cols]
+    return df_with_average
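
The pivot/flatten step is the heart of the fix: with columns="Metric" and two aggregation functions, pandas returns an (aggfunc, metric) MultiIndex that flattens to the mean_*/std_* names the results dataset and leaderboard expect, which is also why mean_cols/std_cols are now built as f'mean_{m}'. A small self-contained sketch of what that step produces, using toy bootstrap values and just two metrics:

import pandas as pd

# Hypothetical bootstrap output: 2 bootstrap samples x 2 metrics for one endpoint
bootstrap_df = pd.DataFrame({
    "Sample":   [0, 0, 1, 1],
    "Endpoint": ["KSOL"] * 4,
    "Metric":   ["MAE", "R2", "MAE", "R2"],
    "Value":    [0.50, 0.71, 0.54, 0.69],
})

df_endpoint = bootstrap_df.pivot_table(
    index=["Endpoint"],
    columns="Metric",            # one column per metric, as in the fixed code
    values="Value",
    aggfunc=["mean", "std"],
).reset_index()

# Flatten the (aggfunc, metric) MultiIndex into 'mean_MAE', 'std_MAE', ...
df_endpoint.columns = [f"{i}_{j}" if i != "" else j for i, j in df_endpoint.columns]
df_endpoint.rename(columns={"Endpoint_": "Endpoint"}, inplace=True)
print(df_endpoint.columns.tolist())
# ['Endpoint', 'mean_MAE', 'mean_R2', 'std_MAE', 'std_R2']
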
utils.py CHANGED

@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from typing import Tuple
-from datasets import load_dataset
+from datasets import load_dataset, Features, Value
 from about import results_repo
 from about import METRICS, STANDARD_COLS
 
@@ -13,7 +13,27 @@ def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
 def fetch_dataset_df():
-    dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
+
+    # Specify feature types to load results dataset
+    metric_features = {
+        f'mean_{m}': Value('float64') for m in METRICS
+    }
+    metric_features.update({
+        f'std_{m}': Value('float64') for m in METRICS
+    })
+    other_features = {
+        'user': Value('string'),
+        'Endpoint': Value('string'),
+        'submission_time': Value('string'),
+        'model_report': Value('string'),
+        'anonymous': Value('bool'),
+    }
+    feature_schema = Features(metric_features | other_features)
+
+    dset = load_dataset(results_repo,
+                        split='train',
+                        features=feature_schema,
+                        download_mode="force_redownload")
     full_df = dset.to_pandas()
     expected_mean_cols = [f"mean_{col}" for col in METRICS]
     expected_std_cols = [f"std_{col}" for col in METRICS]
@@ -30,13 +50,19 @@ def fetch_dataset_df():
     # Get the most recent submission per user & endpoint
     latest = (
         df.sort_values("submission_time")
-        .drop_duplicates(subset=["endpoint", "user"], keep="last")
-        .sort_values(["endpoint", "user"])
+        .drop_duplicates(subset=["Endpoint", "user"], keep="last")
+        .sort_values(["Endpoint", "user"])
         .reset_index(drop=True)
     )
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
+def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
+    # Add 0.01 to avoid inf
+    values = np.clip(data, a_min=0.01, a_max=None)
+    values = values * multiplier # Adjust units
+    return np.log10(values)
+
 def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
     """
     Generate bootstrap samples for a given size and number of samples.
@@ -111,7 +137,7 @@ def bootstrap_metrics(pred: np.ndarray,
        Dataframe with estimated metric per bootstrap sample for the given endpoint
     """
     cols = ["Sample", "Endpoint", "Metric", "Value"]
-    bootstrap_results = pd.DataFrame(columns=cols, dtype=[int, str, str, float])
+    bootstrap_results = pd.DataFrame(columns=cols)
     for i, indx in enumerate(
         bootstrap_sampling(true.shape[0], n_bootstrap_samples)
     ):
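
One plausible reason for pinning the Features schema (beyond the comment in the diff) is that metric columns stay float64 even when a results file carries only missing values, rather than being inferred as null/object. A minimal sketch of that behaviour using a local toy dataset instead of the real results_repo (column set trimmed, data invented):

from datasets import Dataset, Features, Value

feature_schema = Features({
    "Endpoint": Value("string"),
    "user": Value("string"),
    "mean_MAE": Value("float64"),
    "std_MAE": Value("float64"),
})

dset = Dataset.from_dict(
    {"Endpoint": ["KSOL"], "user": ["someone"], "mean_MAE": [None], "std_MAE": [None]},
    features=feature_schema,
)
print(dset.to_pandas().dtypes)
# mean_MAE / std_MAE come back as float64 (NaN), not object, so downstream
# numeric-column checks and leaderboard sorting keep working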