Maria Castellanos committed
Commit 5c22f32 · Parent: d23ae67

fix evaluation

Files changed (4):
  1. about.py +13 -2
  2. app.py +1 -2
  3. evaluate.py +33 -32
  4. utils.py +31 -5

about.py CHANGED

@@ -2,7 +2,7 @@ import os
 from huggingface_hub import HfApi
 
 ENDPOINTS = ["LogD",
-             "KSol",
+             "KSOL",
              "MLM CLint",
              "HLM CLint",
              "Caco-2 Permeability Efflux",
@@ -11,13 +11,24 @@ ENDPOINTS = ["LogD",
              "MBPB",
              "MGMB"]
 
-STANDARD_COLS = ["endpoint", "user", "submission_time", "model_report"]
+STANDARD_COLS = ["Endpoint", "user", "submission_time", "model_report"]
 METRICS = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
 # Final columns
 LB_COLS = ["user", "MAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"]
 LB_AVG = ["user", "MA-RAE", "R2", "Spearman R", "Kendall's Tau", "submission time", "model details"] # Delete some columns for overall LB?
 LB_DTYPES = ['markdown', 'number', 'number', 'number', 'number', 'str', 'markdown', 'number']
 
+# Dictionary with unit conversion multipliers for each endpoint
+multiplier_dict = {"LogD": 1,
+                   "KSOL": 1e-6,
+                   "MLM CLint": 1,
+                   "HLM CLint": 1,
+                   "Caco-2 Permeability Efflux": 1e-6,
+                   "Caco-2 Permeability Papp A>B": 1,
+                   "MPPB": 1,
+                   "MBPB": 1,
+                   "MGMB": 1}
+
 TOKEN = os.environ.get("HF_TOKEN")
 CACHE_PATH=os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)
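
Note: the multipliers are picked up by the evaluation when it moves endpoint values onto a log scale (see convert_to_log in utils.py below): KSOL and Caco-2 efflux values get a 1e-6 unit adjustment before log10, the rest are left unchanged. A minimal sketch of that pairing with made-up values (the unit interpretation here is an assumption, not taken from the code):

import numpy as np
import pandas as pd

multiplier_dict = {"LogD": 1, "KSOL": 1e-6, "Caco-2 Permeability Efflux": 1e-6}

def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
    # mirrors the helper added to utils.py in this commit:
    # clip to 0.01 so zeros stay finite, adjust units, then take log10
    values = np.clip(data, a_min=0.01, a_max=None)
    return np.log10(values * multiplier)

ksol = pd.Series([250.0, 40.0, 0.0])   # hypothetical raw KSOL predictions
print(convert_to_log(ksol, multiplier_dict["KSOL"]))
# 0.0 is clipped to 0.01 before scaling, so the last entry is -8.0 rather than -inf
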
app.py CHANGED

@@ -16,8 +16,7 @@ from about import ENDPOINTS, LB_COLS, LB_AVG, LB_DTYPES
 
 ALL_EPS = ['Average'] + ENDPOINTS
 
-def build_leaderboard(df_results0):
-    df_results = df_results0.rename(columns={"endpoint": "Endpoint"})
+def build_leaderboard(df_results):
     per_ep = {}
     for ep in ALL_EPS:
         df = df_results[df_results["Endpoint"] == ep].copy()
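
With the in-function rename gone, build_leaderboard now expects the results frame to arrive with an "Endpoint" column already in place (fetch_dataset_df loads it under that name). A rough sketch of the expected input, with hypothetical rows and only one metric column shown:

import pandas as pd

df_results = pd.DataFrame({
    "Endpoint": ["Average", "LogD", "KSOL"],   # already capitalised, no rename needed
    "user": ["alice", "alice", "alice"],
    "mean_MAE": [0.48, 0.41, 0.55],
})
for ep in ["Average", "LogD", "KSOL"]:
    per_ep_df = df_results[df_results["Endpoint"] == ep].copy()
    print(ep, len(per_ep_df))
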
evaluate.py CHANGED

@@ -2,8 +2,14 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
 from typing import Optional
-from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo
-from utils import bootstrap_metrics
+from about import (
+    ENDPOINTS, API,
+    submissions_repo,
+    results_repo,
+    test_repo,
+    multiplier_dict,
+)
+from utils import bootstrap_metrics, convert_to_log
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -188,7 +194,7 @@ def evaluate_data(filename: str) -> None:
         test_path = hf_hub_download(
             repo_id=test_repo,
             repo_type="dataset",
-            filename="data/challenge_mock_test_set.csv", #Replace later with "test_dataset.csv",
+            filename="data/expansion_data_test.csv",
         )
     except Exception as e:
         raise gr.Error(f"Failed to download test file: {e}")
@@ -255,12 +261,10 @@ def calculate_metrics(
     # Do some checks
 
     # 1) Check all columns are present
-    if "Molecule Name" in results_dataframe.columns: # Temporary check so old version of results doesn't fail
-        results_dataframe.rename({"Molecule Name": "Name"}, inplace=True)
-    _check_required_columns(results_dataframe, "Results file", ["Name"] + ENDPOINTS)
-    _check_required_columns(test_dataframe, "Test file", ["Name"] + ENDPOINTS)
+    _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
+    _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
     # 2) Check all Molecules in the test set are present in the predictions
-    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Name'], how='left', indicator=True)
+    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
     if not (merged_df['_merge'] == 'both').all():
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
     # TODO: What to do when a molecule is duplicated in the Predictions file?
@@ -269,11 +273,8 @@ def calculate_metrics(
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
     for i, measurement in enumerate(ENDPOINTS):
-        df_pred = results_dataframe[['Name', measurement]].copy()
-        # Only use data with operator "="
-        mask = test_dataframe[f"op_{measurement}"] != '='
-        test_dataframe.loc[mask, measurement] = np.nan
-        df_true = test_dataframe[['Name', measurement]].copy()
+        df_pred = results_dataframe[['Molecule Name', measurement]].copy()
+        df_true = test_dataframe[['Molecule Name', measurement]].copy()
         # coerce numeric columns
         df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
         df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
@@ -287,47 +288,47 @@ def calculate_metrics(
             df_pred.rename(columns={measurement: f"{measurement}_pred"})
             .merge(
                 df_true.rename(columns={measurement: f"{measurement}_true"}),
-                on="Name",
+                on="Molecule Name",
                 how="inner",
             )
             .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
         )
-        n_total = merged[f"{measurement}_true"].notna().sum() # Valid test set points
-        n_pairs = len(merged) # actual pairs with predictions
-        coverage = (n_pairs / n_total * 100.0) if n_total else 0.0
-        merged = merged.sort_values("Name", kind="stable")
+        merged = merged.sort_values("Molecule Name", kind="stable")
+        pred_col = f"{measurement}_pred"
+        true_col = f"{measurement}_true"
 
-        y_pred = merged[f"{measurement}_pred"].to_numpy()
-        y_true = merged[f"{measurement}_true"].to_numpy()
-        # Force log scale for all endpoints except LogD (for outliers)
-        if measurement != "LogD":
-            y_pred = np.log10(y_pred)
-            y_true = np.log10(y_true)
+        if measurement not in ['logD']:
+            # Force log scale for all endpoints except LogD (for outliers)
+            merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
+            merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
 
+        y_pred = merged[pred_col].to_numpy()
+        y_true = merged[true_col].to_numpy()
         # Calculate dataframe with the metrics for 1000 bootstraps
         bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
         df_endpoint = bootstrap_df.pivot_table(
             index=["Endpoint"],
-            columns=final_cols,
+            columns="Metric",
            values="Value",
             aggfunc=["mean", "std"]
         ).reset_index()
         # Get a df with columns 'mean_MAE', 'std_MAE', ...
         df_endpoint.columns = [
-            f'{j}_{i}' if i != '' else j for i, j in df_endpoint.columns
+            f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
         ]
-        df_endpoint.rename(columns={'_Endpoint': 'Endpoint'}, inplace=True)
+        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
         all_endpoint_results.append(df_endpoint)
 
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
-    mean_cols = [f'{m}_mean' for m in final_cols]
-    std_cols = [f'{m}_std' for m in final_cols]
+    mean_cols = [f'mean_{m}' for m in final_cols]
+    std_cols = [f'std_{m}' for m in final_cols]
     # Average results
     macro_means = df_results[mean_cols].mean()
     macro_stds = df_results[std_cols].mean()
-    avg_row = {"endpoint": "Average"}
+    avg_row = {"Endpoint": "Average"}
     avg_row.update(macro_means.to_dict())
     avg_row.update(macro_stds.to_dict())
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
-
-    return df_with_average
+    # Fix order of columns
+    df_with_average = df_with_average[["Endpoint"]+mean_cols+std_cols]
+    return df_with_average
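
The pivot/flatten step is the heart of the fix: with columns="Metric" and two aggregation functions, pandas returns an (aggfunc, metric) MultiIndex that flattens to the mean_*/std_* names the results dataset and leaderboard expect, which is also why mean_cols/std_cols are now built as f'mean_{m}'. A small self-contained sketch of what that step produces, using toy bootstrap values and just two metrics:

import pandas as pd

# Hypothetical bootstrap output: 2 bootstrap samples x 2 metrics for one endpoint
bootstrap_df = pd.DataFrame({
    "Sample":   [0, 0, 1, 1],
    "Endpoint": ["KSOL"] * 4,
    "Metric":   ["MAE", "R2", "MAE", "R2"],
    "Value":    [0.50, 0.71, 0.54, 0.69],
})

df_endpoint = bootstrap_df.pivot_table(
    index=["Endpoint"],
    columns="Metric",            # one column per metric, as in the fixed code
    values="Value",
    aggfunc=["mean", "std"],
).reset_index()

# Flatten the (aggfunc, metric) MultiIndex into 'mean_MAE', 'std_MAE', ...
df_endpoint.columns = [f"{i}_{j}" if i != "" else j for i, j in df_endpoint.columns]
df_endpoint.rename(columns={"Endpoint_": "Endpoint"}, inplace=True)
print(df_endpoint.columns.tolist())
# ['Endpoint', 'mean_MAE', 'mean_R2', 'std_MAE', 'std_R2']
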
utils.py CHANGED

@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from typing import Tuple
-from datasets import load_dataset
+from datasets import load_dataset, Features, Value
 from about import results_repo
 from about import METRICS, STANDARD_COLS
 
@@ -13,7 +13,27 @@ def make_tag_clickable(tag: str):
     return f'<a target="_blank" href="{tag}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
 
 def fetch_dataset_df():
-    dset = load_dataset(results_repo, split='train', download_mode="force_redownload")
+
+    # Specify feature types to load results dataset
+    metric_features = {
+        f'mean_{m}': Value('float64') for m in METRICS
+    }
+    metric_features.update({
+        f'std_{m}': Value('float64') for m in METRICS
+    })
+    other_features = {
+        'user': Value('string'),
+        'Endpoint': Value('string'),
+        'submission_time': Value('string'),
+        'model_report': Value('string'),
+        'anonymous': Value('bool'),
+    }
+    feature_schema = Features(metric_features | other_features)
+
+    dset = load_dataset(results_repo,
+                        split='train',
+                        features=feature_schema,
+                        download_mode="force_redownload")
     full_df = dset.to_pandas()
     expected_mean_cols = [f"mean_{col}" for col in METRICS]
     expected_std_cols = [f"std_{col}" for col in METRICS]
@@ -30,13 +50,19 @@ def fetch_dataset_df():
     # Get the most recent submission per user & endpoint
     latest = (
         df.sort_values("submission_time")
-        .drop_duplicates(subset=["endpoint", "user"], keep="last")
-        .sort_values(["endpoint", "user"])
+        .drop_duplicates(subset=["Endpoint", "user"], keep="last")
+        .sort_values(["Endpoint", "user"])
         .reset_index(drop=True)
     )
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
+def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
+    # Add 0.01 to avoid inf
+    values = np.clip(data, a_min=0.01, a_max=None)
+    values = values * multiplier # Adjust units
+    return np.log10(values)
+
 def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
     """
     Generate bootstrap samples for a given size and number of samples.
@@ -111,7 +137,7 @@ def bootstrap_metrics(pred: np.ndarray,
        Dataframe with estimated metric per bootstrap sample for the given endpoint
     """
     cols = ["Sample", "Endpoint", "Metric", "Value"]
-    bootstrap_results = pd.DataFrame(columns=cols, dtype=[int, str, str, float])
+    bootstrap_results = pd.DataFrame(columns=cols)
     for i, indx in enumerate(
         bootstrap_sampling(true.shape[0], n_bootstrap_samples)
     ):
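
One plausible reason for pinning the Features schema (beyond the comment in the diff) is that metric columns stay float64 even when a results file carries only missing values, rather than being inferred as null/object. A minimal sketch of that behaviour using a local toy dataset instead of the real results_repo (column set trimmed, data invented):

from datasets import Dataset, Features, Value

feature_schema = Features({
    "Endpoint": Value("string"),
    "user": Value("string"),
    "mean_MAE": Value("float64"),
    "std_MAE": Value("float64"),
})

dset = Dataset.from_dict(
    {"Endpoint": ["KSOL"], "user": ["someone"], "mean_MAE": [None], "std_MAE": [None]},
    features=feature_schema,
)
print(dset.to_pandas().dtypes)
# mean_MAE / std_MAE come back as float64 (NaN), not object, so downstream
# numeric-column checks and leaderboard sorting keep working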