Maria Castellanos committed
Commit · 179f265
1 Parent(s): d1f7806

Create raw csv file along clean

Browse files:
- evaluate.py +46 -4
- requirements.txt +2 -1
- utils.py +1 -0
evaluate.py
CHANGED

@@ -223,7 +223,7 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     data_df = pd.read_csv(local_path)
     test_df = pd.read_csv(test_path)
     try:
-        results_df = calculate_metrics(data_df, test_df)
+        results_df, results_raw_df = calculate_metrics(data_df, test_df)
         if not isinstance(results_df, pd.DataFrame) or results_df.empty:
             raise gr.Error("Evaluation produced no results.")
     except Exception as e:
@@ -256,6 +256,13 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     results_df['model_report'] = report
     results_df['anonymous'] = meta.participant.anonymous
     results_df['hf_username'] = username
+
+    results_raw_df['user'] = display_name
+    results_raw_df['submission_time'] = timestamp
+    results_raw_df['model_report'] = report
+    results_raw_df['anonymous'] = meta.participant.anonymous
+    results_raw_df['hf_username'] = username
+
     safe_user = _unsafify_username(username)
     destination_path = f"results/{safe_user}_{timestamp}_results.csv"
     tmp_name = None
@@ -273,7 +280,22 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     )
     Path(tmp_name).unlink()

-
+    # Same for raw file
+    destination_path_raw = f"results/{safe_user}_{timestamp}_results_raw.csv"
+    tmp_name = None
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
+        results_raw_df.to_csv(tmp, index=False)
+        tmp.flush()
+        tmp_name = tmp.name
+
+    API.upload_file(
+        path_or_fileobj=tmp_name,
+        path_in_repo=destination_path_raw,
+        repo_id=results_repo,
+        repo_type="dataset",
+        commit_message=f"Add raw result data for {username}"
+    )
+    Path(tmp_name).unlink()

 def calculate_metrics(
     results_dataframe: pd.DataFrame,
@@ -310,6 +332,7 @@ def calculate_metrics(

     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
+    all_endpoint_results_raw = []

     for ept in ENDPOINTS:
         pred_col = f"{ept}_pred"
@@ -351,10 +374,22 @@ def calculate_metrics(
         df_endpoint = df_reindexed.reset_index()
         all_endpoint_results.append(df_endpoint)

+        # Also save a raw dataframe with all the bootstrapping samples
+        df_endpoint_raw = bootstrap_df.pivot_table(
+            index=["Sample", "Endpoint"],
+            columns="Metric",
+            values="Value"
+        ).reset_index()
+        df_endpoint_raw.columns.name = None
+        df_endpoint_raw['Sample'] = df_endpoint_raw['Sample'].astype(int)
+        all_endpoint_results_raw.append(df_endpoint_raw)
+
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
+    df_results_raw = pd.concat(all_endpoint_results_raw, ignore_index=True)
+
+    # Average results
     mean_cols = [f'mean_{m}' for m in final_cols]
     std_cols = [f'std_{m}' for m in final_cols]
-    # Average results
     macro_means = df_results[mean_cols].mean()
     macro_stds = df_results[std_cols].mean()
     avg_row = {"Endpoint": "Average"}
@@ -363,4 +398,11 @@ def calculate_metrics(
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
     # Fix order of columns
     df_with_average = df_with_average[["Endpoint"]+mean_cols+std_cols]
-
+
+    # Average results for raw dataframe
+    macro_results_by_sample = df_results_raw.groupby('Sample')[final_cols].mean().reset_index()
+    macro_results_by_sample["Endpoint"] = "Average"
+    df_raw_with_average = pd.concat([df_results_raw, macro_results_by_sample], ignore_index=True)
+    df_raw_with_average = df_raw_with_average[["Sample","Endpoint"] + final_cols]
+
+    return df_with_average, df_raw_with_average
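For context on the reshape in the new raw-results code: the `pivot_table` call turns the long-format bootstrap records (one row per sample, endpoint and metric) into one wide row per bootstrap sample, which is what ends up in the `*_results_raw.csv` files. A minimal sketch of that step, assuming `bootstrap_df` carries `Sample`, `Endpoint`, `Metric` and `Value` columns as the call implies, with a hypothetical endpoint name and made-up values:

import pandas as pd

# Long-format bootstrap results: one row per (Sample, Endpoint, Metric).
bootstrap_df = pd.DataFrame({
    "Sample":   [0, 0, 1, 1],
    "Endpoint": ["LogD", "LogD", "LogD", "LogD"],   # hypothetical endpoint
    "Metric":   ["MAE", "R2", "MAE", "R2"],
    "Value":    [0.52, 0.71, 0.55, 0.68],
})

# Same reshape as in calculate_metrics: one row per bootstrap sample,
# one column per metric.
df_endpoint_raw = bootstrap_df.pivot_table(
    index=["Sample", "Endpoint"],
    columns="Metric",
    values="Value"
).reset_index()
df_endpoint_raw.columns.name = None               # drop the leftover "Metric" axis name
df_endpoint_raw["Sample"] = df_endpoint_raw["Sample"].astype(int)

print(df_endpoint_raw)
#    Sample Endpoint   MAE    R2
# 0       0     LogD  0.52  0.71
# 1       1     LogD  0.55  0.68

Note that `pivot_table` aggregates duplicate entries with the mean by default, so with exactly one value per (Sample, Endpoint, Metric) triple the values pass through unchanged.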
requirements.txt
CHANGED

@@ -6,4 +6,5 @@ plotly
 scipy
 scikit-learn
 loguru
-statsmodels
+statsmodels
+tqdm
utils.py
CHANGED

@@ -35,6 +35,7 @@ def fetch_dataset_df():
     feature_schema = Features(metric_features | other_features)

     dset = load_dataset(results_repo_validation, # change to results_repo_test for test set
+                        name='default',
                         split='train',
                         features=feature_schema,
                         download_mode="force_redownload")
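One way to sanity-check the new uploads is to pull a raw file back down from the results dataset repo and look at the per-sample metrics. A minimal sketch using `huggingface_hub.hf_hub_download`; the repo id and filename below are placeholders that only follow the `results/{safe_user}_{timestamp}_results_raw.csv` pattern from the diff:

import pandas as pd
from huggingface_hub import hf_hub_download

# Placeholder repo id and filename -- substitute the actual results_repo
# and an existing raw results file.
local_path = hf_hub_download(
    repo_id="some-org/leaderboard-results",
    filename="results/some_user_20250101-000000_results_raw.csv",
    repo_type="dataset",
)

raw_df = pd.read_csv(local_path)
# One row per bootstrap sample and endpoint, plus the "Average" rows.
print(raw_df.groupby("Endpoint")[["MAE", "RAE", "R2"]].mean())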