Maria Castellanos committed
Commit 179f265 · Parent: d1f7806

Create raw CSV file alongside the clean one

Files changed (3)
  1. evaluate.py +46 -4
  2. requirements.txt +2 -1
  3. utils.py +1 -0
evaluate.py CHANGED
@@ -223,7 +223,7 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     data_df = pd.read_csv(local_path)
     test_df = pd.read_csv(test_path)
     try:
-        results_df = calculate_metrics(data_df, test_df)
+        results_df, results_raw_df = calculate_metrics(data_df, test_df)
         if not isinstance(results_df, pd.DataFrame) or results_df.empty:
             raise gr.Error("Evaluation produced no results.")
     except Exception as e:
@@ -256,6 +256,13 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     results_df['model_report'] = report
     results_df['anonymous'] = meta.participant.anonymous
     results_df['hf_username'] = username
+
+    results_raw_df['user'] = display_name
+    results_raw_df['submission_time'] = timestamp
+    results_raw_df['model_report'] = report
+    results_raw_df['anonymous'] = meta.participant.anonymous
+    results_raw_df['hf_username'] = username
+
     safe_user = _unsafify_username(username)
     destination_path = f"results/{safe_user}_{timestamp}_results.csv"
     tmp_name = None
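Note: the same metadata columns are stamped onto both frames (results_df receives its 'user' and 'submission_time' columns just above this hunk). If the list grows, a small loop could keep the two in sync. A hypothetical refactor, not part of the commit:

    # Hypothetical: write the shared submission metadata to both frames at once.
    for df in (results_df, results_raw_df):
        df['user'] = display_name
        df['submission_time'] = timestamp
        df['model_report'] = report
        df['anonymous'] = meta.participant.anonymous
        df['hf_username'] = username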
@@ -273,7 +280,22 @@ def _evaluate_data(filename: str, test_repo: str, split_filename: str, results_r
     )
     Path(tmp_name).unlink()
 
-
+    # Same for raw file
+    destination_path_raw = f"results/{safe_user}_{timestamp}_results_raw.csv"
+    tmp_name = None
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
+        results_raw_df.to_csv(tmp, index=False)
+        tmp.flush()
+        tmp_name = tmp.name
+
+    API.upload_file(
+        path_or_fileobj=tmp_name,
+        path_in_repo=destination_path_raw,
+        repo_id=results_repo,
+        repo_type="dataset",
+        commit_message=f"Add raw result data for {username}"
+    )
+    Path(tmp_name).unlink()
 
 def calculate_metrics(
     results_dataframe: pd.DataFrame,
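Worth noting: huggingface_hub's HfApi.upload_file also accepts bytes or a file-like object as path_or_fileobj, so the temporary-file dance could plausibly be dropped. A sketch of that alternative, assuming API is the HfApi instance used elsewhere in this file:

    import io

    # Alternative sketch: upload the raw CSV straight from memory, no temp file.
    buf = io.BytesIO(results_raw_df.to_csv(index=False).encode("utf-8"))
    API.upload_file(
        path_or_fileobj=buf,
        path_in_repo=destination_path_raw,
        repo_id=results_repo,
        repo_type="dataset",
        commit_message=f"Add raw result data for {username}",
    )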
@@ -310,6 +332,7 @@ def calculate_metrics(
 
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
+    all_endpoint_results_raw = []
 
     for ept in ENDPOINTS:
         pred_col = f"{ept}_pred"
@@ -351,10 +374,22 @@ def calculate_metrics(
         df_endpoint = df_reindexed.reset_index()
         all_endpoint_results.append(df_endpoint)
 
+        # Also save a raw dataframe with all the bootstrapping samples
+        df_endpoint_raw = bootstrap_df.pivot_table(
+            index=["Sample", "Endpoint"],
+            columns="Metric",
+            values="Value"
+        ).reset_index()
+        df_endpoint_raw.columns.name = None
+        df_endpoint_raw['Sample'] = df_endpoint_raw['Sample'].astype(int)
+        all_endpoint_results_raw.append(df_endpoint_raw)
+
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
+    df_results_raw = pd.concat(all_endpoint_results_raw, ignore_index=True)
+
+    # Average results
     mean_cols = [f'mean_{m}' for m in final_cols]
     std_cols = [f'std_{m}' for m in final_cols]
-    # Average results
     macro_means = df_results[mean_cols].mean()
     macro_stds = df_results[std_cols].mean()
     avg_row = {"Endpoint": "Average"}
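For reference, the pivot_table call reshapes the long bootstrap table (one row per Sample/Endpoint/Metric triple) into one row per bootstrap sample with a column per metric. A self-contained illustration with made-up values (bootstrap_df's schema is inferred from the diff; the endpoint name is invented):

    import pandas as pd

    # Toy long-format bootstrap table: two samples x two metrics, one endpoint.
    bootstrap_df = pd.DataFrame({
        "Sample":   [0, 0, 1, 1],
        "Endpoint": ["LogD", "LogD", "LogD", "LogD"],
        "Metric":   ["MAE", "R2", "MAE", "R2"],
        "Value":    [0.52, 0.71, 0.48, 0.74],
    })
    wide = bootstrap_df.pivot_table(
        index=["Sample", "Endpoint"], columns="Metric", values="Value"
    ).reset_index()
    wide.columns.name = None
    print(wide)
    #    Sample Endpoint   MAE    R2
    # 0       0     LogD  0.52  0.71
    # 1       1     LogD  0.48  0.74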
@@ -363,4 +398,11 @@ def calculate_metrics(
     df_with_average = pd.concat([df_results, pd.DataFrame([avg_row])], ignore_index=True)
     # Fix order of columns
     df_with_average = df_with_average[["Endpoint"]+mean_cols+std_cols]
-    return df_with_average
+
+    # Average results for raw dataframe
+    macro_results_by_sample = df_results_raw.groupby('Sample')[final_cols].mean().reset_index()
+    macro_results_by_sample["Endpoint"] = "Average"
+    df_raw_with_average = pd.concat([df_results_raw, macro_results_by_sample], ignore_index=True)
+    df_raw_with_average = df_raw_with_average[["Sample","Endpoint"] + final_cols]
+
+    return df_with_average, df_raw_with_average
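Unlike the aggregated frame, the raw frame keeps one macro-average row per bootstrap sample, which is what makes empirical confidence intervals possible downstream. A hypothetical consumer, using the column names returned above:

    # 95% percentile interval of the macro-averaged MAE across bootstrap samples.
    avg = df_raw_with_average[df_raw_with_average["Endpoint"] == "Average"]
    lo, hi = avg["MAE"].quantile([0.025, 0.975])
    print(f"MAE 95% CI: [{lo:.3f}, {hi:.3f}]")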
 
 
 
 
 
 
 
 
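End-to-end, each submission now produces two artifacts in the results repo: the aggregated {safe_user}_{timestamp}_results.csv and the per-sample {safe_user}_{timestamp}_results_raw.csv. A minimal sketch of reading a raw file back (repo id and filename are placeholders; real values follow the destination_path_raw pattern above):

    import pandas as pd
    from huggingface_hub import hf_hub_download

    local = hf_hub_download(
        repo_id="org/leaderboard-results",          # placeholder
        filename="results/some_user_20240101_results_raw.csv",  # placeholder
        repo_type="dataset",
    )
    raw_df = pd.read_csv(local)
    print(raw_df.groupby("Endpoint")["MAE"].std())  # bootstrap spread per endpoint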
requirements.txt CHANGED
@@ -6,4 +6,5 @@ plotly
 scipy
 scikit-learn
 loguru
-statsmodels
+statsmodels
+tqdm
utils.py CHANGED
@@ -35,6 +35,7 @@ def fetch_dataset_df():
     feature_schema = Features(metric_features | other_features)
 
     dset = load_dataset(results_repo_validation, # change to results_repo_test for test set
+                        name='default',
                         split='train',
                         features=feature_schema,
                         download_mode="force_redownload")
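Passing name='default' makes the config selection explicit, presumably because the results repo now holds two flavors of CSV after this commit. If the *_results_raw.csv files ever need to be excluded outright when loading, filtering by file pattern is another option the datasets library supports. A sketch under that assumption (the glob pattern is illustrative):

    from datasets import load_dataset

    # Hypothetical alternative: only pick up the aggregated result files,
    # leaving the *_results_raw.csv bootstrap dumps out of the table.
    dset = load_dataset(
        results_repo_validation,
        data_files="results/*_results.csv",
        split="train",
    )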