lewtun (HF staff) committed

Commit ac54d65 • 2 parents: 74760ef d658c8a

Merge pull request #40 from huggingface/refactor-model-filter

Files changed (2):
  1. app.py +106 -105
  2. evaluation.py +13 -14
app.py CHANGED
@@ -436,17 +436,6 @@ with st.form(key="form"):
     )
     print("INFO -- Selected models before filter:", selected_models)

-    if len(selected_models) > 0:
-        selected_models = filter_evaluated_models(
-            selected_models,
-            selected_task,
-            selected_dataset,
-            selected_config,
-            selected_split,
-            selected_metrics,
-        )
-        print("INFO -- Selected models after filter:", selected_models)
-
     hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")

     submit_button = st.form_submit_button("Evaluate models 🚀")
@@ -454,106 +443,118 @@ with st.form(key="form"):
     if submit_button:
         if len(hf_username) == 0:
             st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
+        elif len(selected_models) == 0:
+            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
         elif len(selected_models) > 10:
-            st.warning("Only 10 models can be evaluated at once. Please select fewer models to evaluate.")
-        elif len(selected_models) > 0 and len(selected_models) <= 10:
-            project_id = str(uuid.uuid4())[:8]
-            project_payload = {
-                "username": AUTOTRAIN_USERNAME,
-                "proj_name": f"eval-project-{project_id}",
-                "task": TASK_TO_ID[selected_task],
-                "config": {
-                    "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
-                    if selected_task in AUTOTRAIN_TASK_TO_LANG
-                    else "en",
-                    "max_models": 5,
-                    "instance": {
-                        "provider": "aws",
-                        "instance_type": "ml.g4dn.4xlarge",
-                        "max_runtime_seconds": 172800,
-                        "num_instances": 1,
-                        "disk_size_gb": 150,
+            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
+        else:
+            # Filter out previously evaluated models
+            selected_models = filter_evaluated_models(
+                selected_models,
+                selected_task,
+                selected_dataset,
+                selected_config,
+                selected_split,
+                selected_metrics,
+            )
+            print("INFO -- Selected models after filter:", selected_models)
+            if len(selected_models) > 0:
+                project_id = str(uuid.uuid4())[:8]
+                project_payload = {
+                    "username": AUTOTRAIN_USERNAME,
+                    "proj_name": f"eval-project-{project_id}",
+                    "task": TASK_TO_ID[selected_task],
+                    "config": {
+                        "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
+                        if selected_task in AUTOTRAIN_TASK_TO_LANG
+                        else "en",
+                        "max_models": 5,
+                        "instance": {
+                            "provider": "aws",
+                            "instance_type": "ml.g4dn.4xlarge",
+                            "max_runtime_seconds": 172800,
+                            "num_instances": 1,
+                            "disk_size_gb": 150,
+                        },
+                        "evaluation": {
+                            "metrics": selected_metrics,
+                            "models": selected_models,
+                            "hf_username": hf_username,
+                        },
                     },
-                    "evaluation": {"metrics": selected_metrics, "models": selected_models, "hf_username": hf_username},
-                },
-            }
-            print(f"INFO -- Payload: {project_payload}")
-            project_json_resp = http_post(
-                path="/projects/create",
-                payload=project_payload,
-                token=HF_TOKEN,
-                domain=AUTOTRAIN_BACKEND_API,
-            ).json()
-            print(f"INFO -- Project creation response: {project_json_resp}")
-
-            if project_json_resp["created"]:
-                data_payload = {
-                    "split": 4,  # use "auto" split choice in AutoTrain
-                    "col_mapping": col_mapping,
-                    "load_config": {"max_size_bytes": 0, "shuffle": False},
                 }
-                data_json_resp = http_post(
-                    path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
-                    payload=data_payload,
+                print(f"INFO -- Payload: {project_payload}")
+                project_json_resp = http_post(
+                    path="/projects/create",
+                    payload=project_payload,
                     token=HF_TOKEN,
                     domain=AUTOTRAIN_BACKEND_API,
-                    params={
-                        "type": "dataset",
-                        "config_name": selected_config,
-                        "split_name": selected_split,
-                    },
                 ).json()
-                print(f"INFO -- Dataset creation response: {data_json_resp}")
-                if data_json_resp["download_status"] == 1:
-                    train_json_resp = http_get(
-                        path=f"/projects/{project_json_resp['id']}/data/start_process",
+                print(f"INFO -- Project creation response: {project_json_resp}")
+
+                if project_json_resp["created"]:
+                    data_payload = {
+                        "split": 4,  # use "auto" split choice in AutoTrain
+                        "col_mapping": col_mapping,
+                        "load_config": {"max_size_bytes": 0, "shuffle": False},
+                    }
+                    data_json_resp = http_post(
+                        path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
+                        payload=data_payload,
                         token=HF_TOKEN,
                         domain=AUTOTRAIN_BACKEND_API,
+                        params={
+                            "type": "dataset",
+                            "config_name": selected_config,
+                            "split_name": selected_split,
+                        },
                     ).json()
-                    print(f"INFO -- AutoTrain job response: {train_json_resp}")
-                    if train_json_resp["success"]:
-                        train_eval_index = {
-                            "train-eval-index": [
-                                {
-                                    "config": selected_config,
-                                    "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
-                                    "task_id": selected_task,
-                                    "splits": {"eval_split": selected_split},
-                                    "col_mapping": col_mapping,
-                                }
-                            ]
-                        }
-                        selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
-                        dataset_card_url = get_dataset_card_url(selected_dataset)
-                        st.success("✅ Successfully submitted evaluation job!")
-                        st.markdown(
-                            f"""
-                            Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
-
-                            * 🔔 A \
-                            [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)\
-                            with the evaluation results will be opened for each model you selected. \
-                            Check your email for notifications.
-                            * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) \
-                            to view the results from your submission once the Hub pull request is merged.
-                            * 🥱 Tired of configuring evaluations? Add the following metadata to the \
-                            [dataset card]({dataset_card_url}) to enable 1-click evaluations:
-                            """
-                        )
-                        st.markdown(
-                            f"""
-                            ```yaml
-                            {selected_metadata}
-                            """
-                        )
-                        print("INFO -- Pushing evaluation job logs to the Hub")
-                        evaluation_log = {}
-                        evaluation_log["payload"] = project_payload
-                        evaluation_log["project_creation_response"] = project_json_resp
-                        evaluation_log["dataset_creation_response"] = data_json_resp
-                        evaluation_log["autotrain_job_response"] = train_json_resp
-                        commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
-                    else:
-                        st.error("🙈 Oh no, there was an error submitting your evaluation job!")
-        else:
-            st.warning("⚠️ No models were selected for evaluation!")
+                    print(f"INFO -- Dataset creation response: {data_json_resp}")
+                    if data_json_resp["download_status"] == 1:
+                        train_json_resp = http_get(
+                            path=f"/projects/{project_json_resp['id']}/data/start_process",
+                            token=HF_TOKEN,
+                            domain=AUTOTRAIN_BACKEND_API,
+                        ).json()
+                        print(f"INFO -- AutoTrain job response: {train_json_resp}")
+                        if train_json_resp["success"]:
+                            train_eval_index = {
+                                "train-eval-index": [
+                                    {
+                                        "config": selected_config,
+                                        "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
+                                        "task_id": selected_task,
+                                        "splits": {"eval_split": selected_split},
+                                        "col_mapping": col_mapping,
+                                    }
+                                ]
+                            }
+                            selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
+                            dataset_card_url = get_dataset_card_url(selected_dataset)
+                            st.success("✅ Successfully submitted evaluation job!")
+                            st.markdown(
+                                f"""
+                                Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
+
+                                * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
+                                * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
+                                * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
+                                """  # noqa
+                            )
+                            st.markdown(
+                                f"""
+                                ```yaml
+                                {selected_metadata}
+                                """
+                            )
+                            print("INFO -- Pushing evaluation job logs to the Hub")
+                            evaluation_log = {}
+                            evaluation_log["payload"] = project_payload
+                            evaluation_log["project_creation_response"] = project_json_resp
+                            evaluation_log["dataset_creation_response"] = data_json_resp
+                            evaluation_log["autotrain_job_response"] = train_json_resp
+                            commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
+                        else:
+                            st.error("🙈 Oh no, there was an error submitting your evaluation job!")
+            else:
+                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
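The net effect of the app.py change is that the submit handler now validates first and filters last: it warns on a missing username, an empty selection, or more than 10 models, and only then calls filter_evaluated_models and builds the AutoTrain project. A condensed sketch of the new control flow (not the literal diff; submit_evaluation_job is a hypothetical stand-in for the payload construction and HTTP calls shown above):

    # Sketch of the refactored submit flow; submit_evaluation_job is illustrative only.
    if submit_button:
        if len(hf_username) == 0:
            st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
        elif len(selected_models) == 0:
            st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
        elif len(selected_models) > 10:
            st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
        else:
            # Filtering now runs only after the cheap validation checks pass
            selected_models = filter_evaluated_models(
                selected_models, selected_task, selected_dataset, selected_config, selected_split, selected_metrics
            )
            if len(selected_models) > 0:
                submit_evaluation_job(selected_models)  # hypothetical wrapper around the AutoTrain calls in the diff
            else:
                st.warning("⚠️ No models left to evaluate! Please select other models and try again.")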
evaluation.py CHANGED
@@ -1,3 +1,4 @@
+import copy
 from dataclasses import dataclass

 import streamlit as st
@@ -15,30 +16,29 @@ class EvaluationInfo:
     metrics: set


-def compute_evaluation_id(dataset_info: DatasetInfo) -> int:
+def create_evaluation_info(dataset_info: DatasetInfo) -> int:
     if dataset_info.cardData is not None:
         metadata = dataset_info.cardData["eval_info"]
         metadata.pop("col_mapping", None)
         # TODO(lewtun): populate dataset cards with metric info
         if "metrics" not in metadata:
             metadata["metrics"] = frozenset()
-        metadata["metrics"] = frozenset(metadata["metrics"])
-        evaluation_info = EvaluationInfo(**metadata)
-        return hash(evaluation_info)
-    else:
-        return None
+        else:
+            metadata["metrics"] = frozenset(metadata["metrics"])
+        return EvaluationInfo(**metadata)


-def get_evaluation_ids():
+def get_evaluation_infos():
     filt = DatasetFilter(author="autoevaluate")
     evaluation_datasets = HfApi().list_datasets(filter=filt, full=True)
-    return [compute_evaluation_id(dset) for dset in evaluation_datasets]
+    return [create_evaluation_info(dset) for dset in evaluation_datasets]


 def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
-    evaluation_ids = get_evaluation_ids()
+    evaluation_infos = get_evaluation_infos()
+    models_to_filter = copy.copy(models)

-    for idx, model in enumerate(models):
+    for model in models_to_filter:
         evaluation_info = EvaluationInfo(
             task=task,
             model=model,
@@ -47,12 +47,11 @@ def filter_evaluated_models(models, task, dataset_name, dataset_config, dataset_split, metrics):
             dataset_split=dataset_split,
             metrics=frozenset(metrics),
         )
-        candidate_id = hash(evaluation_info)
-        if candidate_id in evaluation_ids:
+        if evaluation_info in evaluation_infos:
             st.info(
-                f"Model `{model}` has already been evaluated on this configuration. \
+                f"Model [`{model}`](https://huggingface.co/{model}) has already been evaluated on this configuration. \
                 This model will be excluded from the evaluation job..."
             )
-            models.pop(idx)
+            models.remove(model)

     return models
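Beyond the renames, the key fix in evaluation.py is how already-evaluated models are dropped: the old code called models.pop(idx) while iterating over enumerate(models), which shifts the remaining indices and can skip a model when two consecutive entries need removing, and it compared hashes rather than the EvaluationInfo instances themselves. The new code iterates over a shallow copy and removes from the original list, relying on the dataclass-generated __eq__ for the membership test. A minimal, self-contained sketch of that pattern (the model ids and the already_evaluated set below are made up for illustration):

    import copy

    # Illustration of the iterate-over-a-copy pattern adopted in filter_evaluated_models
    models = ["bert-base-uncased", "roberta-base", "distilbert-base-uncased"]
    already_evaluated = {"bert-base-uncased", "roberta-base"}  # hypothetical prior evaluations

    for model in copy.copy(models):  # iterate over a snapshot of the list...
        if model in already_evaluated:
            models.remove(model)     # ...so removing from the original never skips an element

    print(models)  # ['distilbert-base-uncased']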