Yacine Jernite committed
Commit 28dd726
1 Parent(s): 8a2ec29
Files changed (1)
  1. datacards/results.py +28 -21
datacards/results.py CHANGED

@@ -21,6 +21,11 @@ def results_page():
     st.session_state.card_dict["results"]["results"] = st.session_state.card_dict[
         "results"
     ].get("results", {})
+    make_text_area(
+        label="What aspect of model ability can be measured with this dataset?",
+        key_list=key_pref + ["model-abilities"],
+        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?.",
+    )
     make_multiselect(
         label="What metrics are typically used for this task?",
         key_list=key_pref + ["metrics"],
@@ -39,8 +44,16 @@ def results_page():
         ],
         help="Select all metrics that are typically used when evaluating models for this task.",
     )
+    if "Other: Other Metrics" in st.session_state.card_dict["results"]["results"].get("metrics", []):
+        make_text_area(
+            label="Definitions of other metrics",
+            key_list=key_pref + ["other-metrics-definitions"],
+            help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["other-metrics-definitions"] = "N/A"
     make_text_area(
-        label="Describe the metrics and evaluation methodology that the dataset creators used when introducing this task.",
+        label="List and describe the purpose of the metrics and evaluation methodology (including human evaluation) that the dataset creators used when introducing this task.",
         key_list=key_pref + ["original-evaluation"],
         help="When the generation task was not evaluated when this dataset was introduced, write N/A.",
     )
@@ -50,26 +63,20 @@ def results_page():
         key_list=key_pref + ["has-previous-results"],
         help="Have papers evaluated models on this task? If no, write N/A for the following three questions.",
     )
-    make_text_area(
-        label="What evaluation approaches have others used?",
-        key_list=key_pref + ["modern-evaluation"],
-        help="If the modern evaluation strategy diverts from the original, describe how models are being evaluated.",
-    )
-    make_text_area(
-        label="What are previous results",
-        key_list=key_pref + ["previous-results"],
-        help="List the source and performance metrics for models on this dataset.",
-    )
-    make_text_area(
-        label="Definitions",
-        key_list=key_pref + ["definitions"],
-        help="If the evaluation strategies in the previous questions go beyond the list of metrics above, add descriptions and/or definitions for each metric.",
-    )
-    make_text_area(
-        label="What aspect of model ability can be measured with this dataset?",
-        key_list=key_pref + ["model-abilities"],
-        help="What kind of abilities should a model exhibit that performs well on the task of this dataset (e.g., reasoning capability, morphological inflection)?.",
-    )
+    if st.session_state.card_dict["results"]["results"]["has-previous-results"] == "yes":
+        make_text_area(
+            label="What evaluation approaches have others used?",
+            key_list=key_pref + ["current-evaluation"],
+            help="If the current evaluation strategy diverts from the original, describe how models are being evaluated.",
+        )
+        make_text_area(
+            label="What are the most relevant previous results for this task/dataset",
+            key_list=key_pref + ["previous-results"],
+            help="List and describe the source and performance metrics for models on this dataset.",
+        )
+    else:
+        st.session_state.card_dict["results"]["results"]["current-evaluation"] = "N/A"
+        st.session_state.card_dict["results"]["results"]["previous-results"] = "N/A"
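
For context on how the hunks above fit together: the questions are rendered by helper widgets (make_text_area, make_multiselect) defined elsewhere in this repo, not in this commit. The sketch below is only an assumption about their contract — each wraps the corresponding Streamlit widget and mirrors its value into the nested st.session_state.card_dict entry named by key_list — which is what lets the new conditional branches read earlier answers and write "N/A" defaults for skipped questions. The helper bodies, parameter names, and the _set_nested utility here are illustrative, not the repository's actual code.

# Illustrative sketch only -- the real make_text_area / make_multiselect helpers
# live elsewhere in the datacards package; their exact signatures may differ.
import streamlit as st

if "card_dict" not in st.session_state:
    st.session_state.card_dict = {"results": {"results": {}}}


def _set_nested(d, key_list, value):
    # Walk the nested card_dict, creating intermediate dicts as needed, and store
    # the value under the last key (e.g. key_pref + ["metrics"] -> ...["metrics"]).
    for key in key_list[:-1]:
        d = d.setdefault(key, {})
    d[key_list[-1]] = value


def make_text_area(label, key_list, help=None):
    # Render a free-text question and mirror the answer into card_dict.
    value = st.text_area(label, help=help, key="_".join(key_list))
    _set_nested(st.session_state.card_dict, key_list, value)


def make_multiselect(label, options, key_list, help=None):
    # Render a multiple-choice question and mirror the selection into card_dict.
    value = st.multiselect(label, options, help=help, key="_".join(key_list))
    _set_nested(st.session_state.card_dict, key_list, value)

Under that assumed contract, gating a follow-up question on an earlier answer (as the has-previous-results branch does) works because the earlier widget has already written its value into card_dict by the time later lines of results_page() run, and the else branches keep the card schema complete by filling in "N/A".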