Yacine Jernite committed
Commit 8a2ec29
Parent: 9994065

first half done

Files changed (2):
  1. datacards/curation.py +128 -105
  2. datacards/overview.py +26 -11
datacards/curation.py CHANGED
@@ -11,7 +11,7 @@ from .streamlit_utils import (
 )
 
 N_FIELDS_ORIGINAL = 4
-N_FIELDS_LANGUAGE = 12
+N_FIELDS_LANGUAGE = 10
 N_FIELDS_ANNOTATIONS = 10
 N_FIELDS_CONSENT = 4
 N_FIELDS_PII = 7
@@ -52,11 +52,14 @@ def curation_page():
             key_list=key_pref + ["is-aggregated"],
             help="e.g. Wikipedia, movi dialogues, etc.",
         )
-        make_text_area(
-            label="If yes, list the sources",
-            key_list=key_pref + ["aggregated-sources"],
-            help="Otherwise, type N/A",
-        )
+        if st.session_state.card_dict["curation"]["original"]["is-aggregated"] == "yes":
+            make_text_area(
+                label="List the sources (one per line)",
+                key_list=key_pref + ["aggregated-sources"],
+                help="One source per line",
+            )
+        else:
+            st.session_state.card_dict["curation"]["original"]["aggregated-sources"] = "N/A"
 
     with st.expander("Language Data", expanded=False):
         key_pref = ["curation", "language"]
@@ -74,38 +77,49 @@ def curation_page():
             ],
             key_list=key_pref + ["obtained"],
         )
-        make_multiselect(
-            label="If found, where from?",
-            options=["Multiple websites", "Single website", "Offline media collection", "Other", "N/A"],
-            key_list=key_pref + ["found"],
-            help="select N/A if none of the language data was found",
-        )
-        make_multiselect(
-            label="If crowdsourced, where from?",
-            options=[
-                "Amazon Mechanical Turk",
-                "Other crowdworker platform",
-                "Participatory experiment",
-                "Other",
-                "N/A",
-            ],
-            key_list=key_pref + ["crowdsourced"],
-            help="select N/A if none of the language data was crowdsourced",
-        )
-        make_text_area(
-            label="If created for the dataset, describe the creation process.",
-            key_list=key_pref + ["created"],
-        )
+        if "Found" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_multiselect(
+                label="If found, where from?",
+                options=["Multiple websites", "Single website", "Offline media collection", "Other"],
+                key_list=key_pref + ["found"],
+                help="select N/A if none of the language data was found",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["found"] = []
+        if "Crowdsourced" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_multiselect(
+                label="If crowdsourced, where from?",
+                options=[
+                    "Amazon Mechanical Turk",
+                    "Other crowdworker platform",
+                    "Participatory experiment",
+                    "Other",
+                ],
+                key_list=key_pref + ["crowdsourced"],
+                help="select N/A if none of the language data was crowdsourced",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["crowdsourced"] = []
+        if "Created for the dataset" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_text_area(
+                label="If created for the dataset, describe the creation process.",
+                key_list=key_pref + ["created"],
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["created"] = "N/A"
+        if "Machine-generated" in st.session_state.card_dict["curation"]["language"].get("obtained", []):
+            make_text_input(
+                label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
+                key_list=key_pref + ["machine-generated"],
+                help="if the generation code is unavailable, enter N/A",
+            )
+        else:
+            st.session_state.card_dict["curation"]["language"]["machine-generated"] = "N/A"
         make_text_area(
             label="What further information do we have on the language producers?",
             key_list=key_pref + ["producers-description"],
             help="Provide a description of the context in which the language was produced and who produced it.",
         )
-        make_text_input(
-            label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
-            key_list=key_pref + ["machine-generated"],
-            help="if the generation code is unavailable, enter N/A",
-        )
         make_selectbox(
             label="Was the text validated by a different worker or a data curator?",
             options=[
@@ -117,16 +131,6 @@ def curation_page():
             key_list=key_pref + ["validated"],
             help="this question is about human or human-in-the-loop validation only",
         )
-        make_multiselect(
-            label="In what kind of organization did the curation happen?",
-            options=["industry", "academic", "independent", "other"],
-            key_list=key_pref + ["organization-type"],
-        )
-        make_text_input(
-            label="Name the organization(s).",
-            key_list=key_pref + ["organization-names"],
-            help="comma-separated",
-        )
         make_text_area(
             label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
             key_list=key_pref + ["pre-processed"],
@@ -137,11 +141,14 @@ def curation_page():
             options=["not filtered", "manually", "algorithmically", "hybrid"],
             key_list=key_pref + ["is-filtered"],
         )
-        make_text_area(
-            label="What were the selection criteria?",
-            key_list=key_pref + ["filtered-criteria"],
-            help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A.",
-        )
+        if st.session_state.card_dict["curation"]["language"]["is-filtered"] == "not filtered":
+            st.session_state.card_dict["curation"]["language"]["filtered-criteria"] = "N/A"
+        else:
+            make_text_area(
+                label="What were the selection criteria?",
+                key_list=key_pref + ["filtered-criteria"],
+                help="Describe the process for selecting instances to include in the dataset, including any tools used.",
+            )
 
     with st.expander("Structured Annotations", expanded=False):
         key_pref = ["curation", "annotations"]
@@ -149,72 +156,88 @@ def curation_page():
             "annotations"
         ] = st.session_state.card_dict["curation"].get("annotations", {})
 
-        make_radio(
+        make_selectbox(
             label="Does the dataset have additional annotations for each instance?",
             options=["none", "found", "automatically created", "expert created", "crowd-sourced"],
             key_list=key_pref + ["origin"],
             help="Was any additional data collected?",
         )
 
-        # TODO: If yes....
         # If expert or crowdsourced, this branch
-        make_radio(
-            label="What is the number of raters ",
-            options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
-            key_list=key_pref + ["rater-number"],
-            help="How many raters were used to create the additional annotations?",
-        )
-        make_text_area(
-            label="Describe the qualifications required of an annotator.",
-            key_list=key_pref + ["rater-qualifications"],
-            help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
-        )
-        make_radio(
-            label="How many annotators saw each training example?",
-            options=["0", "1", "2", "3", "4", "5", ">5"],
-            key_list=key_pref + ["rater-training-num"],
-            help="",
-        )
-        make_radio(
-            label="How many annotators saw each test example?",
-            options=["0", "1", "2", "3", "4", "5", ">5"],
-            key_list=key_pref + ["rater-test-num"],
-            help="",
-        )
-        make_radio(
-            label="Was an annotation service used?",
-            options=["yes", "no", "unknown"],
-            key_list=key_pref + ["rater-annotation-service-bool"],
-            help="",
-        )
-        # TODO if yes
-        make_multiselect(
-            label="Which annotation services were used?",
-            options=[
-                "Amazon Mechanical Turk", "Prolific Academic",
-                "Upwork", "Appen", "Crowdflower", "other"
-            ],
-            key_list=key_pref + ["rater-annotation-service"],
-        )
+        if st.session_state.card_dict["curation"]["annotations"]["origin"] in ["expert created", "crowd-sourced"]:
+            make_selectbox(
+                label="What is the number of raters ",
+                options=["unknown", "1", "2<n<10", "11<n<50", "51<n<100", "n>100"],
+                key_list=key_pref + ["rater-number"],
+                help="How many raters were used to create the additional annotations?",
+            )
+            make_text_area(
+                label="Describe the qualifications required of an annotator.",
+                key_list=key_pref + ["rater-qualifications"],
+                help="e.g., languages or dialects they speak, education requirements, number of HITs (if MTurk).",
+            )
+            make_selectbox(
+                label="How many annotators saw each training example?",
+                options=["0", "1", "2", "3", "4", "5", ">5"],
+                key_list=key_pref + ["rater-training-num"],
+                help="",
+            )
+            make_selectbox(
+                label="How many annotators saw each test example?",
+                options=["0", "1", "2", "3", "4", "5", ">5"],
+                key_list=key_pref + ["rater-test-num"],
+                help="",
+            )
+            make_radio(
+                label="Was an annotation service used?",
+                options=["no", "yes", "unknown"],
+                key_list=key_pref + ["rater-annotation-service-bool"],
+                help="",
+            )
+            if st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] == "yes":
+                make_multiselect(
+                    label="Which annotation services were used?",
+                    options=[
+                        "Amazon Mechanical Turk", "Prolific Academic",
+                        "Upwork", "Appen", "Crowdflower", "other"
+                    ],
+                    key_list=key_pref + ["rater-annotation-service"],
+                )
+            else:
+                st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
+        else:
+            st.session_state.card_dict["curation"]["annotations"]["rater-number"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-qualifications"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-training-num"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-test-num"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service-bool"] = "no"
+            st.session_state.card_dict["curation"]["annotations"]["rater-annotation-service"] = []
 
 
-        make_text_area(
-            label="Purpose and values for each annoation",
-            key_list=key_pref + ["values"],
-            help="Describe the purpose and possible values for each kind of annotation.",
-        )
-        make_multiselect(
-            label="Quality control measures?",
-            options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
-            key_list=key_pref + ["quality-control"],
-            help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
-        )
-        # TODO: If not none / unknown
-        make_text_area(
-            label="Describe the quality control measures that were taken.",
-            key_list=key_pref + ["quality-control-details"],
-            help="Describe how quality was ensured in the data curation process.",
-        )
+        if st.session_state.card_dict["curation"]["annotations"]["origin"] != "none":
+            make_text_area(
+                label="Purpose and values for each annoation",
+                key_list=key_pref + ["values"],
+                help="Describe the purpose and possible values for each kind of annotation.",
+            )
+            make_selectbox(
+                label="Quality control measures?",
+                options=["none", "unknown", "validated by another rater", "validated by data curators", "validated through automated script", "other"],
+                key_list=key_pref + ["quality-control"],
+                help="How was annotation quality controlled for / what control measures were put in place to ensure annotation quality?",
+            )
+            if st.session_state.card_dict["curation"]["annotations"]["quality-control"] in ["none", "unknown"]:
+                st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
+            else:
+                make_text_area(
+                    label="Describe the quality control measures that were taken.",
+                    key_list=key_pref + ["quality-control-details"],
+                    help="Describe how quality was ensured in the data curation process.",
+                )
+        else:
+            st.session_state.card_dict["curation"]["annotations"]["values"] = "N/A"
+            st.session_state.card_dict["curation"]["annotations"]["quality-control"] = []
+            st.session_state.card_dict["curation"]["annotations"]["quality-control-details"] = "N/A"
 
     with st.expander("Consent", expanded=False):
         key_pref = ["curation", "consent"]
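
Both files in this commit apply the same pattern: a follow-up question is only rendered when the controlling answer makes it relevant, and a default ("N/A" or []) is written straight into the card dict for skipped fields so every key is always present for the N_FIELDS_* counts. Below is a minimal, self-contained sketch of that pattern under stated assumptions: make_text_area here is a hypothetical stand-in for the helper imported from .streamlit_utils, which presumably mirrors the widget value into the nested st.session_state.card_dict entry named by key_list.

import streamlit as st

# Hypothetical stand-in for the streamlit_utils helper: render a widget and
# mirror its value into the nested card_dict entry named by key_list.
def make_text_area(label, key_list, help=None):
    section, field = key_list
    st.session_state.card_dict[section][field] = st.text_area(label, help=help)

if "card_dict" not in st.session_state:
    st.session_state.card_dict = {"original": {}}

# Controlling question: its answer decides whether the follow-up appears.
st.session_state.card_dict["original"]["is-aggregated"] = st.radio(
    "Is the dataset aggregated from multiple sources?", ["no", "yes"]
)

if st.session_state.card_dict["original"]["is-aggregated"] == "yes":
    make_text_area(
        label="List the sources (one per line)",
        key_list=["original", "aggregated-sources"],
    )
else:
    # Never rendered, but the key is still written, so downstream code that
    # counts answered fields against N_FIELDS_ORIGINAL sees a complete dict.
    st.session_state.card_dict["original"]["aggregated-sources"] = "N/A"
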
datacards/overview.py CHANGED
@@ -13,7 +13,7 @@ from .streamlit_utils import (
 
 N_FIELDS_WHERE = 9
 N_FIELDS_LANGUAGES = 8
-N_FIELDS_CREDIT = 3
+N_FIELDS_CREDIT = 5
 N_FIELDS_STRUCTURE = 7
 
 N_FIELDS = N_FIELDS_WHERE + N_FIELDS_LANGUAGES + N_FIELDS_CREDIT + N_FIELDS_STRUCTURE
@@ -65,16 +65,20 @@ def overview_page():
             key_list=key_pref + ["has-leaderboard"],
             help="If no, enter N/A for the following two fields",
         )
-        make_text_input(
-            label="Provide a link to the leaderboard if it exists. Otherwise, enter N/A.",
-            key_list=key_pref + ["leaderboard-url"],
-            help="[URL] or N/A",
-        )
-        make_text_area(
-            label="Briefly describe how the leaderboard evaluates models if it exists. Otherwise, enter N/A.",
-            key_list=key_pref + ["leaderboard-description"],
-            help="[free text; a paragraph] or N/A",
-        )
+        if st.session_state.card_dict["overview"]["where"]["has-leaderboard"] == "yes":
+            make_text_input(
+                label="Provide a link to the leaderboard.",
+                key_list=key_pref + ["leaderboard-url"],
+                help="[URL] or N/A",
+            )
+            make_text_area(
+                label="Briefly describe how the leaderboard evaluates models.",
+                key_list=key_pref + ["leaderboard-description"],
+                help="[free text; a paragraph] or N/A",
+            )
+        else:
+            st.session_state.card_dict["overview"]["where"]["leaderboard-url"] = "N/A"
+            st.session_state.card_dict["overview"]["where"]["leaderboard-description"] = "N/A"
         make_text_input(
             label="If known, provide the name of at least one person the reader can contact for questions about the dataset.",
             key_list=key_pref + ["contact-name"],
@@ -127,6 +131,7 @@ def overview_page():
             label="What primary task does the dataset support?",
             key_list=key_pref + ["task"],
             options=[
+                "",  # default needs to be invalid value to make sure people actually fill in
                 "Content Transfer",
                 "Data-to-Text",
                 "Dialog Response Generation",
@@ -150,6 +155,16 @@ def overview_page():
         st.session_state.card_dict["overview"][
             "credit"
         ] = st.session_state.card_dict.get("credit", {})
+        make_multiselect(
+            label="In what kind of organization did the dataset curation happen?",
+            options=["industry", "academic", "independent", "other"],
+            key_list=key_pref + ["organization-type"],
+        )
+        make_text_input(
+            label="Name the organization(s).",
+            key_list=key_pref + ["organization-names"],
+            help="comma-separated",
+        )
         make_text_input(
             label="Who created the original dataset? List the people involved in collecting the dataset and their affiliation(s).",
             key_list=key_pref + ["creators"],
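
The N_FIELDS_* constants count the fields each subsection contributes (credit grows from 3 to 5 because the two organization questions move here from curation.py), and the "" entry added at the top of the task options means an untouched selectbox reads as unanswered instead of silently defaulting to "Content Transfer". A hedged sketch of how such constants could back a completion check; the counting helper below is an illustrative assumption, not code from this repository.

N_FIELDS_CREDIT = 5  # e.g. organization-type, organization-names, creators, ...

def count_answered(section: dict) -> int:
    """Treat "", [], and None as unanswered; count everything else."""
    return sum(1 for value in section.values() if value not in ("", None, []))

# Example card state: the "" selectbox default from this commit surfaces
# here as an unanswered field until the user picks a real task.
credit = {
    "organization-type": ["academic"],
    "organization-names": "GEM",
    "creators": "",
}
print(f"credit section {count_answered(credit)}/{N_FIELDS_CREDIT} answered")
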