Yacine Jernite committed on
Commit
d1a58c9
1 Parent(s): 37b8c09

curation part 1

Browse files
Files changed (4) hide show
  1. app.py +0 -1
  2. datacards/curation.py +146 -3
  3. datacards/gem.py +0 -3
  4. datacards/overview.py +11 -1
app.py CHANGED
@@ -77,7 +77,6 @@ def main():
77
 
78
  def glance_page():
79
  with st.expander("Dataset at a Glance", expanded=True):
80
- st.markdown(f"### Dataset Name: {st.session_state.save_state.get('dataset_name', '')}")
81
  dataset_summary = ""
82
  dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
83
  dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
 
77
 
78
  def glance_page():
79
  with st.expander("Dataset at a Glance", expanded=True):
 
80
  dataset_summary = ""
81
  dataset_summary += f"- **Dataset Website**: {st.session_state.save_state.get('overview_where_website', '')}\n"
82
  dataset_summary += f"- **Dataset Contact**: {st.session_state.save_state.get('overview_where_contact-name', '')}\n"
datacards/curation.py CHANGED
@@ -4,10 +4,153 @@ from .streamlit_utils import (
4
  make_text_input
5
  )
6
 
7
- N_FIELDS = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def curation_page():
10
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def curation_summary():
13
- return None
 
 
 
 
 
 
 
 
 
 
 
 
4
  make_text_input
5
  )
6
 
7
+ from .streamlit_utils import (
8
+ make_multiselect,
9
+ make_selectbox,
10
+ make_text_area,
11
+ make_text_input,
12
+ make_radio,
13
+ )
14
+
15
+ N_FIELDS_ORIGINAL = 4
16
+ N_FIELDS_LANGUAGE = 12
17
+ N_FIELDS_ANNOTATIONS = 0
18
+ N_FIELDS_CONSENT = 0
19
+ N_FIELDS_PII = 0
20
+ N_FIELDS_MAINTENANCE = 0
21
+ N_FIELDS_GEM = 0
22
+
23
+ N_FIELDS = N_FIELDS_ORIGINAL + \
24
+ N_FIELDS_LANGUAGE + \
25
+ N_FIELDS_ANNOTATIONS + \
26
+ N_FIELDS_CONSENT + \
27
+ N_FIELDS_PII + \
28
+ N_FIELDS_MAINTENANCE + \
29
+ N_FIELDS_GEM
30
+
31
+
32
+ """
33
+ What were the selection criteria? [Describe the process for selecting instances to include in the dataset, including any tools used.]
34
+ """
35
 
36
  def curation_page():
37
+ st.session_state.card_dict["curation"] = st.session_state.card_dict.get("curation", {})
38
+ with st.expander("Original Curation", expanded=False):
39
+ key_pref = ["curation", "original"]
40
+ st.session_state.card_dict["curation"]["original"] = st.session_state.card_dict["curation"].get("original", {})
41
+ make_text_area(
42
+ label="Original curation rationale",
43
+ key_list=key_pref + ["rationale"],
44
+ help="Describe the curation rationale behind the original dataset(s)."
45
+ )
46
+ make_text_area(
47
+ label="What was the communicative goal?",
48
+ key_list=key_pref + ["communicative"],
49
+ help="Describe the communicative goal that the original dataset(s) was trying to represent."
50
+ )
51
+ make_radio(
52
+ label="Is the dataset aggregated from different data sources?",
53
+ options=["no", "yes"],
54
+ key_list=key_pref + ["is-aggregated"],
55
+ help="e.g. Wikipedia, movie dialogues, etc.",
56
+ )
57
+ make_text_area(
58
+ label="If yes, list the sources",
59
+ key_list=key_pref + ["aggregated-sources"],
60
+ help="Otherwise, type N/A"
61
+ )
62
+ with st.expander("Language Data", expanded=False):
63
+ key_pref = ["curation", "language"]
64
+ st.session_state.card_dict["curation"]["language"] = st.session_state.card_dict["curation"].get("language", {})
65
+ make_multiselect(
66
+ label="How was the language data obtained?",
67
+ options=["found", "created for the dataset", "crowdsourced", "machine-generated", "other"],
68
+ key_list=key_pref+["obtained"],
69
+ )
70
+ make_multiselect(
71
+ label="If found, where from?",
72
+ options=["website", "offline media collection", "other", "N/A"],
73
+ key_list=key_pref+["found"],
74
+ help="select N/A if none of the language data was found"
75
+ )
76
+ make_multiselect(
77
+ label="If crowdsourced, where from?",
78
+ options=["Amazon Mechanical Turk", "other crowdworker platform", "participatory experiment", "other", "N/A"],
79
+ key_list=key_pref+["crowdsourced"],
80
+ help="select N/A if none of the language data was crowdsourced"
81
+ )
82
+ make_text_area(
83
+ label="If created for the dataset, describe the creation process.",
84
+ key_list=key_pref+["created"],
85
+ )
86
+ make_text_area(
87
+ label="What further information do we have on the language producers?",
88
+ key_list=key_pref+["producers-description"],
89
+ help="Provide a description of the context in which the language was produced and who produced it.",
90
+ )
91
+ make_text_input(
92
+ label="If text was machine-generated for the dataset, provide a link to the generation method if available (N/A otherwise).",
93
+ key_list=key_pref+["machine-generated"],
94
+ help="if the generation code is unavailable, enter N/A",
95
+ )
96
+ make_selectbox(
97
+ label="Was the text validated by a different worker or a data curator?",
98
+ options=["not validated", "validated by crowdworker", "validated by data curator", "other"],
99
+ key_list=key_pref+["validated"],
100
+ help="this question is about human or human-in-the-loop validation only"
101
+ )
102
+ make_multiselect(
103
+ label="In what kind of organization did the curation happen?",
104
+ options= ["industry", "academic", "independent", "other"],
105
+ key_list=key_pref+["organization-type"],
106
+ )
107
+ make_text_input(
108
+ label="Name the organization(s).",
109
+ key_list=key_pref+["organization-names"],
110
+ help="comma-separated",
111
+ )
112
+ make_text_area(
113
+ label="How was the text data pre-processed? (Enter N/A if the text was not pre-processed)",
114
+ key_list=key_pref+["pre-processed"],
115
+ help="List the steps in preprocessing the data for the dataset. Enter N/A if no steps were taken."
116
+ )
117
+ make_selectbox(
118
+ label="Were text instances selected or filtered?",
119
+ options=["not filtered", "manually", "algorithmically", "hybrid"],
120
+ key_list=key_pref+["is-filtered"],
121
+ )
122
+ make_text_area(
123
+ label="What were the selection criteria?",
124
+ key_list=key_pref+["filtered-criteria"],
125
+ help="Describe the process for selecting instances to include in the dataset, including any tools used. If no selection was done, enter N/A."
126
+ )
127
+ with st.expander("Structured Annotations", expanded=False):
128
+ key_pref = ["curation", "annotations"]
129
+ st.session_state.card_dict["curation"]["annotations"] = st.session_state.card_dict["curation"].get("annotations", {})
130
+ with st.expander("Consent", expanded=False):
131
+ key_pref = ["curation", "consent"]
132
+ st.session_state.card_dict["curation"]["consent"] = st.session_state.card_dict["curation"].get("consent", {})
133
+ with st.expander("Private Identifying Information (PII)", expanded=False):
134
+ key_pref = ["curation", "pii"]
135
+ st.session_state.card_dict["curation"]["pii"] = st.session_state.card_dict["curation"].get("pii", {})
136
+ with st.expander("Maintenance", expanded=False):
137
+ key_pref = ["curation", "maintenance"]
138
+ st.session_state.card_dict["curation"]["maintenance"] = st.session_state.card_dict["curation"].get("maintenance", {})
139
+ with st.expander("GEM Additional Curation", expanded=False):
140
+ key_pref = ["curation", "gem"]
141
+ st.session_state.card_dict["curation"]["gem"] = st.session_state.card_dict["curation"].get("gem", {})
142
+
143
 
144
  def curation_summary():
145
+ total_filled = sum([len(dct) for dct in st.session_state.card_dict.get('curation', {}).values()])
146
+ with st.expander(f"Dataset Curation Completion - {total_filled} of {N_FIELDS}", expanded=False):
147
+ completion_markdown = ""
148
+ completion_markdown += f"- **Overall completion:**\n - {total_filled} of {N_FIELDS} fields\n"
149
+ completion_markdown += f"- **Sub-section - Original Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('original', {}))} of {N_FIELDS_ORIGINAL} fields\n"
150
+ completion_markdown += f"- **Sub-section - Language Data:**\n - {len(st.session_state.card_dict.get('curation', {}).get('language', {}))} of {N_FIELDS_LANGUAGE} fields\n"
151
+ completion_markdown += f"- **Sub-section - Structured Annotations:**\n - {len(st.session_state.card_dict.get('curation', {}).get('annotations', {}))} of {N_FIELDS_ANNOTATIONS} fields\n"
152
+ completion_markdown += f"- **Sub-section - Consent:**\n - {len(st.session_state.card_dict.get('curation', {}).get('consent', {}))} of {N_FIELDS_CONSENT} fields\n"
153
+ completion_markdown += f"- **Sub-section - PII:**\n - {len(st.session_state.card_dict.get('curation', {}).get('pii', {}))} of {N_FIELDS_PII} fields\n"
154
+ completion_markdown += f"- **Sub-section - Maintenance:**\n - {len(st.session_state.card_dict.get('curation', {}).get('maintenance', {}))} of {N_FIELDS_MAINTENANCE} fields\n"
155
+ completion_markdown += f"- **Sub-section - GEM Curation:**\n - {len(st.session_state.card_dict.get('curation', {}).get('gem', {}))} of {N_FIELDS_GEM} fields\n"
156
+ st.markdown(completion_markdown)
datacards/gem.py CHANGED
@@ -5,10 +5,7 @@ from .streamlit_utils import (
5
  )
6
 
7
  from .streamlit_utils import (
8
- make_multiselect,
9
- make_selectbox,
10
  make_text_area,
11
- make_text_input,
12
  make_radio,
13
  )
14
 
 
5
  )
6
 
7
  from .streamlit_utils import (
 
 
8
  make_text_area,
 
9
  make_radio,
10
  )
11
 
datacards/overview.py CHANGED
@@ -12,7 +12,7 @@ from .streamlit_utils import (
12
  )
13
 
14
  N_FIELDS_WHERE = 9
15
- N_FIELDS_LANGUAGES = 6
16
  N_FIELDS_CREDIT = 3
17
  N_FIELDS_STRUCTURE = 7
18
 
@@ -98,6 +98,16 @@ def overview_page():
98
  ],
99
  help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
100
  )
 
 
 
 
 
 
 
 
 
 
101
  make_text_area(
102
  label="What is the intended use of the dataset?",
103
  key_list=key_pref + ["intended-use"],
 
12
  )
13
 
14
  N_FIELDS_WHERE = 9
15
+ N_FIELDS_LANGUAGES = 8
16
  N_FIELDS_CREDIT = 3
17
  N_FIELDS_STRUCTURE = 7
18
 
 
98
  ],
99
  help="This is a comprehensive list of languages obtained from the BCP-47 standard list.",
100
  )
101
+ make_text_area(
102
+ label="What dialects are covered? Are there multiple dialects per language?",
103
+ key_list=key_pref + ["language-dialects"],
104
+ help="[free text, paragraphs] - Describe the dialect(s) as appropriate.",
105
+ )
106
+ make_text_area(
107
+ label="Whose language is in the dataset?",
108
+ key_list=key_pref + ["language-speakers"],
109
+ help="[free text, paragraphs] - Provide locally appropriate demographic information about the language producers, if available. Use ranges where reasonable in order to protect individuals’ privacy.",
110
+ )
111
  make_text_area(
112
  label="What is the intended use of the dataset?",
113
  key_list=key_pref + ["intended-use"],