Sebastian Gehrmann committed on
Commit
4f8648b
1 Parent(s): e5b869b

Markdown formatting for the hub.

Browse files
Files changed (2) hide show
  1. formatting/construct_md.py +75 -0
  2. formatting/json_to_md.py +174 -18
formatting/construct_md.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+ from json import load
3
+
4
def parse_args():
    """Parse command-line arguments: one or more input file paths."""
    parser = ArgumentParser()
    parser.add_argument(
        'input',
        type=str,
        nargs='+',
        help='Specify paths to files (e.g., path/to/*.json)',
    )
    return parser.parse_args()
10
+
11
+
12
def json_to_markdown(filename):
    """Convert one reformatted JSON data card to Markdown.

    Reads *filename* (a ``.json`` file), renders its top-level metadata
    and all sections, and writes the result next to the input as
    ``<name>.md``.
    """
    # Context manager closes the input handle (the original leaked it via
    # `load(open(filename))`); `data` avoids shadowing the stdlib `json`
    # module name.
    with open(filename) as f:
        data = load(f)

    markdown = f'# Dataset Card for {data["name"]}\n\n'

    # NOTE(review): this sentence is truncated in the original
    # ("You can find the ") and runs straight into the summary text —
    # confirm the intended link sentence before shipping.  Kept verbatim
    # here to preserve output.
    markdown += 'You can find the '

    markdown += data['summary'] + '\n\n'

    # Remaining top-level scalar fields become level-4 headers.
    for key in data:
        if key not in ('name', 'summary', 'sections'):
            markdown += f'#### {key}\n{data[key]}\n\n'

    markdown += '\n'.join(section_to_markdown(section)
                          for section in data['sections'])

    # Write the Markdown next to the input, swapping the .json suffix.
    with open(f'{filename[:-5]}.md', 'w') as f:
        f.write(markdown)
30
+
31
+
32
def section_to_markdown(section):
    """Render a section header plus all of its subsections as Markdown."""
    parts = [f'{"#" * section["level"]} {section["title"]}\n\n']
    parts.append('\n'.join(
        subsection_to_markdown(sub) for sub in section['subsections']))
    # Trailing newline separates this section from the next.
    return ''.join(parts) + '\n'
38
+
39
+
40
def subsection_to_markdown(subsection):
    """Render a subsection header plus all of its fields as Markdown."""
    header = f'{"#" * subsection["level"]} {subsection["title"]}\n\n'
    fields_md = '\n'.join(
        field_to_markdown(field) for field in subsection['fields'])
    # Trailing newline separates this subsection from the next.
    return header + fields_md + '\n'
46
+
47
+
48
def field_to_markdown(field):
    """Render a single field: header, optional HTML marker comments, content."""
    pieces = [f'{"#" * field["level"]} {field["title"]}\n\n']

    # A 'quick' flag becomes a bare marker comment.
    if 'quick' in field.get('flags', ()):
        pieces.append('<!-- quick -->\n')

    # 'info' and 'scope' values, when truthy, become tagged comments.
    for tag in ('info', 'scope'):
        value = field.get(tag, False)
        if value:
            pieces.append(f'<!-- {tag}: {value} -->\n')

    pieces.append(field.get('content', ''))
    return ''.join(pieces) + '\n'
63
+
64
+
65
def main():
    """Converts JSON output from `reformat_json.py`
    to Markdown input for Data Cards Labs."""
    args = parse_args()
    # Only process paths that actually end in '.json'.
    json_files = (name for name in args.input if name[-5:] == '.json')
    for filename in json_files:
        json_to_markdown(filename)


if __name__ == '__main__':
    main()
formatting/json_to_md.py CHANGED
@@ -1,18 +1,162 @@
1
  from argparse import ArgumentParser
2
  from json import load
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- def parse_args():
5
- parser = ArgumentParser()
6
- parser.add_argument('input', type=str, nargs='+', \
7
- help='Specify paths to files (e.g., path/to/*.json)')
8
 
9
- return parser.parse_args()
 
10
 
 
11
 
12
- def json_to_markdown(filename):
13
- json = load(open(filename))
 
 
 
 
14
 
15
- markdown = f'# {json["name"]}\n\n'
16
  markdown += json['summary'] + '\n\n'
17
 
18
  for key in json:
@@ -22,7 +166,9 @@ def json_to_markdown(filename):
22
  markdown += '\n'.join(section_to_markdown(section) \
23
  for section in json['sections'])
24
 
25
- with open(f'{filename[:-5]}.md', 'w') as f:
 
 
26
  f.write(markdown)
27
 
28
 
@@ -59,14 +205,24 @@ def field_to_markdown(field):
59
  return markdown + '\n'
60
 
61
 
62
- def main():
63
- """Converts JSON output from `reformat_json.py`
64
- to Markdown input for Data Cards Labs."""
65
- args = parse_args()
66
- for filename in args.input:
67
- if filename[-5:] == '.json':
68
- json_to_markdown(filename)
 
 
 
 
 
 
 
 
 
69
 
 
70
 
71
- if __name__ == '__main__':
72
- main()
1
  from argparse import ArgumentParser
2
  from json import load
3
+ import pathlib
4
+ import os
5
+
6
+
7
def multi_grep(d, l1, l2, l3):
    """Three-level nested dict lookup, defaulting to a placeholder string."""
    level1 = d.get(l1, {})
    level2 = level1.get(l2, {})
    return level2.get(l3, "[Needs More Information]")
9
+
10
def multi_grep2(d, l1, l2, l3):
    """Three-level nested dict lookup, defaulting to a one-item list."""
    level1 = d.get(l1, {})
    level2 = level1.get(l2, {})
    return level2.get(l3, ["unknown"])
12
+
13
def sanitize_md_url(s):
    """Strip out MD fragments if they exist.

    '[label](url)' collapses to 'url'; anything without a '](' separator
    is returned unchanged.
    """
    pieces = s.split("](")
    if len(pieces) > 1:
        # Keep the part after the first '](', dropping closing parens.
        return pieces[1].replace(")", "")
    return s
19
+
20
+ # ---
21
+ # annotations_creators:
22
+ # - expert-generated
23
+ # language_creators:
24
+ # - found
25
+ # languages:
26
+ # - en
27
+ # licenses:
28
+ # - unknown
29
+ # multilinguality:
30
+ # - monolingual
31
+ # pretty_name: FairytaleQA
32
+ # size_categories:
33
+ # - 10K<n<100K
34
+ # source_datasets:
35
+ # - original
36
+ # task_categories:
37
+ # - question-generation
38
+ # task_ids:
39
+ # - abstractive-qg
40
+ # ---
41
+
42
def construct_preamble(data, name):
    """Build the YAML front-matter block for a hub dataset card.

    Pulls annotation origin, languages, license, and task from *data*
    (the original GEM data-card JSON) via multi_grep/multi_grep2, and
    falls back to 'unknown' whenever a field is missing.  *name* is the
    dataset's pretty name.  Returns the complete '---'-delimited string
    (see the example preamble in the comment block above).
    """
    pre = "---\n"
    pre += "annotations_creators:\n"
    # e.g. "- expert-generated"
    s = multi_grep(data, "curation", "annotations", "origin")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        # YAML tags are hyphenated, so spaces become dashes.
        pre += "- " + s.replace(" ", "-") + "\n"

    pre += "language_creators:\n- unknown\n"
    pre += "languages:"
    languages = multi_grep2(data, "overview", "languages", "language_names")
    for l in languages:
        pre += f"\n- {l}"
    pre += "\nlicenses:\n"

    s = multi_grep(data, "overview", "languages", "license")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        # License strings look like "tag: long name"; keep only the tag.
        pre += "- " + s.split(":")[0] + "\n"

    # Multilinguality is derived from the language list collected above.
    pre += "multilinguality:\n"
    if languages == ["unknown"]:
        pre += "- unknown"
    elif len(languages) == 1:
        pre += "- monolingual"
    else:
        pre += "- multilingual"

    pre += f"\npretty_name: {name}\n"
    pre += "size_categories:\n- unknown\n"
    pre += "source_datasets:\n- original\n"
    pre += "task_categories:\n"

    s = multi_grep(data, "overview", "languages", "task")
    if s == "[Needs More Information]":
        pre += "- unknown\n"
    else:
        # e.g. "Question Generation" -> "question-generation"
        pre += "- " + "-".join(s.lower().split(" ")) + "\n"
    # task_ids (e.g. "- abstractive-qg") are not derivable here yet.
    pre += "task_ids:\n- unknown\n"

    pre += "---\n\n"
    return pre
90
+
91
+
92
+
93
+ ## Table of Contents
94
+ # - [Dataset Description](#dataset-description)
95
+ # - [Dataset Summary](#dataset-summary)
96
+ # - [Supported Tasks](#supported-tasks-and-leaderboards)
97
+ # - [Languages](#languages)
98
+ # - [Dataset Structure](#dataset-structure)
99
+ # - [Data Instances](#data-instances)
100
+ # - [Data Fields](#data-instances)
101
+ # - [Data Splits](#data-instances)
102
+ # - [Dataset Creation](#dataset-creation)
103
+ # - [Curation Rationale](#curation-rationale)
104
+ # - [Source Data](#source-data)
105
+ # - [Annotations](#annotations)
106
+ # - [Personal and Sensitive Information](#personal-and-sensitive-information)
107
+ # - [Considerations for Using the Data](#considerations-for-using-the-data)
108
+ # - [Social Impact of Dataset](#social-impact-of-dataset)
109
+ # - [Discussion of Biases](#discussion-of-biases)
110
+ # - [Other Known Limitations](#other-known-limitations)
111
+ # - [Additional Information](#additional-information)
112
+ # - [Dataset Curators](#dataset-curators)
113
+ # - [Licensing Information](#licensing-information)
114
+ # - [Citation Information](#citation-information)
115
+
116
def construct_toc(data):
    """Build a table of contents for the card (not yet implemented).

    The commented-out ToC template above shows the intended structure.
    """
    pass
118
+
119
def construct_links(data):
    """Render the '## Dataset Description' bullet list of card links.

    Each entry is looked up under data['overview']['where']; URL-like
    entries are stripped of Markdown link syntax first.
    """
    def _entry(label, key, strip_md=True):
        # One "- **Label:** value" bullet per metadata key.
        value = multi_grep(data, "overview", "where", key)
        if strip_md:
            value = sanitize_md_url(value)
        return f"- **{label}:** {value}\n"

    links = "## Dataset Description\n\n"
    links += _entry("Homepage", "website")
    links += _entry("Repository", "data-url")
    links += _entry("Paper", "paper-url")
    links += _entry("Leaderboard", "leaderboard-url")
    # The contact name is plain text, not a URL.
    links += _entry("Point of Contact", "contact-name", strip_md=False)
    return links + "\n"
139
+
140
+
141
+ def json_to_markdown(filename, original_json_path):
142
+ json = load(open(filename))
143
+ original_json = load(open(original_json_path))
144
+ dataset_name = pathlib.Path(original_json_path).stem
145
 
 
 
 
 
146
 
147
+ preamble = construct_preamble(original_json, dataset_name)
148
+ markdown = preamble
149
 
150
+ markdown += f'# Dataset Card for GEM/{json["name"]}\n\n'
151
 
152
+ # ToC here.
153
+
154
+ markdown += construct_links(original_json)
155
+
156
+ markdown += "### Link to Main Data Card\n\n"
157
+ markdown += f'You can find the main data card on the [GEM Website](https://gem-benchmark.com/data_cards/{dataset_name}).\n\n'
158
 
159
+ markdown += "### Dataset Summary \n\n"
160
  markdown += json['summary'] + '\n\n'
161
 
162
  for key in json:
166
  markdown += '\n'.join(section_to_markdown(section) \
167
  for section in json['sections'])
168
 
169
+ readme_path = os.path.join(pathlib.Path(original_json_path).parents[0], "README.md")
170
+
171
+ with open(readme_path, 'w') as f:
172
  f.write(markdown)
173
 
174
 
205
  return markdown + '\n'
206
 
207
 
208
# The old single-file CLI entry point (main/parse_args) was superseded by
# the GEMv2 batch loop below; the commented-out copy of main() was dead
# code and has been removed.

if __name__ == "__main__":
    # Walk every dataset directory in a sibling GEMv2 checkout; each
    # dataset is expected to provide `<dataset>/<dataset>.json`.
    for dataset in os.listdir("../../../GEMv2"):
        data_card_path = f"../../../GEMv2/{dataset}/{dataset}.json"
        if os.path.exists(data_card_path):
            print(f"Now processing {dataset}.")
            # This script assumes you have run reformat_json.py
            new_path = f"datacards/{dataset}.json"
            # json_to_markdown writes README.md itself and returns None,
            # so its result is no longer bound to an unused variable.
            json_to_markdown(new_path, data_card_path)
        else:
            print(f"{dataset} has no data card!")