diff --git a/README.md b/README.md
index 47f703bafbbdc1cd78c53747d09e05d7652ec5b6..d141000c9d70595714289f5f3b6466f028ece4cd 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,6 @@
 # datasets-tagging
 A Streamlit app to add structured tags to the datasets
+
+```
+streamlit run tagging_app.py
+```
diff --git a/saved_tags/aeslc/default/tags.json b/saved_tags/aeslc/default/tags.json
new file mode 100644
index 0000000000000000000000000000000000000000..9397afae3668639f05c46f505231a6637df37169
--- /dev/null
+++ b/saved_tags/aeslc/default/tags.json
@@ -0,0 +1 @@
+{"task_categories": ["conditional-text-generation"], "task_ids": ["summarization"], "multilinguality": ["monolingual"], "languages": ["en"], "language_creators": ["crowdsourced", "found"], "annotations_creators": ["no-annotation"], "source_datasets": ["original", "extended|other"], "size_categories": ["10K<n<100K"], "licenses": ["unknown"]}
\ No newline at end of file
diff --git a/saved_tags/break_data/QDMR-high-level-lexicon/tags.json b/saved_tags/break_data/QDMR-high-level-lexicon/tags.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2f27d3e1346770866f90b07dc00aca8cd2e9145
--- /dev/null
+++ b/saved_tags/break_data/QDMR-high-level-lexicon/tags.json
@@ -0,0 +1 @@
+{"task_categories": ["conditional-text-generation"], "task_ids": ["conditional-text-generation-other-QDMR lexicon prediction"], "multilinguality": ["monolingual"], "languages": ["en"], "language_creators": ["found"], "annotations_creators": ["crowdsourced"], "source_datasets": ["extended|hotpot_qa", "extended|drop", "extended|com_qa"], "size_categories": ["10K<n<100K"], "licenses": ["unknown"]}
\ No newline at end of file
diff --git a/saved_tags/pg19/default/tags.json b/saved_tags/pg19/default/tags.json
new file mode 100644
index 0000000000000000000000000000000000000000..06969961f8d09a11ca27a97f284d99caa5eda298
--- /dev/null
+++ b/saved_tags/pg19/default/tags.json
@@ -0,0 +1 @@
+{"task_categories": ["sequence-modeling"], "task_ids": ["language-modeling"], "multilinguality": ["monolingual"], "languages": ["en"], "language_creators": ["found"], "annotations_creators": ["no-annotation"], "source_datasets": ["original"], "size_categories": ["10K<n<100K"], "licenses": ["unknown"]}
\ No newline at end of file
diff --git a/tagging_app.py b/tagging_app.py
new file mode 100644
--- /dev/null
+++ b/tagging_app.py
+import copy
+import datasets
+import json
+import streamlit as st
+
+from dataclasses import asdict
+from glob import glob
+
+# (the tag vocabularies -- task_set, license_set, language_set,
+# multilinguality_set, creator_set -- and the feature helpers
+# filter_features / find_languages used below are defined here)
+
+keep_keys = ['description', 'features', 'homepage', 'license', 'splits']
+
+@st.cache
+def get_info_dicts(dataset_id):
+    module_path = datasets.load.prepare_module(dataset_id, dataset=True)
+    builder_cls = datasets.load.import_main_class(module_path[0], dataset=True)
+    build_confs = builder_cls.BUILDER_CONFIGS
+    confs = [conf.name for conf in build_confs] if len(build_confs) > 0 else ['default']
+    all_info_dicts = {}
+    for conf in confs:
+        builder = builder_cls(name=conf)
+        conf_info_dict = dict([(k, v) for k, v in asdict(builder.info).items() if k in keep_keys])
+        all_info_dicts[conf] = conf_info_dict
+    return all_info_dicts
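+
+# Shape of the value returned by get_info_dicts, sketched for illustration
+# (the keys and nesting follow the code above; the concrete values are made up):
+#
+#   {"default": {"description": "...", "features": {...},
+#                "homepage": "https://...", "license": "...", "splits": {...}}}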
+
+@st.cache
+def get_dataset_list():
+    dataset_list = datasets.list_datasets()
+    all_dataset_ids = ["other"] + dataset_list
+    return all_dataset_ids
+
+@st.cache
+def load_all_dataset_infos(dataset_list):
+    dataset_infos = {}
+    for did in dataset_list:
+        try:
+            dataset_infos[did] = get_info_dicts(did)
+        except Exception:
+            print("+++++++++++ MISSED", did)
+    return dataset_infos
+
+def load_existing_tags():
+    has_tags = {}
+    for fname in glob("saved_tags/*/*/tags.json"):
+        _, did, cid, _ = fname.split('/')
+        has_tags[did] = has_tags.get(did, {})
+        has_tags[did][cid] = fname
+    return has_tags
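+
+# For the files added in this PR, load_existing_tags() yields a nested
+# dataset -> config -> path mapping like:
+#
+#   {"aeslc": {"default": "saved_tags/aeslc/default/tags.json"},
+#    "break_data": {"QDMR-high-level-lexicon": "saved_tags/break_data/QDMR-high-level-lexicon/tags.json"},
+#    "pg19": {"default": "saved_tags/pg19/default/tags.json"}}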
""", + unsafe_allow_html=True, +) + +app_desc = """ +### Dataset Tagger + +This app aims to make it easier to add structured tags to the datasets present in the library. + +Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity +to pre-load the tag sets from another dataset or configuration to avoid too much redundancy. + +The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md +""" + +all_dataset_ids = copy.deepcopy(get_dataset_list()) +existing_tag_sets = load_existing_tags() +all_dataset_infos = load_all_dataset_infos(all_dataset_ids[1:]) + +st.sidebar.markdown(app_desc) + +# option to only select from datasets that still need to be annotated +only_missing = st.sidebar.checkbox("Show only un-annotated configs") + +if only_missing: + dataset_choose_list = [did for did, c_dict in all_dataset_infos.items() + if not all([cid in existing_tag_sets.get(did, {}) for cid in c_dict])] +else: + dataset_choose_list = list(all_dataset_infos.keys()) + +dataset_id = st.sidebar.selectbox( + label="Choose dataset to tag", + options=dataset_choose_list, + index=0, +) + +all_info_dicts = all_dataset_infos[dataset_id] + +if only_missing: + config_choose_list = [cid for cid in all_info_dicts + if not cid in existing_tag_sets.get(dataset_id, {})] +else: + config_choose_list = list(all_info_dicts.keys()) + +config_id = st.sidebar.selectbox( + label="Choose configuration", + options=config_choose_list, +) + +config_infos = all_info_dicts[config_id] + +c1, _, c2, _, c3 = st.beta_columns([8, 1, 14, 1, 10]) + +######################## +## Dataset description +######################## + +data_desc = f"### Dataset: {dataset_id} | Configuration: {config_id}" + "\n" +data_desc += f"[Homepage]({config_infos['homepage']})" + " | " +data_desc += f"[Data script](https://github.com/huggingface/datasets/blob/master/datasets/{dataset_id}/{dataset_id}.py)" + " | " +data_desc += f"[View examples](https://huggingface.co/nlp/viewer/?dataset={dataset_id}&config={config_id})" +c1.markdown(data_desc) + +with c1.beta_expander("Dataset description:", expanded=True): + st.markdown(config_infos['description']) + +# "pretty-fy" the features to be a little easier to read +features = filter_features(config_infos['features']) +with c1.beta_expander(f"Dataset features for config: {config_id}", expanded=True): + st.write(features) + +######################## +## Dataset tagging +######################## + +c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}") + +########## +# Pre-load information to speed things up +########## +c2.markdown("#### Pre-loading an existing tag set") + +existing_tag_sets = load_existing_tags() + +pre_loaded = { + "task_categories": [], + "task_ids": [], + "multilinguality": [], + "languages": [], + "language_creators": [], + "annotations_creators": [], + "source_datasets": [], + "size_categories": [], + "licenses": [], +} + +if existing_tag_sets.get(dataset_id, {}).get(config_id, None) is not None: + existing_tags_fname = existing_tag_sets[dataset_id][config_id] + c2.markdown(f"#### Attention: this config already has a tagset saved in {existing_tags_fname}\n--- \n") + if c2.checkbox("pre-load existing tag set"): + pre_loaded = json.load(open(existing_tags_fname)) + +c2.markdown("> *You may choose to pre-load the tag set of another dataset or configuration:*") + +with c2.beta_expander("- Choose tag set to pre-load"): + did_choice_list = 
+
+if existing_tag_sets.get(dataset_id, {}).get(config_id, None) is not None:
+    existing_tags_fname = existing_tag_sets[dataset_id][config_id]
+    c2.markdown(f"#### Attention: this config already has a tag set saved in {existing_tags_fname}\n--- \n")
+    if c2.checkbox("pre-load existing tag set"):
+        pre_loaded = json.load(open(existing_tags_fname))
+
+c2.markdown("> *You may choose to pre-load the tag set of another dataset or configuration:*")
+
+with c2.beta_expander("- Choose tag set to pre-load"):
+    did_choice_list = list(existing_tag_sets.keys())
+    if len(existing_tag_sets) > 0:
+        did = st.selectbox(
+            label="Choose dataset to load tag set from",
+            options=did_choice_list,
+            index=did_choice_list.index(dataset_id) if dataset_id in did_choice_list else 0,
+        )
+        cid = st.selectbox(
+            label="Choose config to load tag set from",
+            options=list(existing_tag_sets[did].keys()),
+            index=0,
+        )
+        if st.checkbox("pre-load this tag set"):
+            pre_loaded = json.load(open(existing_tag_sets[did][cid]))
+    else:
+        st.write("There are currently no other saved tag sets.")
+
+pre_loaded["languages"] = list(set(pre_loaded["languages"] + find_languages(features)))
+if config_infos["license"] in license_set:
+    pre_loaded["licenses"] = list(set(pre_loaded["licenses"] + [config_infos["license"]]))
+
+##########
+# Modify or add new tags
+##########
+c2.markdown("#### Editing the tag set")
+c2.markdown("> *Expand the following boxes to edit the tag set. For each question, choose all options that apply (at least one):*")
+
+with c2.beta_expander("- Supported tasks"):
+    task_categories = st.multiselect(
+        "What categories of task does the dataset support?",
+        options=list(task_set.keys()),
+        default=pre_loaded["task_categories"],
+        format_func=lambda tg: f"{tg} : {task_set[tg]['description']}",
+    )
+    task_specifics = []
+    for tg in task_categories:
+        task_specs = st.multiselect(
+            f"What specific *{tg}* tasks does the dataset support?",
+            options=task_set[tg]["options"],
+            default=[ts for ts in pre_loaded["task_ids"] if ts in task_set[tg]["options"]],
+        )
+        if "other" in task_specs:
+            other_task = st.text_input(
+                "You selected 'other' task. Please enter a short hyphen-separated description for the task:",
+                value='my-task-description',
+            )
+            st.write(f"Registering {tg}-other-{other_task} task")
+            task_specs[task_specs.index("other")] = f"{tg}-other-{other_task}"
+        task_specifics += task_specs
+
+with c2.beta_expander("- Languages"):
+    multilinguality = st.multiselect(
+        "Does the dataset contain more than one language?",
+        options=list(multilinguality_set.keys()),
+        default=pre_loaded["multilinguality"],
+        format_func=lambda m: f"{m} : {multilinguality_set[m]}",
+    )
+    if "other" in multilinguality:
+        other_multilinguality = st.text_input(
+            "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
+            value='my-multilinguality',
+        )
+        st.write(f"Registering other-{other_multilinguality} multilinguality")
+        multilinguality[multilinguality.index("other")] = f"other-{other_multilinguality}"
+    languages = st.multiselect(
+        "What languages are represented in the dataset?",
+        options=list(language_set.keys()),
+        default=pre_loaded["languages"],
+        format_func=lambda m: f"{m} : {language_set[m]}",
+    )
+
+with c2.beta_expander("- Dataset creators"):
+    language_creators = st.multiselect(
+        "Where does the text in the dataset come from?",
+        options=creator_set["language"],
+        default=pre_loaded["language_creators"],
+    )
+    annotations_creators = st.multiselect(
+        "Where do the annotations in the dataset come from?",
+        options=creator_set["annotations"],
+        default=pre_loaded["annotations_creators"],
+    )
+    licenses = st.multiselect(
+        "What licenses is the dataset under?",
+        options=list(license_set.keys()),
+        default=pre_loaded["licenses"],
+        format_func=lambda l: f"{l} : {license_set[l]}",
+    )
+    if "other" in licenses:
+        other_license = st.text_input(
+            "You selected 'other' type of license. Please enter a short hyphen-separated description:",
+            value='my-license',
+        )
+        st.write(f"Registering other-{other_license} license")
+        licenses[licenses.index("other")] = f"other-{other_license}"
+    # link to supported datasets
+    pre_select_ext_a = []
+    if "original" in pre_loaded["source_datasets"]:
+        pre_select_ext_a += ["original"]
+    if any([p.startswith("extended") for p in pre_loaded["source_datasets"]]):
+        pre_select_ext_a += ["extended"]
+    extended = st.multiselect(
+        "Does the dataset contain original data and/or was it extended from other datasets?",
+        options=["original", "extended"],
+        default=pre_select_ext_a,
+    )
+    source_datasets = ["original"] if "original" in extended else []
+    if "extended" in extended:
+        pre_select_ext_b = [p.split('|')[1] for p in pre_loaded["source_datasets"] if p.startswith("extended")]
+        extended_sources = st.multiselect(
+            "Which other datasets does this one use data from?",
+            options=all_dataset_ids,
+            default=pre_select_ext_b,
+        )
+        if "other" in extended_sources:
+            other_extended_sources = st.text_input(
+                "You selected 'other' dataset. Please enter a short hyphen-separated description:",
+                value='my-dataset',
+            )
+            st.write(f"Registering other-{other_extended_sources} dataset")
+            extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
+        source_datasets += [f"extended|{src}" for src in extended_sources]
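+
+# The resulting source_datasets tags combine both answers; e.g. the break_data
+# tag set saved in this PR records
+#   ["extended|hotpot_qa", "extended|drop", "extended|com_qa"]
+# while the aeslc one records ["original", "extended|other"].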
+
+num_examples = (
+    sum([dct.get('num_examples', 0) for spl, dct in config_infos['splits'].items()])
+    if config_infos.get('splits', None) is not None
+    else -1
+)
+if num_examples < 0:
+    size_cat = "unknown"
+elif num_examples < 1000:
+    size_cat = "n<1K"
+elif num_examples < 10000:
+    size_cat = "1K<n<10K"
+elif num_examples < 100000:
+    size_cat = "10K<n<100K"
+elif num_examples < 1000000:
+    size_cat = "100K<n<1M"
+else:
+    size_cat = "n>1M"
+
+########################
+## Show results
+########################
+# (the c3 column shows the finalized tag set, saves it to
+# saved_tags/{dataset_id}/{config_id}/tags.json, and prints the YAML
+# version to copy-paste into the config README.md)
+
+with c3.beta_expander("----> show full task set <----", expanded=True):
+    st.write(task_set)
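+
+# Sketch of the YAML block the app prints for copy-pasting into a dataset
+# README.md, presumably a yaml.dump of the saved tag dict; shown here for the
+# aeslc tags above (illustrative only, assuming JSON and YAML keys match):
+#
+#   task_categories:
+#   - conditional-text-generation
+#   task_ids:
+#   - summarization
+#   languages:
+#   - en
+#   ...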