Yacine Jernite committed on
Commit
5d377d2
1 Parent(s): 208a5e1
Files changed (2) hide show
  1. app.py +79 -0
  2. resources/sources_with_info_cards.json +0 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import streamlit as st
4
+
5
# Page-level settings (browser-tab title, icon, wide layout, sidebar state).
# This is the first Streamlit call made by the script.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto",
    page_title="BigScience Training Corpus",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
)

# Query-string parameters of the current URL; the "source" entry is read
# further down to pre-select a source in the sidebar.
query_params = st.experimental_get_query_params()
13
+
14
+
15
@st.cache()
def load_catalogue():
    """Load the source catalogue and index it by language code.

    Reads ``resources/sources_with_info_cards.json``, which is iterated as a
    sequence of ``(source_name, source)`` pairs. The entry named
    ``"aggregated"`` is dropped. Each remaining ``source`` must carry a
    ``"languages"`` list of dicts with ``"ln_code"`` and ``"size"`` keys.

    Every language code starting with ``"zh"`` is folded into the single key
    ``"zh"``, both when grouping sources and when looking up the size used
    for ordering.

    Returns:
        dict: Mapping sorted by key. ``"all"`` maps to the full catalogue in
        file order; every other key is a language code mapping
        ``source_name -> source``, ordered by decreasing ``"size"`` of that
        source's first entry for the language.
    """

    def normalize(ln_code):
        # All Chinese variants share one catalogue entry.
        return "zh" if ln_code.startswith("zh") else ln_code

    def size_for(source, ln):
        # Compare on the normalized code: comparing the raw code against the
        # folded "zh" key finds nothing for sources tagged only with a variant
        # (the original `[...][0]` raised IndexError in that case).
        return next(
            ln_dct["size"]
            for ln_dct in source["languages"]
            if normalize(ln_dct["ln_code"]) == ln
        )

    # Context manager so the file handle is closed once parsing is done.
    # JSON interchange text is UTF-8, so do not rely on the locale default.
    with open(
        "resources/sources_with_info_cards.json", encoding="utf-8"
    ) as catalogue_file:
        entries = json.load(catalogue_file)

    full_catalogue = {
        source_name: source
        for source_name, source in entries
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            ln_code = normalize(ln_dct["ln_code"])
            language_catalogues.setdefault(ln_code, {})[source_name] = source

    # Only existing keys are reassigned here, so iterating the dict is safe.
    for ln in language_catalogues:
        if ln != "all":
            language_catalogues[ln] = dict(
                sorted(
                    language_catalogues[ln].items(),
                    key=lambda item: size_for(item[1], ln),
                    reverse=True,
                )
            )
    return dict(sorted(language_catalogues.items()))
48
+
49
+
50
catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source for language:",
        catalogue_by_ln,
    )
    source_names = list(catalogue_by_ln[ln_select])
    # Pre-select the source named in the URL when it belongs to the chosen
    # language. Otherwise fall back to the first source: the URL keeps the
    # previously selected source after a language switch, and a plain
    # `.index()` lookup raises ValueError when that source is not listed.
    requested_source = query_params.get("source", [None])[0]
    if requested_source in source_names:
        default_index = source_names.index(requested_source)
    else:
        default_index = 0
    source_select = st.selectbox(
        "Show information for source:",
        catalogue_by_ln[ln_select],
        index=default_index,
    )

# Mirror the current selection into the URL.
st.experimental_set_query_params(**{"source": source_select})

source_info = catalogue_by_ln["all"][source_select]

with st.expander(f"Dataset Card for {source_select}", expanded=True):
    st.markdown(source_info["data_card"])

# Optional sections: rendered only when the source carries the given key.
optional_sections = (
    ("catalogue_info", "Catalogue Information"),
    ("seed_info", "Pseudocrawl Seed Information"),
    ("hf_info", "HF Dataset Information"),
)
for info_key, section_title in optional_sections:
    if info_key in source_info:
        with st.expander(f"{section_title} for {source_select}"):
            st.write(source_info[info_key])
resources/sources_with_info_cards.json CHANGED
The diff for this file is too large to render. See raw diff