# Spaces: bigscience / BigScienceCorpus (Running)
# File size: 2,728 Bytes

import json

import streamlit as st

# Page-level settings, gathered in one mapping and handed to Streamlit in a
# single set_page_config call.
page_settings = {
    "page_title": "BigScience Training Corpus",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**page_settings)

# Query-string parameters of the current URL, read once near the top of the
# script.
query_params = st.experimental_get_query_params()


@st.cache()
def load_catalogue():
    """Load the source catalogue and index it by language.

    Reads ``resources/sources_with_info_cards.json``, which is iterated as
    ``(source_name, source)`` pairs, and drops the ``"aggregated"`` entry.
    Each remaining source is filed under the ``ln_code`` of every item in its
    ``"languages"`` list; any code starting with ``"zh"`` is merged under the
    single key ``"zh"``.

    Returns:
        A dict, sorted by key, that maps ``"all"`` to the full catalogue (in
        file order) and every language code to a ``{source_name: source}``
        dict ordered by that source's ``"size"`` for the language, largest
        first.
    """

    def normalise(ln_code):
        # Every code starting with "zh" is grouped under the single key "zh".
        return "zh" if ln_code.startswith("zh") else ln_code

    def size_for(source, ln):
        # Match on the *normalised* code.  Comparing the raw code against
        # "zh" finds nothing for a source that only lists a variant code, and
        # indexing that empty result used to raise IndexError.  A match always
        # exists here because the source was grouped under `ln` using the
        # same normalisation.
        return next(
            ln_dct["size"]
            for ln_dct in source["languages"]
            if normalise(ln_dct["ln_code"]) == ln
        )

    # Context manager so the file handle is closed deterministically.
    with open("resources/sources_with_info_cards.json", encoding="utf-8") as f:
        entries = json.load(f)

    full_catalogue = {
        source_name: source
        for source_name, source in entries
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            ln_code = normalise(ln_dct["ln_code"])
            language_catalogues.setdefault(ln_code, {})[source_name] = source

    # "all" keeps file order; every per-language view is ordered by size.
    for ln in language_catalogues:
        if ln == "all":
            continue
        language_catalogues[ln] = dict(
            sorted(
                language_catalogues[ln].items(),
                key=lambda item: size_for(item[1], ln),
                reverse=True,
            )
        )
    return dict(sorted(language_catalogues.items()))


# Catalogue indexed by language code, plus the special "all" view.
catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source list for language:",
        catalogue_by_ln,
    )

    # Pre-select the source named in the URL's `source` parameter, but only in
    # the "all" view; the per-language views always start at their first
    # entry.  The parameter is user-controlled, so a missing, empty, or
    # unknown value falls back to the first entry instead of letting
    # list.index() raise ValueError.
    default_index = 0
    if ln_select == "all":
        source_names = list(catalogue_by_ln[ln_select])
        requested = (query_params.get("source") or [None])[0]
        if requested in source_names:
            default_index = source_names.index(requested)

    source_select = st.selectbox(
        "Show information for source:",
        catalogue_by_ln[ln_select],
        index=default_index,
    )
    # Write the current choice back to the URL's `source` parameter.
    st.experimental_set_query_params(source=source_select)

# Main panel: the data card of the selected source, expanded by default.
with st.expander(f"Dataset Card for {source_select}", expanded=True):
    # All detail sections read from the entry in the "all" view.
    source_entry = catalogue_by_ln["all"][source_select]
    st.markdown(source_entry["data_card"])

# Optional sections in display order; each one is rendered (collapsed) only
# when the selected entry carries the corresponding key.
optional_sections = {
    "catalogue_info": f"Catalogue Information for {source_select}",
    "seed_info": f"Pseudocrawl Seed Information for {source_select}",
    "hf_info": f"HF Dataset Information for {source_select}",
}
for info_key, expander_title in optional_sections.items():
    if info_key not in source_entry:
        continue
    with st.expander(expander_title):
        st.write(source_entry[info_key])