Spaces:

bigscience
/

SourcingCatalog

Running

File size: 12,361 Bytes

import json

import streamlit as st
from datasets import load_dataset
from streamlit_folium import folium_static

from catalogue import make_choro_map, region_tree

##################
## streamlit
##################
# Page-level Streamlit configuration: browser tab title, icon URL, wide layout
# and automatic sidebar state. This runs at import time and is the first st.*
# call in this file.
# NOTE(review): Streamlit documents that set_page_config has to precede other
# Streamlit commands, so keep it at the top when adding module-level code.
# NOTE(review): the title still says "Input Form" although main() only renders
# the catalogue visualization page — confirm whether that is intended.
st.set_page_config(
    page_title="BigScience Language Resource Catalogue Input Form",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)

# URL query parameters, read once at import time.
# NOTE(review): not referenced anywhere else in the visible part of this file;
# confirm it is still needed before removing it.
query_params = st.experimental_get_query_params()


def main():
    """Ensure the session holds a ``save_state`` dict, then render the page."""
    session = st.session_state
    if "save_state" not in session:
        # First run of this session: start from an empty saved state.
        session.save_state = {}

    viz_page()


##################
## SECTION: Explore the current catalogue
##################

def _load_json_resource(path):
    """Parse the UTF-8 encoded JSON file at *path* and return the decoded object.

    The file is opened in a ``with`` block so its handle is closed as soon as
    parsing finishes rather than being left open until garbage collection.
    """
    with open(path, encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Option lists and lookup tables used by the app, built once at import time.
# Literal entries are defined inline; the others are read from JSON files
# under ``resources/`` (paths are relative to the working directory).
app_categories = {
    # Maps the entry type stored in the catalogue to its display label.
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json_resource("resources/language_lists.json"),
    "programming_languages": list(
        _load_json_resource("resources/programming_languages.json")[
            "itemListElement"
        ]
    ),
    # Only the subtags whose "type" is "language" are kept.
    "languages_bcp47": [
        subtag
        for subtag in _load_json_resource("resources/bcp47.json")["subtags"]
        if subtag["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json_resource("resources/pii_categories.json"),
    "licenses": _load_json_resource("resources/licenses.json"),
    "primary_taxonomy": _load_json_resource(
        "resources/primary_source_taxonomy.json"
    ),
    "file_formats": _load_json_resource("resources/file_formats.json"),
}


def filter_entry(entry, filter_dct):
    """Tell whether *entry* satisfies every constraint in *filter_dct*.

    Only keys present in both mappings are checked:

    * dict value: checked recursively against the nested filter;
    * list value: passes when the filter is empty or shares an element with it;
    * any other value: passes when the filter is empty or contains the value.

    Returns ``True`` when all checked keys pass (including when none apply),
    ``False`` otherwise.
    """
    for key, value in entry.items():
        if key not in filter_dct:
            continue
        allowed = filter_dct[key]
        if isinstance(value, dict):
            matched = filter_entry(value, allowed)
        elif isinstance(value, list):
            matched = len(allowed) == 0 or any(item in allowed for item in value)
        else:
            matched = len(allowed) == 0 or value in allowed
        if not matched:
            # One failed constraint settles the answer.
            return False
    return True


def filter_catalogue_visualization(catalogue, options):
    """Render the filter widgets and return the catalogue entries that match.

    A first multiselect lets the user choose which properties to filter on;
    for each chosen property one or more follow-up multiselects are rendered.
    The selections are collected into a nested dict whose keys mirror the
    entry layout and passed to ``filter_entry``.

    Args:
        catalogue: Iterable of catalogue entries (mappings with a "uid" key).
        options: Option tables; this function reads "entry_types",
            "language_lists", "custodian_types" and "primary_taxonomy".

    Returns:
        List of the entries that pass every active filter and whose "uid" is
        not the empty string.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )
    filter_by_options = [
        "resource type",
        "language names",
        "custodian type",
        "available for download",
        "license type",
        "source type",
        "media type",
    ]
    filter_by = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=filter_by_options,
    )
    # Nested filter specification; an empty selection means "no constraint".
    filter_dict = {}
    if "resource type" in filter_by:
        filter_dict["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=options["entry_types"],
            format_func=lambda x: options["entry_types"][x],
        )
    if "language names" in filter_by:
        language_lists = options["language_lists"]
        filter_dict["languages"] = {
            "language_names": st.multiselect(
                key="viz_filter_languages_language_names",
                label="I want to only see entries that have one of the following languages:",
                options=list(language_lists["language_groups"].keys())
                + language_lists["niger_congo_languages"]
                + language_lists["indic_languages"],
            )
        }
    if "custodian type" in filter_by:
        filter_dict["custodian"] = {
            "type": st.multiselect(
                key="viz_filter_custodian_type",
                label="I want to only see entries that correspond to organizations or to data that is owned/managed by organizations of the following types:",
                options=options["custodian_types"],
            )
        }
    if "available for download" in filter_by:
        download_options = [
            "No - but the current owners/custodians have contact information for data queries",
            "No - we would need to spontaneously reach out to the current owners/custodians",
            "Yes - it has a direct download link or links",
            "Yes - after signing a user agreement",
        ]
        # "availability" is shared with the license filter, so reuse the
        # sub-dict when it already exists instead of overwriting it.
        filter_dict.setdefault("availability", {})["procurement"] = {
            "for_download": st.multiselect(
                key="viz_availability_procurement_for_download",
                label="Select based on whether the data can be obtained online:",
                options=download_options,
            )
        }
    if "license type" in filter_by:
        filter_dict.setdefault("availability", {})["licensing"] = {
            "license_properties": st.multiselect(
                key="viz_availability_licensing_license_properties",
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
            )
        }
        primary_license_options = [
            "Unclear / I don't know",
            "Yes - the source material has an open license that allows re-use",
            "Yes - the dataset has the same license as the source material",
            "Yes - the dataset curators have obtained consent from the source material owners",
            "No - the license of the source material actually prohibits re-use in this manner",
        ]
        # "processed_from_primary" is shared with the source type filter.
        filter_dict.setdefault("processed_from_primary", {})[
            "primary_license"
        ] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=primary_license_options,
        )
    if "source type" in filter_by:
        taxonomy = options["primary_taxonomy"]
        source_category = {}
        filter_dict["source_category"] = source_category
        source_category["category_type"] = st.multiselect(
            key="viz_source_category_category_type",
            label="Select primary sources that correspond to:",
            options=["collection", "website"],
        )
        source_category["category_web"] = st.multiselect(
            key="viz_source_category_category_web",
            label="Select web-based primary sources that contain:",
            options=taxonomy["website"],
        )
        source_category["category_media"] = st.multiselect(
            key="viz_source_category_category_media",
            label="Select primary sources that are collections of:",
            options=taxonomy["collection"],
        )
        filter_dict.setdefault("processed_from_primary", {})[
            "primary_types"
        ] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=[f"web | {w}" for w in taxonomy["website"]]
            + taxonomy["collection"],
        )
    if "media type" in filter_by:
        filter_dict["media"] = {
            "category": st.multiselect(
                key="viz_media_category",
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
            )
        }
    # Entries with an empty uid are always left out of the result.
    filtered_catalogue = [
        entry
        for entry in catalogue
        if filter_entry(entry, filter_dict) and entry["uid"] != ""
    ]
    st.markdown(
        f"##### Your query matched **{len(filtered_catalogue)}** entries in the current catalogue."
    )
    return filtered_catalogue


def _entry_locations(entry, show_by_org):
    """Return the locations of *entry* for the selected view, as a list.

    When *show_by_org* is true this is the single custodian location wrapped
    in a list; otherwise it is the entry's list of language locations.
    """
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the catalogue exploration page.

    Loads the "train" split of ``bigscience/collaborative_catalog``, shows the
    filter widgets and the location-type choice in the sidebar, draws a
    choropleth map of per-location entry counts, and lets the user narrow the
    selection by location and inspect one entry in detail.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]
    by_org_label = "Where the organizations or data custodians are located"
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                by_org_label,
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = entry_location_type == by_org_label
    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            locations = _entry_locations(entry, show_by_org)
            # Be as specific as possible: drop a location when another of the
            # entry's locations is listed under it in region_tree (presumably
            # its sub-regions), so only the most specific ones are counted.
            locations = [
                loc
                for loc in locations
                if not any(other in region_tree.get(loc, []) for other in locations)
            ]
            for loc in locations:
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)
    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # The selectbox yields None when there is nothing to choose from
            # (no entry matched); indexing into it would raise a TypeError.
            st.write("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)


if __name__ == "__main__":
    main()