"""Streamlit page for filtering and visualizing entries of the BigScience
catalogue of language resources."""
import json

import streamlit as st
from datasets import load_dataset
from streamlit_folium import folium_static

from catalogue import make_choro_map, region_tree

##################
## streamlit
##################
st.set_page_config(
    page_title="BigScience Language Resource Catalogue Input Form",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)

# NOTE(review): not read anywhere in the visible code; kept because it is a
# module-level call whose removal could matter to code outside this view.
query_params = st.experimental_get_query_params()


def main():
    """Initialise the per-session ``save_state`` dict and render the page."""
    if "save_state" not in st.session_state:
        st.session_state.save_state = {}
    viz_page()


##################
## SECTION: Explore the current catalogue
##################
def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and close the file afterwards."""
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


# Option lists used to populate the filter widgets, loaded once at import time.
app_categories = {
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json("resources/language_lists.json"),
    "programming_languages": list(
        _load_json("resources/programming_languages.json")["itemListElement"]
    ),
    "languages_bcp47": [
        x
        for x in _load_json("resources/bcp47.json")["subtags"]
        if x["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json("resources/pii_categories.json"),
    "licenses": _load_json("resources/licenses.json"),
    "primary_taxonomy": _load_json("resources/primary_source_taxonomy.json"),
    "file_formats": _load_json("resources/file_formats.json"),
}


def filter_entry(entry, filter_dct):
    """Return True when ``entry`` satisfies every constraint in ``filter_dct``.

    ``filter_dct`` mirrors the nesting of ``entry``. Only keys present in both
    are checked:

    * dict value  -> checked recursively against ``filter_dct[key]``;
    * list value  -> passes if at least one element is in ``filter_dct[key]``;
    * other value -> passes if it is in ``filter_dct[key]``.

    An empty ``filter_dct[key]`` means "no constraint" for list and scalar
    values. Keys of ``entry`` that are absent from ``filter_dct`` are ignored.
    """
    for key, value in entry.items():
        if key not in filter_dct:
            continue
        allowed = filter_dct[key]
        if isinstance(value, dict):
            if not filter_entry(value, allowed):
                return False
        elif isinstance(value, list):
            if len(allowed) != 0 and not any(e in allowed for e in value):
                return False
        elif len(allowed) != 0 and value not in allowed:
            return False
    return True


def filter_catalogue_visualization(catalogue, options):
    """Render the filter widgets and return the matching catalogue entries.

    Args:
        catalogue: iterable of entry dicts; each entry is indexed by ``"uid"``
            here and passed to ``filter_entry``.
        options: dict of option lists for the widgets; this function reads the
            ``"entry_types"``, ``"language_lists"``, ``"custodian_types"`` and
            ``"primary_taxonomy"`` keys.

    Returns:
        The list of entries that pass the selected filters and whose ``uid`` is
        not the empty string.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        + " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )
    filter_by_options = [
        "resource type",
        "language names",
        "custodian type",
        "available for download",
        "license type",
        "source type",
        "media type",
    ]
    filter_by = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=filter_by_options,
    )
    # Built to mirror the nesting of a catalogue entry so that filter_entry can
    # walk both structures in parallel.
    filter_dict = {}
    if "resource type" in filter_by:
        filter_dict["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=options["entry_types"],
            format_func=lambda x: options["entry_types"][x],
        )
    if "language names" in filter_by:
        filter_dict["languages"] = {}
        filter_dict["languages"]["language_names"] = st.multiselect(
            key="viz_filter_languages_language_names",
            label="I want to only see entries that have one of the following languages:",
            options=list(options["language_lists"]["language_groups"].keys())
            + options["language_lists"]["niger_congo_languages"]
            + options["language_lists"]["indic_languages"],
        )
    if "custodian type" in filter_by:
        filter_dict["custodian"] = {}
        filter_dict["custodian"]["type"] = st.multiselect(
            key="viz_filter_custodian_type",
            label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
            options=options["custodian_types"],
        )
    if "available for download" in filter_by:
        # "availability" is shared with the license filter below, so reuse it
        # if it already exists instead of overwriting it.
        availability = filter_dict.setdefault("availability", {})
        availability["procurement"] = {}
        download_options = [
            "No - but the current owners/custodians have contact information for data queries",
            "No - we would need to spontaneously reach out to the current owners/custodians",
            "Yes - it has a direct download link or links",
            "Yes - after signing a user agreement",
        ]
        availability["procurement"]["for_download"] = st.multiselect(
            key="viz_availability_procurement_for_download",
            label="Select based on whether the data can be obtained online:",
            options=download_options,
        )
    if "license type" in filter_by:
        availability = filter_dict.setdefault("availability", {})
        availability["licensing"] = {}
        availability["licensing"]["license_properties"] = st.multiselect(
            key="viz_availability_licensing_license_properties",
            label="Select primary entries that have the following license types",
            options=[
                "public domain",
                "multiple licenses",
                "copyright - all rights reserved",
                "open license",
                "research use",
                "non-commercial use",
                "do not distribute",
            ],
        )
        primary_license_options = [
            "Unclear / I don't know",
            "Yes - the source material has an open license that allows re-use",
            "Yes - the dataset has the same license as the source material",
            "Yes - the dataset curators have obtained consent from the source material owners",
            "No - the license of the source material actually prohibits re-use in this manner",
        ]
        # "processed_from_primary" is shared with the source-type filter below.
        processed = filter_dict.setdefault("processed_from_primary", {})
        processed["primary_license"] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=primary_license_options,
        )
    if "source type" in filter_by:
        filter_dict["source_category"] = {}
        filter_dict["source_category"]["category_type"] = st.multiselect(
            key="viz_source_category_category_type",
            label="Select primary sources that correspond to:",
            options=["collection", "website"],
        )
        filter_dict["source_category"]["category_web"] = st.multiselect(
            key="viz_source_category_category_web",
            label="Select web-based primary sources that contain:",
            options=options["primary_taxonomy"]["website"],
        )
        filter_dict["source_category"]["category_media"] = st.multiselect(
            key="viz_source_category_category_media",
            label="Select primary sources that are collections of:",
            options=options["primary_taxonomy"]["collection"],
        )
        processed = filter_dict.setdefault("processed_from_primary", {})
        processed["primary_types"] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=[f"web | {w}" for w in options["primary_taxonomy"]["website"]]
            + options["primary_taxonomy"]["collection"],
        )
    if "media type" in filter_by:
        filter_dict["media"] = {}
        filter_dict["media"]["category"] = st.multiselect(
            key="viz_media_category",
            label="Select language data resources that contain:",
            options=["text", "audiovisual", "image"],
            help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
        )
    filtered_catalogue = [
        entry
        for entry in catalogue
        if filter_entry(entry, filter_dict) and entry["uid"] != ""
    ]
    st.markdown(
        f"##### Your query matched **{len(filtered_catalogue)}** entries in the current catalogue."
    )
    return filtered_catalogue


def _entry_locations(entry, show_by_org):
    """Return the entry's custodian location (as a one-element list) when
    ``show_by_org`` is true, otherwise its list of language locations."""
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the page: sidebar filters, a choropleth map of entry counts per
    location, and a detail view for one selected entry."""
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                "Where the organizations or data custodians are located",
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = (
            entry_location_type
            == "Where the organizations or data custodians are located"
        )
    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            candidates = _entry_locations(entry, show_by_org)
            # Be as specific as possible: drop a location when another listed
            # location appears in its region_tree entry (presumably the regions
            # it contains -- confirm against the catalogue module).
            locations = [
                loc
                for loc in candidates
                if not any(other in region_tree.get(loc, []) for other in candidates)
            ]
            for loc in locations:
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)
    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # st.selectbox returns None when there is nothing to select (no
            # entry matched the filters); subscripting it would raise TypeError.
            st.info("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)


if __name__ == "__main__":
    main()