SourcingCatalog / app.py
Yacine Jernite
narrower
0b3f7a0
import json
import streamlit as st
from datasets import load_dataset
from streamlit_folium import folium_static
from catalogue import make_choro_map, region_tree
##################
## streamlit
##################
# Page-level presentation settings, gathered in one mapping and handed to
# Streamlit in a single call.
_PAGE_SETTINGS = {
    "page_title": "BigScience Language Resource Catalogue Input Form",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)

# Query-string parameters of the current request. Nothing in the visible part
# of this file reads them; the name is kept at module level for any other user.
query_params = st.experimental_get_query_params()
def main():
    """Make sure the session has a ``save_state`` slot, then draw the page."""
    state = st.session_state
    already_initialised = "save_state" in state
    if not already_initialised:
        # First run of this session: start from an empty saved state.
        state["save_state"] = {}
    viz_page()
##################
## SECTION: Explore the current catalogue
##################
def _load_json_resource(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path, encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Option lists and lookup tables used to populate the catalogue widgets.
# Most values are read from JSON files under ``resources/`` at import time;
# "entry_types" and "custodian_types" are defined inline.
app_categories = {
    # Maps the stored entry type code to the label shown in the UI.
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json_resource("resources/language_lists.json"),
    "programming_languages": list(
        _load_json_resource("resources/programming_languages.json")[
            "itemListElement"
        ]
    ),
    # Only the subtags whose "type" is "language" are kept.
    "languages_bcp47": [
        x
        for x in _load_json_resource("resources/bcp47.json")["subtags"]
        if x["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json_resource("resources/pii_categories.json"),
    "licenses": _load_json_resource("resources/licenses.json"),
    "primary_taxonomy": _load_json_resource(
        "resources/primary_source_taxonomy.json"
    ),
    "file_formats": _load_json_resource("resources/file_formats.json"),
}
def _has_selection(allowed):
    """Return True when ``allowed`` holds at least one selected value.

    ``allowed`` is either a flat collection of accepted values or a dict of
    nested filters; a dict counts as selected when any of its values does.
    """
    if isinstance(allowed, dict):
        return any(_has_selection(sub_filter) for sub_filter in allowed.values())
    return len(allowed) > 0


def filter_entry(entry, filter_dct):
    """Tell whether ``entry`` satisfies every constraint in ``filter_dct``.

    Args:
        entry: dict describing one catalogue entry; values may be nested
            dicts, lists, or scalars.
        filter_dct: dict with the same nesting as ``entry`` whose leaves are
            collections of accepted values. Keys of ``entry`` that are absent
            from ``filter_dct`` are not constrained.

    Returns:
        ``True`` if, for every key present in both dicts, the entry value is
        accepted: a dict value is checked recursively, a list value needs at
        least one element among the accepted values, and any other value must
        itself be accepted. A filter with nothing selected accepts everything.
    """
    for key, value in entry.items():
        if key not in filter_dct:
            continue
        allowed = filter_dct[key]
        if isinstance(value, dict):
            matched = filter_entry(value, allowed)
        elif not _has_selection(allowed):
            # Nothing selected means "no constraint". This also covers a
            # nested filter whose leaves are all empty when the entry value is
            # not a dict (e.g. None), which would otherwise be rejected.
            matched = True
        elif isinstance(value, list):
            matched = any(item in allowed for item in value)
        else:
            matched = value in allowed
        if not matched:
            # One failed constraint settles the answer; stop scanning.
            return False
    return True
def filter_catalogue_visualization(catalogue, options):
    """Draw the filter widgets and return the catalogue entries they select.

    The user first picks which properties to filter on; one or more
    multiselect widgets are then shown for each picked property. The chosen
    values are collected into a nested dict that is passed to
    ``filter_entry`` for every entry.

    Args:
        catalogue: iterable of entry dicts; each entry is indexed by ``"uid"``.
        options: dict of option lists; the keys read here are
            ``"entry_types"``, ``"language_lists"``, ``"custodian_types"``
            and ``"primary_taxonomy"``.

    Returns:
        List of the entries accepted by ``filter_entry`` whose ``"uid"`` is
        not the empty string.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )
    active_properties = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=[
            "resource type",
            "language names",
            "custodian type",
            "available for download",
            "license type",
            "source type",
            "media type",
        ],
    )
    # Nested selection mirroring the layout of a catalogue entry.
    criteria = {}

    if "resource type" in active_properties:
        entry_type_labels = options["entry_types"]
        criteria["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=entry_type_labels,
            format_func=lambda type_code: entry_type_labels[type_code],
        )

    if "language names" in active_properties:
        language_lists = options["language_lists"]
        language_choices = (
            list(language_lists["language_groups"].keys())
            + language_lists["niger_congo_languages"]
            + language_lists["indic_languages"]
        )
        criteria["languages"] = {
            "language_names": st.multiselect(
                key="viz_filter_languages_language_names",
                label="I want to only see entries that have one of the following languages:",
                options=language_choices,
            )
        }

    if "custodian type" in active_properties:
        criteria["custodian"] = {
            "type": st.multiselect(
                key="viz_filter_custodian_type",
                label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
                options=options["custodian_types"],
            )
        }

    if "available for download" in active_properties:
        # "availability" is shared with the license filter below, so reuse an
        # existing sub-dict instead of overwriting it.
        availability = criteria.setdefault("availability", {})
        availability["procurement"] = {
            "for_download": st.multiselect(
                key="viz_availability_procurement_for_download",
                label="Select based on whether the data can be obtained online:",
                options=[
                    "No - but the current owners/custodians have contact information for data queries",
                    "No - we would need to spontaneously reach out to the current owners/custodians",
                    "Yes - it has a direct download link or links",
                    "Yes - after signing a user agreement",
                ],
            )
        }

    if "license type" in active_properties:
        availability = criteria.setdefault("availability", {})
        availability["licensing"] = {
            "license_properties": st.multiselect(
                key="viz_availability_licensing_license_properties",
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
            )
        }
        # "processed_from_primary" is shared with the source type filter.
        from_primary = criteria.setdefault("processed_from_primary", {})
        from_primary["primary_license"] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=[
                "Unclear / I don't know",
                "Yes - the source material has an open license that allows re-use",
                "Yes - the dataset has the same license as the source material",
                "Yes - the dataset curators have obtained consent from the source material owners",
                "No - the license of the source material actually prohibits re-use in this manner",
            ],
        )

    if "source type" in active_properties:
        taxonomy = options["primary_taxonomy"]
        # The three widgets are created in the order they appear in the dict.
        criteria["source_category"] = {
            "category_type": st.multiselect(
                key="viz_source_category_category_type",
                label="Select primary sources that correspond to:",
                options=["collection", "website"],
            ),
            "category_web": st.multiselect(
                key="viz_source_category_category_web",
                label="Select web-based primary sources that contain:",
                options=taxonomy["website"],
            ),
            "category_media": st.multiselect(
                key="viz_source_category_category_media",
                label="Select primary sources that are collections of:",
                options=taxonomy["collection"],
            ),
        }
        from_primary = criteria.setdefault("processed_from_primary", {})
        web_choices = [f"web | {w}" for w in taxonomy["website"]]
        from_primary["primary_types"] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=web_choices + taxonomy["collection"],
        )

    if "media type" in active_properties:
        criteria["media"] = {
            "category": st.multiselect(
                key="viz_media_category",
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
            )
        }

    matching_entries = []
    for candidate in catalogue:
        # Entries with an empty uid are never shown.
        if filter_entry(candidate, criteria) and candidate["uid"] != "":
            matching_entries.append(candidate)

    st.markdown(
        f"##### Your query matched **{len(matching_entries)}** entries in the current catalogue."
    )
    return matching_entries
def _entry_locations(entry, show_by_org):
    """Return the locations of ``entry`` for the selected point of view.

    With ``show_by_org`` true this is a one-element list holding the
    custodian location; otherwise it is the entry's list of language
    locations.
    """
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the catalogue exploration page.

    Loads the ``bigscience/collaborative_catalog`` dataset, lets the user
    filter it, draws a choropleth map of entry counts per location, and shows
    the details of one selected entry.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]
    # NOTE(review): the original indentation was lost; the radio button is
    # assumed to sit in the sidebar with the filters. Confirm against the
    # deployed layout.
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                "Where the organizations or data custodians are located",
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = (
            entry_location_type
            == "Where the organizations or data custodians are located"
        )
    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            locations = _entry_locations(entry, show_by_org)
            # Be as specific as possible: drop a location when another
            # location of the same entry is listed under it in region_tree
            # (presumably a region -> contained locations mapping; verify).
            locations = [
                loc
                for loc in locations
                if not any(other in region_tree.get(loc, []) for other in locations)
            ]
            for loc in locations:
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)
    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # With no options the select box yields None; subscripting it
            # below would raise a TypeError.
            st.markdown("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)
# Build the page only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()