Spaces:
Runtime error
Runtime error
File size: 12,361 Bytes
bcc2d25 0b3f7a0 bcc2d25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 |
import json
import streamlit as st
from datasets import load_dataset
from streamlit_folium import folium_static
from catalogue import make_choro_map, region_tree
##################
## streamlit
##################
# Page-wide Streamlit settings: browser tab title and icon, wide layout, and
# the default sidebar behaviour.
_PAGE_SETTINGS = {
    "page_title": "BigScience Language Resource Catalogue Input Form",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)

# URL query parameters captured once at start-up. Nothing in the visible part
# of this file reads them.
# NOTE(review): `experimental_get_query_params` is deprecated in recent
# Streamlit releases in favour of `st.query_params` -- confirm against the
# Streamlit version this app is pinned to.
query_params = st.experimental_get_query_params()
def main():
    """Make sure the session holds a ``save_state`` dict, then render the page."""
    session = st.session_state
    if "save_state" not in session:
        # First run of this session: start from an empty state dict.
        session["save_state"] = {}
    viz_page()
##################
## SECTION: Explore the current catalogue
##################
def _load_json(path):
    """Parse the UTF-8 encoded JSON file at *path* and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)


# Option lists used by the catalogue widgets. Fixed choices are spelled out
# inline; the others are read from JSON files under ``resources/`` (paths are
# relative to the current working directory) when the module is loaded.
app_categories = {
    # Entry type identifier -> human-readable label.
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json("resources/language_lists.json"),
    "programming_languages": list(
        _load_json("resources/programming_languages.json")["itemListElement"]
    ),
    # Keep only the subtags whose type is "language".
    "languages_bcp47": [
        subtag
        for subtag in _load_json("resources/bcp47.json")["subtags"]
        if subtag["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json("resources/pii_categories.json"),
    "licenses": _load_json("resources/licenses.json"),
    "primary_taxonomy": _load_json("resources/primary_source_taxonomy.json"),
    "file_formats": _load_json("resources/file_formats.json"),
}
def filter_entry(entry, filter_dct):
    """Return True when *entry* satisfies every constraint in *filter_dct*.

    *filter_dct* mirrors the nesting of *entry*: for a key whose entry value
    is a dict, the filter value is itself a filter dict and is checked
    recursively; otherwise the filter value is a collection of accepted
    values.

    Rules applied per key present in both mappings:
      * dict entry value   -> recurse with the nested filter;
      * empty filter value -> no constraint (everything is accepted);
      * list entry value   -> at least one element must be accepted;
      * any other value    -> the value itself must be accepted.

    Keys of *filter_dct* that are absent from *entry* impose no constraint.
    """
    for key, value in entry.items():
        if key not in filter_dct:
            continue
        accepted = filter_dct[key]
        if isinstance(value, dict):
            if not filter_entry(value, accepted):
                return False
        elif not accepted:
            # An empty selection means "select all values".
            continue
        elif isinstance(value, list):
            if not any(item in accepted for item in value):
                return False
        elif value not in accepted:
            return False
    return True
def filter_catalogue_visualization(catalogue, options):
    """Render the filter widgets and return the matching catalogue entries.

    The user first picks which properties to filter on; one or more
    multiselect widgets are then shown for each chosen property. The
    selections are assembled into a nested filter dict that is checked against
    every entry with ``filter_entry``. A multiselect left empty places no
    constraint on its field.

    Args:
        catalogue: iterable of entry dicts; each entry must have a ``"uid"``
            key (entries whose uid is the empty string are always dropped).
        options: mapping providing the widget choices. The keys read here are
            ``"entry_types"``, ``"language_lists"``, ``"custodian_types"`` and
            ``"primary_taxonomy"``.

    Returns:
        The list of entries that pass all selected filters.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )
    filter_by_options = [
        "resource type",
        "language names",
        "custodian type",
        "available for download",
        "license type",
        "source type",
        "media type",
    ]
    filter_by = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=filter_by_options,
    )

    # Nested dict mirroring the entry structure; leaves are lists of accepted
    # values as returned by the multiselect widgets.
    filter_dict = {}

    if "resource type" in filter_by:
        filter_dict["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=options["entry_types"],
            format_func=lambda x: options["entry_types"][x],
        )

    if "language names" in filter_by:
        language_lists = options["language_lists"]
        filter_dict["languages"] = {
            "language_names": st.multiselect(
                key="viz_filter_languages_language_names",
                label="I want to only see entries that have one of the following languages:",
                options=list(language_lists["language_groups"].keys())
                + language_lists["niger_congo_languages"]
                + language_lists["indic_languages"],
            )
        }

    if "custodian type" in filter_by:
        filter_dict["custodian"] = {
            "type": st.multiselect(
                key="viz_filter_custodian_type",
                label="I want to only see entries that correspond to organizations or to data that is owned/managed by organizations of the following types:",
                options=options["custodian_types"],
            )
        }

    if "available for download" in filter_by:
        download_options = [
            "No - but the current owners/custodians have contact information for data queries",
            "No - we would need to spontaneously reach out to the current owners/custodians",
            "Yes - it has a direct download link or links",
            "Yes - after signing a user agreement",
        ]
        # "availability" is shared with the license filter below, so reuse the
        # sub-dict if it already exists.
        availability = filter_dict.setdefault("availability", {})
        availability["procurement"] = {
            "for_download": st.multiselect(
                key="viz_availability_procurement_for_download",
                label="Select based on whether the data can be obtained online:",
                options=download_options,
            )
        }

    if "license type" in filter_by:
        availability = filter_dict.setdefault("availability", {})
        availability["licensing"] = {
            "license_properties": st.multiselect(
                key="viz_availability_licensing_license_properties",
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
            )
        }
        primary_license_options = [
            "Unclear / I don't know",
            "Yes - the source material has an open license that allows re-use",
            "Yes - the dataset has the same license as the source material",
            "Yes - the dataset curators have obtained consent from the source material owners",
            "No - the license of the source material actually prohibits re-use in this manner",
        ]
        # "processed_from_primary" is shared with the source-type filter.
        from_primary = filter_dict.setdefault("processed_from_primary", {})
        from_primary["primary_license"] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=primary_license_options,
        )

    if "source type" in filter_by:
        taxonomy = options["primary_taxonomy"]
        source_category = {}
        filter_dict["source_category"] = source_category
        source_category["category_type"] = st.multiselect(
            key="viz_source_category_category_type",
            label="Select primary sources that correspond to:",
            options=["collection", "website"],
        )
        source_category["category_web"] = st.multiselect(
            key="viz_source_category_category_web",
            label="Select web-based primary sources that contain:",
            options=taxonomy["website"],
        )
        source_category["category_media"] = st.multiselect(
            key="viz_source_category_category_media",
            label="Select primary sources that are collections of:",
            options=taxonomy["collection"],
        )
        from_primary = filter_dict.setdefault("processed_from_primary", {})
        from_primary["primary_types"] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=[f"web | {w}" for w in taxonomy["website"]]
            + taxonomy["collection"],
        )

    if "media type" in filter_by:
        filter_dict["media"] = {
            "category": st.multiselect(
                key="viz_media_category",
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
            )
        }

    # Entries with an empty uid are never shown, whatever the filters say.
    filtered_catalogue = [
        entry
        for entry in catalogue
        if filter_entry(entry, filter_dict) and entry["uid"] != ""
    ]
    st.markdown(
        f"##### Your query matched **{len(filtered_catalogue)}** entries in the current catalogue."
    )
    return filtered_catalogue
def _entry_locations(entry, show_by_org):
    """Return the list of locations to use for *entry*.

    When *show_by_org* is true this is a one-element list holding the
    custodian's location; otherwise it is the entry's language locations.
    """
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def _most_specific_locations(locations):
    """Keep only the locations that have none of their sub-regions in the list.

    Presumably ``region_tree`` maps a region to the regions it contains, so a
    broad region is dropped whenever a more specific one is also listed --
    verify against ``catalogue.region_tree``.
    """
    specific = []
    for loc in locations:
        # Look the region up once instead of once per comparison.
        sub_regions = region_tree.get(loc, [])
        if not any(other in sub_regions for other in locations):
            specific.append(loc)
    return specific


def viz_page():
    """Render the catalogue exploration page.

    Loads the ``train`` split of ``bigscience/collaborative_catalog``, shows
    the filter widgets and the location-type choice in the sidebar, draws a
    choropleth map of entry counts per location, and lets the user narrow the
    selection by location and inspect a single entry.

    NOTE(review): the nesting of the ``with`` blocks was reconstructed from a
    source whose indentation was lost; the radio button is assumed to live in
    the sidebar -- confirm against the deployed layout.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]

    by_org_label = "Where the organizations or data custodians are located"
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                by_org_label,
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
    show_by_org = entry_location_type == by_org_label

    with st.expander("Map of entries", expanded=True):
        # Number of filtered entries per location.
        filtered_counts = {}
        for entry in filtered_catalogue:
            # Be as specific as possible: count an entry for its narrowest
            # listed locations only.
            for loc in _most_specific_locations(_entry_locations(entry, show_by_org)):
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)

    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # The selectbox yields None when there is nothing to choose from;
            # subscripting it would raise a TypeError.
            st.markdown("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)
# Entry point: build the page only when this file is run as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|