File size: 12,361 Bytes
bcc2d25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b3f7a0
bcc2d25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
import json

import streamlit as st
from datasets import load_dataset
from streamlit_folium import folium_static

from catalogue import make_choro_map, region_tree

##################
## streamlit
##################
# Configure the browser-tab title, the favicon and the page layout.
# NOTE(review): Streamlit's documentation requires set_page_config to be the
# first Streamlit command that runs, so keep it ahead of any other st.* call.
# NOTE(review): the title says "Input Form", but main() below only renders the
# visualization page -- confirm whether the title is still the intended one.
st.set_page_config(
    page_title="BigScience Language Resource Catalogue Input Form",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)

# URL query parameters as returned by Streamlit when the script starts.
# NOTE(review): nothing in the visible part of this file reads query_params;
# confirm it is unused elsewhere before removing it.
query_params = st.experimental_get_query_params()


def main():
    """Make sure the session holds a ``save_state`` dict, then draw the page."""
    session = st.session_state
    if "save_state" not in session:
        # First run of this session: start from an empty state dict.
        session["save_state"] = {}

    viz_page()


##################
## SECTION: Explore the current catalogue
##################

def _load_json_resource(path):
    """Parse the UTF-8 encoded JSON file at *path* and return the decoded object.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as parsing finishes, instead of being left open until garbage collection.
    """
    with open(path, encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Option lists and taxonomies that feed the filter widgets.
# Every ``resources/*.json`` file is read once, when this module is executed;
# the paths are relative, so they resolve against the current working directory.
app_categories = {
    # Maps the stored entry-type key to the label shown to the user.
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json_resource("resources/language_lists.json"),
    "programming_languages": list(
        _load_json_resource("resources/programming_languages.json")[
            "itemListElement"
        ]
    ),
    # Keep only the BCP-47 subtags whose "type" field is "language".
    "languages_bcp47": [
        subtag
        for subtag in _load_json_resource("resources/bcp47.json")["subtags"]
        if subtag["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json_resource("resources/pii_categories.json"),
    "licenses": _load_json_resource("resources/licenses.json"),
    "primary_taxonomy": _load_json_resource(
        "resources/primary_source_taxonomy.json"
    ),
    "file_formats": _load_json_resource("resources/file_formats.json"),
}


def filter_entry(entry, filter_dct):
    """Tell whether *entry* satisfies every constraint in *filter_dct*.

    Only keys present in both mappings are checked; a key of *entry* that is
    absent from *filter_dct* places no constraint on the entry.  For a shared
    key the rule depends on the type of the entry's value:

    * ``dict``  -- checked recursively against the nested filter;
    * ``list``  -- passes when the filter is empty or shares at least one
      element with the list;
    * otherwise -- passes when the filter is empty or contains the value.

    Returns ``True`` when all checked keys pass, ``False`` otherwise.
    """
    for field, value in entry.items():
        if field not in filter_dct:
            continue
        wanted = filter_dct[field]
        if isinstance(value, dict):
            matched = filter_entry(value, wanted)
        elif isinstance(value, list):
            matched = len(wanted) == 0 or any(item in wanted for item in value)
        else:
            matched = len(wanted) == 0 or value in wanted
        if not matched:
            # One failed constraint is enough to reject the entry.
            return False
    return True


def filter_catalogue_visualization(catalogue, options):
    """Draw the catalogue filter widgets and return the entries that match.

    Args:
        catalogue: iterable of catalogue entries; each entry is indexed with
            ``entry["uid"]`` and handed to ``filter_entry``.
        options: mapping that supplies the widget choices; the keys read here
            are ``entry_types``, ``language_lists``, ``custodian_types`` and
            ``primary_taxonomy``.

    Returns:
        A list with the entries that pass every chosen filter and whose
        ``uid`` is not the empty string.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )

    # First pick which properties to filter on; each chosen property then
    # gets its own widget(s) below, in this fixed order.
    active_filters = st.multiselect(
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=[
            "resource type",
            "language names",
            "custodian type",
            "available for download",
            "license type",
            "source type",
            "media type",
        ],
        key="viz_filter_by",
    )

    # Nested dict mirroring the entry layout; consumed by filter_entry.
    criteria = {}

    if "resource type" in active_filters:
        entry_type_labels = options["entry_types"]
        criteria["type"] = st.multiselect(
            label="I want to only see entries that are of the following category:",
            options=entry_type_labels,
            format_func=lambda type_key: entry_type_labels[type_key],
            key="viz_filter_type",
        )

    if "language names" in active_filters:
        language_lists = options["language_lists"]
        language_choices = (
            list(language_lists["language_groups"].keys())
            + language_lists["niger_congo_languages"]
            + language_lists["indic_languages"]
        )
        criteria["languages"] = {
            "language_names": st.multiselect(
                label="I want to only see entries that have one of the following languages:",
                options=language_choices,
                key="viz_filter_languages_language_names",
            )
        }

    if "custodian type" in active_filters:
        criteria["custodian"] = {
            "type": st.multiselect(
                label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
                options=options["custodian_types"],
                key="viz_filter_custodian_type",
            )
        }

    if "available for download" in active_filters:
        # "availability" is shared with the license filter, so reuse the
        # sub-dict when it already exists.
        availability = criteria.setdefault("availability", {})
        availability["procurement"] = {
            "for_download": st.multiselect(
                label="Select based on whether the data can be obtained online:",
                options=[
                    "No - but the current owners/custodians have contact information for data queries",
                    "No - we would need to spontaneously reach out to the current owners/custodians",
                    "Yes - it has a direct download link or links",
                    "Yes - after signing a user agreement",
                ],
                key="viz_availability_procurement_for_download",
            )
        }

    if "license type" in active_filters:
        availability = criteria.setdefault("availability", {})
        availability["licensing"] = {
            "license_properties": st.multiselect(
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
                key="viz_availability_licensing_license_properties",
            )
        }
        # "processed_from_primary" is shared with the source-type filter.
        from_primary = criteria.setdefault("processed_from_primary", {})
        from_primary["primary_license"] = st.multiselect(
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=[
                "Unclear / I don't know",
                "Yes - the source material has an open license that allows re-use",
                "Yes - the dataset has the same license as the source material",
                "Yes - the dataset curators have obtained consent from the source material owners",
                "No - the license of the source material actually prohibits re-use in this manner",
            ],
            key="viz_processed_from_primary_primary_license",
        )

    if "source type" in active_filters:
        taxonomy = options["primary_taxonomy"]
        # Dict displays evaluate their items top to bottom, so the three
        # widgets are created in the order they are written.
        criteria["source_category"] = {
            "category_type": st.multiselect(
                label="Select primary sources that correspond to:",
                options=["collection", "website"],
                key="viz_source_category_category_type",
            ),
            "category_web": st.multiselect(
                label="Select web-based primary sources that contain:",
                options=taxonomy["website"],
                key="viz_source_category_category_web",
            ),
            "category_media": st.multiselect(
                label="Select primary sources that are collections of:",
                options=taxonomy["collection"],
                key="viz_source_category_category_media",
            ),
        }
        from_primary = criteria.setdefault("processed_from_primary", {})
        web_choices = [f"web | {w}" for w in taxonomy["website"]]
        from_primary["primary_types"] = st.multiselect(
            label="Select processed datasets whose primary sources contain:",
            options=web_choices + taxonomy["collection"],
            key="viz_processed_from_primary_primary_types",
        )

    if "media type" in active_filters:
        criteria["media"] = {
            "category": st.multiselect(
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
                key="viz_media_category",
            )
        }

    # Entries with an empty uid are always left out.
    matching_entries = [
        candidate
        for candidate in catalogue
        if filter_entry(candidate, criteria) and candidate["uid"] != ""
    ]
    match_count = len(matching_entries)
    st.markdown(
        f"##### Your query matched **{match_count}** entries in the current catalogue."
    )
    return matching_entries


def _entry_locations(entry, show_by_org):
    """Return the locations attached to *entry* for the selected view.

    When ``show_by_org`` is true the result is the single custodian location
    wrapped in a list; otherwise it is the entry's
    ``languages.language_locations`` value as stored.
    """
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the catalogue exploration page.

    The sidebar holds the filter widgets and the choice between custodian
    locations and language-creator locations.  The main area shows a
    choropleth map of per-location entry counts, followed by an expander
    where the selection can be narrowed by location and one entry inspected.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    # NOTE(review): the dataset is requested every time this function runs;
    # consider caching the result if reloads turn out to be slow.
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]

    custodian_view = "Where the organizations or data custodians are located"
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                custodian_view,
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = entry_location_type == custodian_view

    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            locations = _entry_locations(entry, show_by_org)
            for loc in locations:
                # Be as specific as possible: skip a location when another
                # location of the same entry is listed under it in
                # region_tree (presumably its sub-regions -- confirm in
                # the catalogue module).
                sub_regions = region_tree.get(loc, [])
                if any(other in sub_regions for other in locations):
                    continue
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)

    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        # Same nested layout as the entries, so filter_entry can apply it.
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # st.selectbox yields None when it has no options to offer;
            # subscripting that value would raise a TypeError.
            st.write("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)


# Render the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()