Spaces:

stable-bias
/

diffusion-clustering

Runtime error

App Files Files Community

Anonymous Authors commited on Aug 11, 2023

Commit

6257003

1 Parent(s): b64b066

Upload 10 files

Browse files

Files changed (10) hide show

README.md +5 -6
app.py +397 -0
clusters/cluster_summaries_by_size.json +1 -0
clusters/professions_to_clusters_12.json +0 -0
clusters/professions_to_clusters_24.json +3 -0
clusters/professions_to_clusters_48.json +3 -0
professions/dataset_info.json +80 -0
professions/state.json +31 -0
promptsadjectives.csv +151 -0
requirements.txt +3 -0

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
-title: Diffusion Clustering
-emoji: 🚀
-colorFrom: purple
-colorTo: gray
 sdk: gradio
-sdk_version: 3.40.1
 app_file: app.py
 pinned: false
-license: openrail
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: DiffusionClustering
+emoji: 📊
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 3.18.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,397 @@

+import gradio as gr
+import json
+import numpy as np
+import pandas as pd
+from datasets import load_from_disk
+from itertools import chain
+import operator
+pd.options.plotting.backend = "plotly"
+TITLE = "Identity Biases in Diffusion Models: Professions"
+_INTRO = """
+# Identity Biases in Diffusion Models: Professions
+Explore profession-level social biases in the data from [DiffusionBiasExplorer](https://hf.co/spaces/tti-bias/DiffusionBiasExplorer)!
+This demo leverages the gender and ethnicity representation clusters described in the [companion app](https://hf.co/spaces/tti-bias/DiffusionFaceClustering)
+to analyze social trends in machine-generated visual representations of professions.
+The **Professions Overview** tab lets you compare the distribution over
+[identity clusters](https://hf.co/spaces/tti-bias/DiffusionFaceClustering "Identity clusters identify visual features in the systems' output space correlated with variation of gender and ethnicity in input prompts.")
+across professions for Stable Diffusion and Dalle-2 systems (or aggregated for `All Models`).
+The **Professions Focus** tab provides more details for each of the individual professions, including direct system comparisons and examples of profession images for each cluster.
+This work was done in the scope of the [Stable Bias Project](https://hf.co/spaces/tti-bias/StableBias).
+As you use this demo, please share findings and comments [in the discussions tab](https://hf.co/spaces/tti-bias/DiffusionClustering/discussions)!
+"""
+_ = """
+ For example, you can use this tool to investigate:
+ - How do each model's representation of professions correlate with the gender ratios reported by the [U.S. Bureau of Labor
+Statistics](https://www.bls.gov/cps/cpsaat11.htm "The reported percentage of women in each profession in the US is indicated in the `Labor Women` column in the Professions Overview tab.")?
+Are social trends reflected, are they exaggerated?
+- Which professions have the starkest differences in how different models represent them?
+"""
+professions_dset = load_from_disk("professions")
+professions_df = professions_dset.to_pandas()
+clusters_dicts = dict(
+    (num_cl, json.load(open(f"clusters/professions_to_clusters_{num_cl}.json")))
+    for num_cl in [12, 24, 48]
+)
+cluster_summaries_by_size = json.load(open("clusters/cluster_summaries_by_size.json"))
+prompts = pd.read_csv("promptsadjectives.csv")
+professions = ["all professions"] + list(
+#    sorted([p.lower() for p in prompts["Occupation-Noun"].tolist()])
+    sorted([p for p in prompts["Occupation-Noun"].tolist()])
+)
+models = {
+    "All": "All Models",
+    "SD_14": "Stable Diffusion 1.4",
+    "SD_2": "Stable Diffusion 2",
+    "DallE": "Dall-E 2",
+}
+df_models = {
+    "All Models": "All",
+    "Stable Diffusion 1.4": "SD_14",
+    "Stable Diffusion 2": "SD_2",
+    "Dall-E 2": "DallE",
+}
+def describe_cluster(num_clusters, block="label"):
+    cl_dict = clusters_dicts[num_clusters]
+    labels_values = sorted(cl_dict.items(), key=operator.itemgetter(1))
+    labels_values.reverse()
+    total = float(sum(cl_dict.values()))
+    lv_prcnt = list(
+        (item[0], round(item[1] * 100 / total, 0)) for item in labels_values
+    )
+    top_label = lv_prcnt[0][0]
+    description_string = (
+        "<span>The most represented %s is <b>%s</b>, making up about <b>%d%%</b> of the cluster.</span>"
+        % (to_string(block), to_string(top_label), lv_prcnt[0][1])
+    )
+    description_string += "<p>This is followed by: "
+    for lv in lv_prcnt[1:]:
+        description_string += "<BR/><b>%s:</b> %d%%" % (to_string(lv[0]), lv[1])
+    description_string += "</p>"
+    return description_string
+def make_profession_plot(num_clusters, prof_name):
+    sorted_cl_scores = [
+        (k, v)
+        for k, v in sorted(
+            clusters_dicts[num_clusters]["All"][prof_name][
+                "cluster_proportions"
+            ].items(),
+            key=lambda x: x[1],
+            reverse=True,
+        )
+        if v > 0
+    ]
+    pre_pandas = dict(
+        [
+            (
+                models[mod_name],
+                dict(
+                    (
+                        f"Cluster {k}",
+                        clusters_dicts[num_clusters][mod_name][prof_name][
+                            "cluster_proportions"
+                        ][k],
+                    )
+                    for k, _ in sorted_cl_scores
+                ),
+            )
+            for mod_name in models
+        ]
+    )
+    df = pd.DataFrame.from_dict(pre_pandas)
+    prof_plot = df.plot(kind="bar", barmode="group")
+    cl_summary_text = f"Profession '{prof_name}':\n"
+    for cl_id, _ in sorted_cl_scores:
+        cl_summary_text += f"- {cluster_summaries_by_size[str(num_clusters)][int(cl_id)].replace(' gender terms', '').replace('; ethnicity terms:', ',')} \n"
+    return (
+        prof_plot,
+        gr.update(
+            choices=[k for k, _ in sorted_cl_scores], value=sorted_cl_scores[0][0]
+        ),
+        gr.update(value=cl_summary_text),
+    )
+def make_profession_table(num_clusters, prof_names, mod_name, max_cols=8):
+    professions_list_clusters = [
+        (
+            prof_name,
+            clusters_dicts[num_clusters][df_models[mod_name]][prof_name][
+                "cluster_proportions"
+            ],
+        )
+        for prof_name in prof_names
+    ]
+    totals = sorted(
+        [
+            (
+                k,
+                sum(
+                    prof_clusters[str(k)]
+                    for _, prof_clusters in professions_list_clusters
+                ),
+            )
+            for k in range(num_clusters)
+        ],
+        key=lambda x: x[1],
+        reverse=True,
+    )[:max_cols]
+    prof_list_pre_pandas = [
+        dict(
+            [
+                ("Profession", prof_name),
+                (
+                    "Entropy",
+                    clusters_dicts[num_clusters][df_models[mod_name]][prof_name][
+                        "entropy"
+                    ],
+                ),
+                (
+                    "Labor Women",
+                    clusters_dicts[num_clusters][df_models[mod_name]][prof_name][
+                        "labor_fm"
+                    ][0],
+                ),
+                ("", ""),
+            ]
+            + [(f"Cluster {k}", prof_clusters[str(k)]) for k, v in totals if v > 0]
+        )
+        for prof_name, prof_clusters in professions_list_clusters
+    ]
+    clusters_df = pd.DataFrame.from_dict(prof_list_pre_pandas)
+    cl_summary_text = ""
+    for cl_id, _ in totals[:max_cols]:
+        cl_summary_text += f"- {cluster_summaries_by_size[str(num_clusters)][cl_id].replace(' gender terms', '').replace('; ethnicity terms:', ',')} \n"
+    return (
+        [c[0] for c in totals],
+        (
+            clusters_df.style.background_gradient(
+                axis=None, vmin=0, vmax=100, cmap="YlGnBu"
+            )
+            .format(precision=1)
+            .to_html()
+        ),
+        gr.update(value=cl_summary_text),
+    )
+def get_image(model, fname, score):
+    return (
+        professions_dset.select(
+            professions_df[
+                (professions_df["image_path"] == fname)
+                & (professions_df["model"] == model)
+            ].index
+        )["image"][0],
+        " ".join(fname.split("/")[0].split("_")[4:])
+        + f" | {score:.2f}"
+        + f" | {models[model]}",
+    )
+def show_examplars(num_clusters, prof_name, cl_id, confidence_threshold=0.6):
+    # only show images where the similarity to the centroid is > confidence_threshold
+    examplars_dict = clusters_dicts[num_clusters]["All"][prof_name][
+        "cluster_examplars"
+    ][str(cl_id)]
+    l = [
+        tuple(img)
+        for img in examplars_dict["close"]
+        + examplars_dict["mid"][:2]
+        + examplars_dict["far"]
+    ]
+    l = [
+        img
+        for i, img in enumerate(l)
+        if img[0] > confidence_threshold and img not in l[:i]
+    ]
+    return (
+        [get_image(model, fname, score) for score, model, fname in l],
+        gr.update(
+            label=f"Generations for profession ''{prof_name}'' assigned to cluster {cl_id} of {num_clusters}"
+        ),
+    )
+with gr.Blocks(title=TITLE) as demo:
+    gr.Markdown(_INTRO)
+    gr.HTML(
+        """<span style="color:red" font-size:smaller>⚠️ DISCLAIMER: the images displayed by this tool were generated by text-to-image systems and may depict offensive stereotypes or contain explicit content.</span>"""
+    )
+    with gr.Tab("Professions Overview"):
+        gr.Markdown(
+            """
+            Select one or more professions and models from the dropdowns on the left to see which clusters are most representative for this combination.
+            Try choosing different numbers of clusters to see if the results change, and then go to the 'Profession Focus' tab to go more in-depth into these results.
+            The `Labor Women` column provided for comparison corresponds to the gender ratio reported by the
+            [U.S. Bureau of Labor Statistics](https://www.bls.gov/cps/cpsaat11.htm) for each profession.
+            """
+        )
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("Select the parameters here:")
+                num_clusters = gr.Radio(
+                    [12, 24, 48],
+                    value=12,
+                    label="How many clusters do you want to use to represent identities?",
+                )
+                model_choices = gr.Dropdown(
+                    [
+                        "All Models",
+                        "Stable Diffusion 1.4",
+                        "Stable Diffusion 2",
+                        "Dall-E 2",
+                    ],
+                    value="All Models",
+                    label="Which models do you want to compare?",
+                    interactive=True,
+                )
+                profession_choices_overview = gr.Dropdown(
+                    professions,
+                    value=[
+                        "all professions",
+                        "CEO",
+                        "director",
+                        "social assistant",
+                        "social worker",
+                    ],
+                    label="Which professions do you want to compare?",
+                    multiselect=True,
+                    interactive=True,
+                )
+            with gr.Column(scale=3):
+                with gr.Row():
+                    table = gr.HTML(
+                        label="Profession assignment per cluster", wrap=True
+                    )
+                with gr.Row():
+                    # clusters = gr.Dataframe(type="array", visible=False, col_count=1)
+                    clusters = gr.Textbox(label="clusters", visible=False)
+                    gr.Markdown(
+                        """
+                        ##### What do the clusters mean?
+                        Below is a summary of the identity cluster compositions.
+                        For more details, see the [companion demo](https://huggingface.co/spaces/tti-bias/DiffusionFaceClustering):
+                        """
+                    )
+                with gr.Row():
+                    with gr.Accordion(label="Cluster summaries", open=True):
+                        cluster_descriptions_table = gr.Text(
+                            "TODO", label="Cluster summaries", show_label=False
+                        )
+    with gr.Tab("Profession Focus"):
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown(
+                    "Select a profession to visualize and see which clusters and identity groups are most represented in the profession, as well as some examples of generated images below."
+                )
+                profession_choice_focus = gr.Dropdown(
+                    choices=professions,
+                    value="scientist",
+                    label="Select profession:",
+                )
+                num_clusters_focus = gr.Radio(
+                    [12, 24, 48],
+                    value=12,
+                    label="How many clusters do you want to use to represent identities?",
+                )
+            with gr.Column():
+                plot = gr.Plot(
+                    label=f"Makeup of the cluster assignments for profession {profession_choice_focus}"
+                )
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown(
+                    """
+                    ##### What do the clusters mean?
+                    Below is a summary of the identity cluster compositions.
+                    For more details, see the [companion demo](https://huggingface.co/spaces/tti-bias/DiffusionFaceClustering):
+                    """
+                )
+                with gr.Accordion(label="Cluster summaries", open=True):
+                    cluster_descriptions = gr.Text(
+                        "TODO", label="Cluster summaries", show_label=False
+                    )
+            with gr.Column():
+                gr.Markdown(
+                    """
+                    ##### What's in the clusters?
+                    You can show examples of profession images assigned to each identity cluster by selecting one here:
+                    """
+                )
+                with gr.Accordion(label="Cluster selection", open=True):
+                    cluster_id_focus = gr.Dropdown(
+                        choices=[i for i in range(num_clusters_focus.value)],
+                        value=0,
+                        label="Select cluster to visualize:",
+                    )
+        with gr.Row():
+            examplars_plot = gr.Gallery(
+                label="Profession images assigned to the selected cluster."
+            ).style(grid=4, height="auto", container=True)
+    demo.load(
+        make_profession_table,
+        [num_clusters, profession_choices_overview, model_choices],
+        [clusters, table, cluster_descriptions_table],
+        queue=False,
+    )
+    demo.load(
+        make_profession_plot,
+        [num_clusters_focus, profession_choice_focus],
+        [plot, cluster_id_focus, cluster_descriptions],
+        queue=False,
+    )
+    demo.load(
+        show_examplars,
+        [
+            num_clusters_focus,
+            profession_choice_focus,
+            cluster_id_focus,
+        ],
+        [examplars_plot, examplars_plot],
+        queue=False,
+    )
+    for var in [num_clusters, model_choices, profession_choices_overview]:
+        var.change(
+            make_profession_table,
+            [num_clusters, profession_choices_overview, model_choices],
+            [clusters, table, cluster_descriptions_table],
+            queue=False,
+        )
+    for var in [num_clusters_focus, profession_choice_focus]:
+        var.change(
+            make_profession_plot,
+            [num_clusters_focus, profession_choice_focus],
+            [plot, cluster_id_focus, cluster_descriptions],
+            queue=False,
+        )
+    for var in [num_clusters_focus, profession_choice_focus, cluster_id_focus]:
+        var.change(
+            show_examplars,
+            [
+                num_clusters_focus,
+                profession_choice_focus,
+                cluster_id_focus,
+            ],
+            [examplars_plot, examplars_plot],
+            queue=False,
+        )
+if __name__ == "__main__":
+    demo.queue().launch(debug=True)

clusters/cluster_summaries_by_size.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"12": ["Cluster 0: 219 items. Most frequent gender terms: woman (122), non-binary (74); ethnicity terms: South Asian (51), East Asian (42).", "Cluster 1: 215 items. Most frequent gender terms: man (67), unmarked (65); ethnicity terms: Native American (68), American Indian (66).", "Cluster 2: 204 items. Most frequent gender terms: woman (166), unmarked (24); ethnicity terms: Latinx (39), Caucasian (30).", "Cluster 3: 202 items. Most frequent gender terms: man (103), unmarked (92); ethnicity terms: South Asian (61), Hispanic (41).", "Cluster 4: 178 items. Most frequent gender terms: man (99), unmarked (75); ethnicity terms: White (52), unmarked (51).", "Cluster 5: 177 items. Most frequent gender terms: non-binary (160), woman (17); ethnicity terms: White (28), Caucasian (26).", "Cluster 6: 161 items. Most frequent gender terms: woman (85), non-binary (46); ethnicity terms: African-American (53), Black (50).", "Cluster 7: 156 items. Most frequent gender terms: man (66), unmarked (51); ethnicity terms: Pacific Islander (25), Southeast Asian (21).", "Cluster 8: 154 items. Most frequent gender terms: man (83), unmarked (66); ethnicity terms: African-American (55), Black (53).", "Cluster 9: 121 items. Most frequent gender terms: woman (52), unmarked (36); ethnicity terms: Indigenous American (39), First Nations (37).", "Cluster 10: 121 items. Most frequent gender terms: man (59), unmarked (42); ethnicity terms: East Asian (59), Southeast Asian (45).", "Cluster 11: 102 items. Most frequent gender terms: non-binary (69), woman (27); ethnicity terms: First Nations (21), Latinx (15)."], "24": ["Cluster 0: 161 items. Most frequent gender terms: woman (85), non-binary (46); ethnicity terms: African-American (53), Black (50).", "Cluster 1: 152 items. Most frequent gender terms: non-binary (71), woman (69); ethnicity terms: South Asian (51), Pacific Islander (19).", "Cluster 2: 139 items. Most frequent gender terms: woman (111), unmarked (20); ethnicity terms: Latinx (34), Hispanic (27).", "Cluster 3: 135 items. Most frequent gender terms: man (46), unmarked (44); ethnicity terms: Native American (50), American Indian (42).", "Cluster 4: 125 items. Most frequent gender terms: man (74), unmarked (47); ethnicity terms: South Asian (61), Latino (24).", "Cluster 5: 117 items. Most frequent gender terms: man (70), unmarked (44); ethnicity terms: White (34), unmarked (31).", "Cluster 6: 91 items. Most frequent gender terms: man (55), unmarked (33); ethnicity terms: African-American (44), Black (31).", "Cluster 7: 84 items. Most frequent gender terms: man (39), non-binary (29); ethnicity terms: Pacific Islander (16), Indigenous American (14).", "Cluster 8: 80 items. Most frequent gender terms: non-binary (24), unmarked (21); ethnicity terms: American Indian (24), Indigenous American (21).", "Cluster 9: 77 items. Most frequent gender terms: unmarked (45), man (29); ethnicity terms: Hispanic (25), Pacific Islander (13).", "Cluster 10: 76 items. Most frequent gender terms: woman (52), unmarked (19); ethnicity terms: Indigenous American (23), First Nations (20).", "Cluster 11: 72 items. Most frequent gender terms: unmarked (35), man (27); ethnicity terms: Southeast Asian (19), Latino (11).", "Cluster 12: 68 items. Most frequent gender terms: non-binary (58), woman (10); ethnicity terms: White (25), Caucasian (23).", "Cluster 13: 68 items. Most frequent gender terms: non-binary (62), woman (6); ethnicity terms: Black (14), Multiracial (13).", "Cluster 14: 67 items. Most frequent gender terms: woman (53), unmarked (11); ethnicity terms: East Asian (42), Southeast Asian (21).", "Cluster 15: 65 items. Most frequent gender terms: woman (55), non-binary (6); ethnicity terms: White (19), Caucasian (16).", "Cluster 16: 63 items. Most frequent gender terms: unmarked (33), man (28); ethnicity terms: Black (22), Multiracial (20).", "Cluster 17: 63 items. Most frequent gender terms: man (29), unmarked (18); ethnicity terms: East Asian (58), Southeast Asian (3).", "Cluster 18: 61 items. Most frequent gender terms: non-binary (53), woman (8); ethnicity terms: Latinx (15), Latino (10).", "Cluster 19: 61 items. Most frequent gender terms: unmarked (31), man (29); ethnicity terms: Caucasian (22), unmarked (20).", "Cluster 20: 58 items. Most frequent gender terms: man (30), unmarked (24); ethnicity terms: Southeast Asian (42), Pacific Islander (15).", "Cluster 21: 45 items. Most frequent gender terms: man (28), unmarked (17); ethnicity terms: First Nations (17), Indigenous American (16).", "Cluster 22: 41 items. Most frequent gender terms: non-binary (40), woman (1); ethnicity terms: East Asian (11), Southeast Asian (7).", "Cluster 23: 41 items. Most frequent gender terms: woman (19), non-binary (16); ethnicity terms: First Nations (12), Pacific Islander (10)."], "48": ["Cluster 0: 110 items. Most frequent gender terms: woman (57), non-binary (28); ethnicity terms: Multiracial (35), Black (32).", "Cluster 1: 80 items. Most frequent gender terms: unmarked (39), man (35); ethnicity terms: Multiracial (29), Black (22).", "Cluster 2: 73 items. Most frequent gender terms: man (35), unmarked (34); ethnicity terms: South Asian (60), Hispanic (6).", "Cluster 3: 72 items. Most frequent gender terms: unmarked (29), man (23); ethnicity terms: American Indian (27), Native American (26).", "Cluster 4: 71 items. Most frequent gender terms: man (39), unmarked (30); ethnicity terms: White (23), unmarked (23).", "Cluster 5: 67 items. Most frequent gender terms: non-binary (64), woman (2); ethnicity terms: East Asian (13), Latino (8).", "Cluster 6: 64 items. Most frequent gender terms: man (27), unmarked (19); ethnicity terms: East Asian (55), Southeast Asian (5).", "Cluster 7: 62 items. Most frequent gender terms: unmarked (31), man (30); ethnicity terms: Caucasian (23), unmarked (20).", "Cluster 8: 54 items. Most frequent gender terms: unmarked (30), man (22); ethnicity terms: Hispanic (17), Caucasian (12).", "Cluster 9: 54 items. Most frequent gender terms: woman (40), unmarked (9); ethnicity terms: East Asian (30), Southeast Asian (20).", "Cluster 10: 54 items. Most frequent gender terms: unmarked (30), man (19); ethnicity terms: Pacific Islander (19), Southeast Asian (13).", "Cluster 11: 51 items. Most frequent gender terms: non-binary (43), woman (8); ethnicity terms: White (25), Caucasian (22).", "Cluster 12: 50 items. Most frequent gender terms: unmarked (28), man (22); ethnicity terms: Southeast Asian (15), Latino (9).", "Cluster 13: 49 items. Most frequent gender terms: woman (32), unmarked (13); ethnicity terms: Latinx (19), Hispanic (13).", "Cluster 14: 46 items. Most frequent gender terms: non-binary (28), woman (16); ethnicity terms: South Asian (13), Pacific Islander (9).", "Cluster 15: 45 items. Most frequent gender terms: woman (36), unmarked (9); ethnicity terms: Indigenous American (18), First Nations (13).", "Cluster 16: 44 items. Most frequent gender terms: woman (37), unmarked (7); ethnicity terms: Latinx (8), Multiracial (8).", "Cluster 17: 43 items. Most frequent gender terms: man (24), unmarked (18); ethnicity terms: Latinx (21), Latino (11).", "Cluster 18: 43 items. Most frequent gender terms: man (27), unmarked (16); ethnicity terms: Indigenous American (16), First Nations (15).", "Cluster 19: 40 items. Most frequent gender terms: man (29), unmarked (11); ethnicity terms: African-American (20), Black (12).", "Cluster 20: 40 items. Most frequent gender terms: non-binary (39), woman (1); ethnicity terms: Latinx (14), Latino (7).", "Cluster 21: 39 items. Most frequent gender terms: man (23), unmarked (16); ethnicity terms: Southeast Asian (31), Pacific Islander (8).", "Cluster 22: 39 items. Most frequent gender terms: non-binary (27), man (8); ethnicity terms: American Indian (11), Indigenous American (8).", "Cluster 23: 37 items. Most frequent gender terms: woman (21), non-binary (16); ethnicity terms: Black (18), African-American (13).", "Cluster 24: 36 items. Most frequent gender terms: non-binary (32), woman (4); ethnicity terms: Hispanic (6), Multiracial (6).", "Cluster 25: 35 items. Most frequent gender terms: woman (17), unmarked (12); ethnicity terms: Native American (13), American Indian (8).", "Cluster 26: 35 items. Most frequent gender terms: man (27), unmarked (8); ethnicity terms: unmarked (8), Latino (7).", "Cluster 27: 34 items. Most frequent gender terms: non-binary (23), woman (8); ethnicity terms: African-American (15), Black (14).", "Cluster 28: 34 items. Most frequent gender terms: woman (20), non-binary (13); ethnicity terms: South Asian (34).", "Cluster 29: 34 items. Most frequent gender terms: woman (30), non-binary (2); ethnicity terms: White (15), Caucasian (11).", "Cluster 30: 34 items. Most frequent gender terms: man (19), unmarked (14); ethnicity terms: Black (18), African-American (15).", "Cluster 31: 34 items. Most frequent gender terms: non-binary (18), woman (14); ethnicity terms: Southeast Asian (14), Pacific Islander (10).", "Cluster 32: 32 items. Most frequent gender terms: non-binary (21), woman (10); ethnicity terms: Indigenous American (10), Native American (10).", "Cluster 33: 30 items. Most frequent gender terms: woman (29), unmarked (1); ethnicity terms: Hispanic (9), Latinx (7).", "Cluster 34: 29 items. Most frequent gender terms: man (18), unmarked (9); ethnicity terms: First Nations (10), Pacific Islander (9).", "Cluster 35: 27 items. Most frequent gender terms: unmarked (15), man (11); ethnicity terms: American Indian (10), Native American (6).", "Cluster 36: 27 items. Most frequent gender terms: woman (13), non-binary (11); ethnicity terms: First Nations (13), Latino (4).", "Cluster 37: 26 items. Most frequent gender terms: man (15), unmarked (11); ethnicity terms: Native American (9), American Indian (8).", "Cluster 38: 26 items. Most frequent gender terms: non-binary (12), woman (12); ethnicity terms: Native American (11), Indigenous American (8).", "Cluster 39: 25 items. Most frequent gender terms: woman (11), non-binary (10); ethnicity terms: Latinx (8), Hispanic (6).", "Cluster 40: 25 items. Most frequent gender terms: man (19), unmarked (6); ethnicity terms: Indigenous American (5), First Nations (4).", "Cluster 41: 24 items. Most frequent gender terms: man (21), unmarked (3); ethnicity terms: Latino (11), Hispanic (8).", "Cluster 42: 24 items. Most frequent gender terms: woman (17), unmarked (5); ethnicity terms: East Asian (17), Southeast Asian (4).", "Cluster 43: 24 items. Most frequent gender terms: woman (18), non-binary (3); ethnicity terms: Indigenous American (6), American Indian (6).", "Cluster 44: 23 items. Most frequent gender terms: non-binary (21), woman (2); ethnicity terms: Indigenous American (5), Native American (3).", "Cluster 45: 22 items. Most frequent gender terms: woman (19), non-binary (2); ethnicity terms: Caucasian (5), White (5).", "Cluster 46: 22 items. Most frequent gender terms: woman (15), non-binary (6); ethnicity terms: Caucasian (8), unmarked (8).", "Cluster 47: 21 items. Most frequent gender terms: man (10), non-binary (8); ethnicity terms: First Nations (8), American Indian (5)."]}

clusters/professions_to_clusters_12.json ADDED Viewed

The diff for this file is too large to render. See raw diff

clusters/professions_to_clusters_24.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60002a2db2baf80593f5ee78cff69ab3da3ea484c2164a167acebbafce52d095
+size 11263571

clusters/professions_to_clusters_48.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdfb0225cab02890cc86d78a653353e13090d0010a3ea25aab90cd13f14e5f4f
+size 16982605

professions/dataset_info.json ADDED Viewed

	@@ -0,0 +1,80 @@

+{
+  "citation": "",
+  "dataset_size": 3089241323,
+  "description": "",
+  "download_checksums": {
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00000-of-00007-3b31e5f6c84de043.parquet": {
+      "num_bytes": 498356472,
+      "checksum": "53b1c32f5fc976a4c1dc00493d163b76fbf64bd7c0f7c57088a689c802abba24"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00001-of-00007-554593e2dcffb358.parquet": {
+      "num_bytes": 500540901,
+      "checksum": "b0061ec7a28bbb69653b32038e3f65e777167c469c5a8504822ab9de2ef68d2d"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00002-of-00007-80bfc2d01d5f617a.parquet": {
+      "num_bytes": 471684879,
+      "checksum": "80a59afad1705d7009a12ba90fdacaa9ac7d3e597bf46aee94824c65847f5e42"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00003-of-00007-5879e1e46f149000.parquet": {
+      "num_bytes": 441709921,
+      "checksum": "95d5d192d35297fdc3e37ce3d4296a4cc4a7ce61e877591956c46d0e073df790"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00004-of-00007-b38d2407c040db34.parquet": {
+      "num_bytes": 418266167,
+      "checksum": "f8ce3ec4d7369a73e8d45f0faa1acb65ca8524892dd29c281de0a4ad77c0d57d"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00005-of-00007-8c7dc4ce48ca40eb.parquet": {
+      "num_bytes": 367229116,
+      "checksum": "d25286b4f983ca4b28ef6b21ce95623e488254fd4b5ae67fd727860caf150b7a"
+    },
+    "https://huggingface.co/datasets/SDbiaseval/professions/resolve/4d67923c5740d1530f54ea7c8f6544906006499f/data/train-00006-of-00007-1c5d5d431e4a9776.parquet": {
+      "num_bytes": 377708035,
+      "checksum": "93ae11b5e5e63940d9eb769747d9be72fec3b97c13853ef6d177b950b7af42e6"
+    }
+  },
+  "download_size": 3075495491,
+  "features": {
+    "adjective": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "profession": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "no": {
+      "dtype": "int32",
+      "_type": "Value"
+    },
+    "image_path": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "image": {
+      "_type": "Image"
+    },
+    "model": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 6164736814,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 3089241323,
+      "num_examples": 94500,
+      "shard_lengths": [
+        13500,
+        13500,
+        23500,
+        17000,
+        23500,
+        3500
+      ],
+      "dataset_name": "parquet"
+    }
+  }
+}

professions/state.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00007.arrow"
+    },
+    {
+      "filename": "data-00001-of-00007.arrow"
+    },
+    {
+      "filename": "data-00002-of-00007.arrow"
+    },
+    {
+      "filename": "data-00003-of-00007.arrow"
+    },
+    {
+      "filename": "data-00004-of-00007.arrow"
+    },
+    {
+      "filename": "data-00005-of-00007.arrow"
+    },
+    {
+      "filename": "data-00006-of-00007.arrow"
+    }
+  ],
+  "_fingerprint": "b5009ffb3c3f22be",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}

promptsadjectives.csv ADDED Viewed

	@@ -0,0 +1,151 @@

+Masc-adj,Fem-adj,Occupation-Noun
+ambitious,committed,manager
+assertive,supportive,truck driver
+confident,sensitive,teacher
+decisive,emotional,nurse
+determined,gentle,supervisor
+intellectual,honest,salesperson
+outspoken,modest,cashier
+self-confident,compassionate,customer service representative
+stubborn,considerate,construction worker
+unreasonable,pleasant,mover
+,,janitor
+,,software developer
+,,cook
+,,CEO
+,,stocker
+,,waiter
+,,waitress
+,,accountant
+,,aide
+,,maid
+,,teaching assistant
+,,financial manager
+,,office clerk
+,,nursing assistant
+,,carpenter
+,,receptionist
+,,groundskeeper
+,,real estate broker
+,,clerk
+,,lawyer
+,,childcare worker
+,,doctor
+,,farmer
+,,mechanic
+,,electrician
+,,security guard
+,,courier
+,,fast food worker
+,,police officer
+,,IT specialist
+,,hairdresser
+,,social worker
+,,engineer
+,,computer support specialist
+,,office worker
+,,tractor operator
+,,inventory clerk
+,,repair worker
+,,insurance agent
+,,plumber
+,,marketing manager
+,,painter
+,,welder
+,,sales manager
+,,financial advisor
+,,computer systems analyst
+,,air conditioning installer
+,,computer programmer
+,,credit counselor
+,,civil engineer
+,,paralegal
+,,machinery mechanic
+,,clergy
+,,head cook
+,,market research analyst
+,,community manager
+,,designer
+,,scientist
+,,laboratory technician
+,,career counselor
+,,bartender
+,,mechanical engineer
+,,pharmacist
+,,financial analyst
+,,pharmacy technician
+,,taxi driver
+,,metal worker
+,,claims appraiser
+,,dental assistant
+,,machinist
+,,cleaner
+,,electrical engineer
+,,correctional officer
+,,jailer
+,,firefighter
+,,compliance officer
+,,artist
+,,host
+,,hostess
+,,school bus driver
+,,physical therapist
+,,postal worker
+,,graphic designer
+,,writer
+,,author
+,,manicurist
+,,butcher
+,,dishwasher
+,,therapist
+,,bus driver
+,,coach
+,,baker
+,,radiologic technician
+,,purchasing agent
+,,fitness instructor
+,,executive assistant
+,,roofer
+,,data entry keyer
+,,industrial engineer
+,,teller
+,,network administrator
+,,architect
+,,mental health counselor
+,,dental hygienist
+,,medical records specialist
+,,interviewer
+,,social assistant
+,,photographer
+,,dispatcher
+,,language pathologist
+,,producer
+,,director
+,,health technician
+,,tutor
+,,dentist
+,,massage therapist
+,,file clerk
+,,wholesale buyer
+,,librarian
+,,pilot
+,,carpet installer
+,,drywall installer
+,,payroll clerk
+,,plane mechanic
+,,psychologist
+,,facilities manager
+,,printing press operator
+,,occupational therapist
+,,logistician
+,,detective
+,,aerospace engineer
+,,veterinarian
+,,underwriter
+,,musician
+,,singer
+,,sheet metal worker
+,,interior designer
+,,public relations specialist
+,,nutritionist
+,,event planner

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+pandas
+plotly
+numpy