yjernite committed on
Commit f450e5b
1 Parent(s): d4eecc2

done with subsection 1.1

Files changed (1): app.py (+190 -22)
app.py CHANGED
@@ -2,6 +2,36 @@ import gradio as gr
 from PIL import Image
 import os


 def get_images(path):
     images = [Image.open(os.path.join(path, im)) for im in os.listdir(path)]
@@ -9,6 +39,26 @@ def get_images(path):
     return [(im, path) for im, path in zip(images, paths)]


 with gr.Blocks() as demo:
     gr.Markdown(
         """
@@ -20,6 +70,9 @@ with gr.Blocks() as demo:
         <p style="margin-bottom: 10px; font-size: 94%">This is the demo page for the "Stable Bias" paper, which aims to explore and quantify social biases in text-to-image systems. <br> This work was done by <a href='https://huggingface.co/sasha' style='text-decoration: underline;' target='_blank'> Alexandra Sasha Luccioni (Hugging Face) </a>, <a href='https://huggingface.co/cakiki' style='text-decoration: underline;' target='_blank'> Christopher Akiki (ScaDS.AI, Leipzig University)</a>, <a href='https://huggingface.co/meg' style='text-decoration: underline;' target='_blank'> Margaret Mitchell (Hugging Face) </a> and <a href='https://huggingface.co/yjernite' style='text-decoration: underline;' target='_blank'> Yacine Jernite (Hugging Face) </a>.</p>
         """
     )
     examples_path = "images/examples"
     examples_gallery = gr.Gallery(
         get_images(examples_path),
@@ -35,37 +88,150 @@ with gr.Blocks() as demo:

     gr.Markdown(
         """
-        ### Looking at Identity Groups
-        """
-    )

-    gr.Markdown(
         """
-        One of the goals of our study was to look at the ways in which different identity groups (ethnicity and gender) are represented by text-to-image models. Since artificial depictions of fictive humans have no inherent gender or ethnicity nor do they belong to socially-constructed groups, we pursued our analysis <i> without </i> ascribing identity categories to the images generated, using unsupervised techniques such as clustering. We find clear evidence of ethnicity and gender biases, which you can see by expanding the accordion below or directly via the [Identity Representation Demo](https://huggingface.co/spaces/society-ethics/DiffusionFaceClustering).
-        """
     )

-    with gr.Accordion("Looking at Identity Groups", open=False):
-        gr.HTML(
             """
-            <p style="margin-bottom: 14px; font-size: 100%"> One of the approaches that we adopted in our work is hierarchical clustering of the images generated by the text-to-image systems in response to prompts that include identity terms with regards to ethnicity and gender. <br> We computed 3 different numbers of clusters (12, 24 and 48) and created an <a href='https://huggingface.co/spaces/society-ethics/DiffusionFaceClustering' style='text-decoration: underline;' target='_blank'> Identity Representation Demo </a> that allows for the exploration of the different clusters and their contents. </p>
-            """
         )
         with gr.Row():
-            with gr.Column(scale=2):
-                impath = "images/identities"
-                identity_gallery = gr.Gallery(
-                    [os.path.join(impath, im) for im in os.listdir(impath)],
-                    label="Identity cluster images",
                     show_label=False,
-                    elem_id="gallery",
-                ).style(grid=3, height="auto")
             with gr.Column(scale=1):
-                gr.HTML(
                     """
-                    <p style="margin-bottom: 14px; font-size: 100%"> You can see that the models reflect many societal biases -- for instance representing Native Americans wearing traditional headdresses, non-binary people with stereotypical haircuts and glasses, and East Asian men with features that amplify ethnic stereotypes. <br> <br> This is problematic because it reinforces existing cultural stereotypes and fails to represent the diversity that is present in all identity groups.</p>
-                    """
                 )
     gr.Markdown(
         """
         ### Exploring Biases
@@ -206,11 +372,13 @@ with gr.Blocks() as demo:
             """
            <h4>Bag of Visual Words</h4>
            <p style="margin-bottom: 14px; font-size: 100%"> Another way of providing the means for a structured traversal of the dataset is a nearest-neighbor explorer based on visual features provided by an image's SIFT features, which we quantize into a visual vocabulary to represent the entire image dataset as a TF-IDF matrix. These tools are especially useful for homing in on stereotypical content that is often encoded visually, but also on failure modes of the model such as the misinterpretation of the "stocker" profession as an imagined dog breed. The screenshots to the right show how SIFT visual patterns tend to cluster together - in this instance the bookshelves in the background, or the gibberish pseudo-English text that often plagues TTI systems. </p>
-            """)
         with gr.Column():
             gr.Image("images/bovw/bookshelves.png")
             gr.Image("images/bovw/gibberish.png")
-        gr.Markdown("""
         ### All of the tools created as part of this project:
         """
     )
 
 from PIL import Image
 import os

+_ID_CLUSTER_SCREEN_SHOTS = {
+    19: ("cluster_19_of_24_unmarked_white_unmarked_man.JPG", "Cluster 19 of 24"),
+    2: ("cluster_2_of_24_latinx_woman.JPG", "Cluster 2 of 24"),
+    18: ("cluster_18_of_24_hispanic_nonbinary.JPG", "Cluster 18 of 24"),
+    0: ("cluster_0_of_24_black_woman.JPG", "Cluster 0 of 24"),
+    6: ("cluster_6_of_24_black_man.JPG", "Cluster 6 of 24"),
+    7: ("cluster_7_of_24_pacific_indigenous_man_nonbinary.JPG", "Cluster 7 of 24"),
+    3: (
+        "cluster_3_of_24_native_american_stereetotypical.JPG",
+        "Cluster 3 of 24",
+    ),
+    8: (
+        "cluster_8_of_24_indigenous_nonbinary.JPG",
+        "Cluster 8 of 24",
+    ),
+    23: (
+        "cluster_23_of_24_woman_nonbinbary_indigenous.JPG",
+        "Cluster 23 of 24",
+    ),
+    12: (
+        "cluster_12_of_24_white_nonbinary.JPG",
+        "Cluster 12 of 24",
+    ),
+    13: (
+        "cluster_13_of_24_nonbinary_woman_black.JPG",
+        "Cluster 13 of 24",
+    ),
+    15: ("cluster_15_of_24_woman_white.JPG", "Cluster 15 of 24"),
+}
+

 def get_images(path):
     images = [Image.open(os.path.join(path, im)) for im in os.listdir(path)]

     return [(im, path) for im, path in zip(images, paths)]


+def show_id_images(cl_id_1, cl_id_2, cl_id_3):
+    img_path_1, cluster_name_1 = _ID_CLUSTER_SCREEN_SHOTS[cl_id_1]
+    img_path_2, cluster_name_2 = _ID_CLUSTER_SCREEN_SHOTS[cl_id_2]
+    img_path_3, cluster_name_3 = _ID_CLUSTER_SCREEN_SHOTS[cl_id_3]
+    return (
+        gr.update(
+            value=os.path.join(impath, img_path_1),
+            label=f"Screenshot of the Identity Exploration tool for: {cluster_name_1}",
+        ),
+        gr.update(
+            value=os.path.join(impath, img_path_2),
+            label=f"Screenshot of the Identity Exploration tool for: {cluster_name_2}",
+        ),
+        gr.update(
+            value=os.path.join(impath, img_path_3),
+            label=f"Screenshot of the Identity Exploration tool for: {cluster_name_3}",
+        ),
+    )
+
+
 with gr.Blocks() as demo:
     gr.Markdown(
         """

         <p style="margin-bottom: 10px; font-size: 94%">This is the demo page for the "Stable Bias" paper, which aims to explore and quantify social biases in text-to-image systems. <br> This work was done by <a href='https://huggingface.co/sasha' style='text-decoration: underline;' target='_blank'> Alexandra Sasha Luccioni (Hugging Face) </a>, <a href='https://huggingface.co/cakiki' style='text-decoration: underline;' target='_blank'> Christopher Akiki (ScaDS.AI, Leipzig University)</a>, <a href='https://huggingface.co/meg' style='text-decoration: underline;' target='_blank'> Margaret Mitchell (Hugging Face) </a> and <a href='https://huggingface.co/yjernite' style='text-decoration: underline;' target='_blank'> Yacine Jernite (Hugging Face) </a>.</p>
         """
     )
+    gr.HTML(
+        """<span style="color:red; font-size:smaller">⚠️ DISCLAIMER: the images displayed by this tool were generated by text-to-image systems and may depict offensive stereotypes or contain explicit content.</span>"""
+    )
     examples_path = "images/examples"
     examples_gallery = gr.Gallery(
         get_images(examples_path),

     gr.Markdown(
         """
+        ### How do Diffusion Models Represent Identity?
+
+        One of the goals of our study was to look at the ways in which pictures generated by text-to-image models depict different notions of gender and ethnicity.
+        These concepts are inherently difficult to describe, however: gender and ethnicity are multi-dimensional, inter-related, and, most importantly, socially constructed:
+        they cannot (and should not) be predicted based on appearance features alone.
+        Since we are working with depictions of fictive humans when analyzing text-to-image model behaviors,
+        we cannot rely on self-identification either to assign identity categories to individual data points.
+        Instead, we develop a clustering-based method to identify relevant dataset-level trends in the generated images, as described in our companion
+        [Identity Representation Demo](https://huggingface.co/spaces/society-ethics/DiffusionFaceClustering).
+        You can read about how this tool helps us delineate **relevant regions** in the models' generation space, outline **stereotypical associations**,
+        and understand what **markedness** has to do with biases by expanding the accordion below:
         """
     )

+    with gr.Accordion("How do Diffusion Models Represent Identity?", open=False):
+        gr.Markdown(
+            """
+            <br/>
+
+            Diffusion Models are commonly used in text-conditioned image generation systems, such as Stable Diffusion or Dall-E 2.
+            In those systems, a user writes a "*prompt*" as input, and receives an image that corresponds to what the prompt is describing as output.
+            For example, if the user asks for a "*Photo portrait of a **scientist***", they expect to get an image that looks photorealistic,
+            prominently features at least one person, and this person might be wearing a lab coat or safety goggles.
+            A "*Photo portrait of a **carpenter***", on the other hand, might be set against a background depicting wooden scaffolding or a workshop (see pictures above).
+
+            At the start of this project, we found that while systems do make good use of background and context cues to represent different professions,
+            there were also some concerning trends in the perceived genders and ethnicities of the people depicted in these professional situations.
+            After trying a few such prompts, we were left asking: why do all the people depicted in these pictures **look like white men**?
+            Why do the only exceptions appear to be fast food workers and other lower-wage professions?
+            And finally, what could be the **consequences of such a lack of diversity** in the system outputs?
+
+            **Look like** is the operative phrase here, however, as the people depicted in the pictures do not exist, nor do they belong to socially-constructed groups.
+            This means that we cannot assign a gender or ethnicity label to each data point to support traditional measures of social diversity or fairness -
+            we instead focus on dataset-level trends in visual features that are correlated with social variation in the text prompts.
+            We do this through *controlled prompting* and *hierarchical clustering*: for each system,
+            we obtain a dataset of generations for prompts of the format "*Photo portrait of a **(identity terms)** person at work*",
+            where ***(identity terms)*** jointly enumerate phrases describing ethnicities and phrases denoting gender.
+            We then cluster these images by similarity and create an [Identity Representation Demo](https://hf.co/spaces/society-ethics/DiffusionFaceClustering)
+            to showcase the visual trends encoded in these clusters - as well as their relation to the social variables under consideration.
             """
         )
+        impath = "images/identities"
         with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown(
+                    """
+                    #### [Clusters, Gender, and Ethnicity](https://hf.co/spaces/society-ethics/DiffusionFaceClustering "Select cluster to visualize to the right or go straight to the interactive demo")
+
+                    Our goal with this strategy is to trigger variation in the images that viewers will associate with social markers.
+                    We use three phrases denoting genders (*man*, *woman*, *non-binary*) and 18 phrases describing ethnicities -
+                    some of which are sometimes understood as similar in the US context, such as First Nations and Indigenous American.
+                    We also left the gender and ethnicity phrases unspecified in some prompts.
+
+                    This approach places visual features on a multidimensional spectrum without ascribing a prior number of distinct values for social categories or their intersection.
+                    It is also limited by the training data of the models under consideration and the set of identity terms used,
+                    which in our application are more relevant to the North American context than to other regions.
+
+                    How then can we use these clusters in practice?
+                    Let's look for example at **cluster 2 in the 24-cluster setting**:
+                    we see that most prompts for images in the cluster used the word *woman* and one of the words denoting *Hispanic* origin.
+                    This tells us that images that are similar to the ones in this cluster will **likely look like** Hispanic women to viewers.
+                    You can cycle through [a few other examples to the right](https://hf.co/spaces/society-ethics/DiffusionFaceClustering "or even better, visualize them in the app"),
+                    such as cluster 19, which mostly features the words *Caucasian* and *man*, different gender term distributions for *African American* in clusters 0 and 6,
+                    as well as clusters like 7 that showcase the limitations of mapping visual features to ethnicity by grouping together *Pacific Islander*, *Indigenous American*, and *Latino*.
+                    """
+                )
+            with gr.Column(scale=1):
+                id_cl_id_1 = gr.Dropdown(
+                    choices=[2, 19, 0, 6, 7],
+                    value=2,
                     show_label=False,
+                )
+                identity_screenshot_1 = gr.Image(
+                    value=os.path.join(impath, "cluster_2_of_24_latinx_woman.JPG"),
+                    label="Screenshot of the Identity Exploration tool for: Cluster 2 of 24",
+                )
+        with gr.Row():
             with gr.Column(scale=1):
+                id_cl_id_2 = gr.Dropdown(
+                    choices=[3, 8, 23, 12, 13, 7],
+                    value=3,
+                    show_label=False,
+                )
+                identity_screenshot_2 = gr.Image(
+                    value=os.path.join(
+                        impath, "cluster_3_of_24_native_american_stereetotypical.JPG"
+                    ),
+                    label="Screenshot of the Identity Exploration tool for: Cluster 3 of 24",
+                )
+            with gr.Column(scale=1):
+                gr.Markdown(
+                    """
+                    #### [Stereotypical Representations and Associations](https://hf.co/spaces/society-ethics/DiffusionFaceClustering "Select cluster to visualize to the left or go straight to the interactive demo")
+                    - Native American:
+                        - stereotypical (cluster 3, mostly *man*) vs. modern and salient (8) vs. less stereotypical (23, *woman* + *non-binary*)
+                        - shows the importance of flexible categories
+                    - Non-binary: the stereotype "depends on ethnicity" - associated only with *woman*, a particular haircut, and glasses for *Caucasian*, more diverse for other groups
+                        - a different stereotype and power dynamic, but still the same haircut and glasses (down to the collar!) in cluster 12 - also only associated with women + white
+                        - compare to clusters 13 (Black, also + *woman*) and 8 (Native American, gender diversity!); other clusters with non-binary prompts are mostly visually diverse and skew toward *woman*, except 7 (+ *man*)
+
+                    You can see that the models reflect many societal biases -- for instance representing Native Americans wearing traditional headdresses,
+                    non-binary people with stereotypical haircuts and glasses, and East Asian men with features that amplify ethnic stereotypes.
+
+                    This is problematic because it reinforces existing cultural stereotypes and fails to represent the diversity that is present in all identity groups.
                     """
                 )
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown(
+                    """
+                    #### [Specification, Markedness, and Bias](https://hf.co/spaces/society-ethics/DiffusionFaceClustering "Select cluster to visualize to the right or go straight to the interactive demo")
+                    - Cluster 19 is unmarked along both axes
+                    - Unmarked gender across ethnicities: clusters 6 and 0 have the largest shares of *African American* prompts, 36% vs. 18%
+                    - Unmarked ethnicity across genders: cluster 15 has the most unmarked-ethnicity prompts among the clusters where *woman* outweighs *man*
+                    """
+                )
+            with gr.Column(scale=1):
+                id_cl_id_3 = gr.Dropdown(
+                    choices=[19, 0, 6, 15],
+                    value=6,
+                    show_label=False,
+                )
+                identity_screenshot_3 = gr.Image(
+                    value=os.path.join(
+                        impath, "cluster_19_of_24_unmarked_white_unmarked_man.JPG"
+                    ),
+                    label="Screenshot of the Identity Exploration tool for: Cluster 19 of 24",
+                )
+        gr.Markdown(
+            """
+            Conclusion: let's use these clusters to measure other outputs of the models that represent people!
+            """
+        )
+        for var in [id_cl_id_1, id_cl_id_2, id_cl_id_3]:
+            var.change(
+                show_id_images,
+                inputs=[id_cl_id_1, id_cl_id_2, id_cl_id_3],
+                outputs=[
+                    identity_screenshot_1,
+                    identity_screenshot_2,
+                    identity_screenshot_3,
+                ],
+            )
+
     gr.Markdown(
         """
         ### Exploring Biases

            """
            <h4>Bag of Visual Words</h4>
            <p style="margin-bottom: 14px; font-size: 100%"> Another way of providing the means for a structured traversal of the dataset is a nearest-neighbor explorer based on visual features provided by an image's SIFT features, which we quantize into a visual vocabulary to represent the entire image dataset as a TF-IDF matrix. These tools are especially useful for homing in on stereotypical content that is often encoded visually, but also on failure modes of the model such as the misinterpretation of the "stocker" profession as an imagined dog breed. The screenshots to the right show how SIFT visual patterns tend to cluster together - in this instance the bookshelves in the background, or the gibberish pseudo-English text that often plagues TTI systems. </p>
+            """
+        )
         with gr.Column():
             gr.Image("images/bovw/bookshelves.png")
             gr.Image("images/bovw/gibberish.png")
+    gr.Markdown(
+        """
         ### All of the tools created as part of this project:
         """
     )
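
The "How do Diffusion Models Represent Identity?" text added in this commit describes the controlled-prompting and hierarchical-clustering recipe only in prose. The sketch below illustrates the general shape of that pipeline; it is not the code used in the study. The phrase lists are truncated placeholders (the study uses 3 gender phrases and 18 ethnicity phrases plus unspecified slots), the image-embedding step is replaced by random features, and the helper names `build_prompts` and `cluster_generations` are made up for this example; only the prompt template and the 12-, 24-, and 48-cluster settings come from the page itself.

```python
from itertools import product

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Truncated, illustrative phrase lists; "" stands for an unspecified slot.
GENDER_PHRASES = ["man", "woman", "non-binary person", ""]
ETHNICITY_PHRASES = ["Hispanic", "Caucasian", "African American", ""]


def build_prompts():
    """Enumerate controlled prompts over the joint ethnicity x gender grid."""
    prompts = []
    for ethnicity, gender in product(ETHNICITY_PHRASES, GENDER_PHRASES):
        subject = " ".join(part for part in (ethnicity, gender or "person") if part)
        prompts.append((ethnicity, gender, f"Photo portrait of a {subject} at work"))
    return prompts


def cluster_generations(embeddings: np.ndarray, n_clusters: int = 24) -> np.ndarray:
    """Hierarchical (agglomerative) clustering of image features.

    The page discusses 12-, 24-, and 48-cluster partitions of the same dataset.
    """
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
    return model.fit_predict(embeddings)


if __name__ == "__main__":
    prompts = build_prompts()
    # Stand-in for real image embeddings (e.g., generated faces passed through a vision encoder).
    features = np.random.rand(len(prompts) * 10, 512)
    labels = cluster_generations(features, n_clusters=24)
    print(f"{len(prompts)} prompts, {len(set(labels))} clusters")
```

The choice of agglomerative clustering mirrors the "hierarchical clustering" wording above; any embedding model could be swapped in for the random features.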
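
The draft notes in the "Specification, Markedness, and Bias" block quote per-cluster shares such as "36% vs. 18%" of *African American* prompts for clusters 6 and 0. If the cluster assignment and the prompt terms behind each image are available, such shares reduce to a simple tally. The helper below is a hypothetical illustration of that bookkeeping, not part of the app or the released analysis code.

```python
from collections import Counter, defaultdict


def term_shares_by_cluster(cluster_labels, prompt_terms):
    """Fraction of each cluster's images generated with each ethnicity phrase.

    cluster_labels[i] is the cluster of image i; prompt_terms[i] is the
    (ethnicity, gender) pair used in its prompt, with "" meaning unspecified.
    """
    counts = defaultdict(Counter)
    for label, (ethnicity, _gender) in zip(cluster_labels, prompt_terms):
        counts[label][ethnicity or "unspecified"] += 1
    shares = {}
    for label, counter in counts.items():
        total = sum(counter.values())
        shares[label] = {term: round(n / total, 3) for term, n in counter.items()}
    return shares


# Toy example: two clusters, four images.
print(
    term_shares_by_cluster(
        [0, 0, 6, 6],
        [("African American", "woman"), ("", "woman"),
         ("African American", "man"), ("Caucasian", "man")],
    )
)
```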
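
The "Bag of Visual Words" paragraph describes quantizing SIFT descriptors into a visual vocabulary, representing the dataset as a TF-IDF matrix, and exploring it with nearest neighbors. Below is a minimal sketch of that kind of index, assuming `opencv-python` (with `cv2.SIFT_create`) and scikit-learn; the vocabulary size, neighbor count, and the function name `build_bovw_index` are illustrative choices, not the configuration used for the released explorer.

```python
import cv2
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import NearestNeighbors


def build_bovw_index(image_paths, vocab_size=1024, n_neighbors=8):
    """SIFT descriptors -> visual vocabulary -> TF-IDF matrix -> cosine neighbors."""
    sift = cv2.SIFT_create()
    per_image = []
    for path in image_paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        _keypoints, desc = sift.detectAndCompute(img, None)
        per_image.append(desc if desc is not None else np.zeros((0, 128), np.float32))

    # Quantize all local descriptors into a shared "visual vocabulary".
    vocabulary = MiniBatchKMeans(n_clusters=vocab_size, random_state=0)
    vocabulary.fit(np.vstack([d for d in per_image if len(d)]))

    # One histogram of visual words per image, reweighted with TF-IDF.
    counts = np.zeros((len(image_paths), vocab_size))
    for i, desc in enumerate(per_image):
        if len(desc):
            words, freqs = np.unique(vocabulary.predict(desc), return_counts=True)
            counts[i, words] = freqs
    tfidf = TfidfTransformer().fit_transform(counts)

    # Nearest-neighbor explorer over the TF-IDF rows.
    neighbors = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine").fit(tfidf)
    return neighbors, tfidf


# Usage: distances, indices = neighbors.kneighbors(tfidf[query_row]) walks the
# dataset from any seed image to its visually closest generations.
```

Recurring local patterns (bookshelf textures, pseudo-text glyphs) dominate the shared visual words, which is why such an index surfaces the bookshelf and gibberish-text clusters shown in the screenshots.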