import json
import random
import sys

import pandas as pd
import streamlit as st

from transformers import pipeline

title = "Model Exploration"
description = "Comparison of hate speech detection models"
date = "2022-01-26"
thumbnail = "images/robot.png"

__HATE_DETECTION = """
Once the data has been collected using the definitions identified for the
task, you can start training your model. During training, the model takes
in the labeled data and learns which features of the input text are
associated with each label. Depending on the task design, the labels may
be binary, like 'hateful' and 'non-hateful', or multiclass, like
'neutral', 'offensive', and 'attack'.

When presented with a new input string, the model predicts the likelihood
that the input falls under each of the available labels, and returns the
label with the highest likelihood along with a confidence score from 0
to 1.
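
With the `pipeline` API from the `transformers` library, a single
prediction looks roughly like the following (a minimal sketch; the exact
label names and scores depend on the model):

```python
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="facebook/roberta-hate-speech-dynabench-r4-target",
)
print(classifier("My friend likes pancakes."))
# e.g. [{'label': 'nothate', 'score': 0.99}]
```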
Neural models such as transformers are frequently trained as general
language models and then fine-tuned on specific classification tasks.
These models can vary in their architectures and optimization
algorithms, sometimes producing very different outputs for the same
input text.

The models used below include:
- [RoBERTa trained on the FRENK dataset](https://huggingface.co/classla/roberta-base-frenk-hate)
- [RoBERTa trained on Twitter Hate Speech](https://huggingface.co/cardiffnlp/twitter-roberta-base-hate)
- [DeHateBERT model (trained on Twitter and Stormfront)](https://huggingface.co/Hate-speech-CNERG/dehatebert-mono-english)
- [RoBERTa trained on 11 English hate speech datasets](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r1-target)
- [RoBERTa trained on 11 English hate speech datasets and Round 1 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r2-target)
- [RoBERTa trained on 11 English hate speech datasets and Rounds 1 and 2 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r3-target)
- [RoBERTa trained on 11 English hate speech datasets and Rounds 1, 2, and 3 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target)
"""

__HATECHECK = """ |
|
[Röttinger et al. (2021)](https://aclanthology.org/2021.acl-long.4.pdf) |
|
developed a list of 3,901 test cases for hate speech detection models called |
|
HateCheck. HateCheck provides a number of templates long with placeholders for |
|
identity categories and hateful terms along with labels indicating whether a |
|
model should or should not categorize the instance as hate speech. For each |
|
case, they created several examples with different |
|
identity attributes to test models' abilities to detect hate speech towards |
|
a range of groups of people. Additionally, they used more difficult |
|
linguistic contexts such as adding negation or more nuanced words to try to fool the |
|
model. See some of there examples using the button or try to make |
|
your own examples to test the models in the tools below. |
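
Schematically, filling a template looks like the following (the placeholder
token and identity terms here are illustrative, not the exact keys used in
the HateCheck files):

```python
template = "I hate [IDENTITY]."  # gold label: hateful
examples = [template.replace("[IDENTITY]", group)
            for group in ("women", "immigrants")]
# -> ['I hate women.', 'I hate immigrants.']
```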
*** Warning: these examples may include hateful and violent content, as
well as slurs and other offensive language. ***
"""

__RANKING = """
When a model processes a given input, it calculates the probability of
that input being labeled with each of the possible labels (in binary
cases, for example, either 'hateful' or 'not hateful'). The label with
the highest probability is returned. If we test multiple input sentences
for a given model, we can see which sentences the model assigns the
highest probabilities, indicating which examples it is most confident
in classifying.
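
In code, this amounts to scoring a batch of sentences with the same
classifier and comparing the per-label scores across inputs, roughly as
sketched below:

```python
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-hate",
    return_all_scores=True,
)
results = classifier(["Good morning.", "I hate all of them."])
# results[i] holds one {'label': ..., 'score': ...} dict per label,
# so the scores for any single label can be compared across inputs
```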
Try comparing different input sentences for a given model
using the tool below.
"""

__COMPARISON = """
Depending on their training data and parameters, models can return very
different outputs for the same input. Knowing how models differ in
their behavior can help you choose an appropriate model for your
given use case.

Additionally, models trained on one kind of data can perform very
differently when tested on novel data. To show the models' performance
in a variety of settings, we also show the results of each model on
four datasets: the entirety of the [HateCheck Dataset](https://github.com/paul-rottger/hatecheck-data),
the [Dynamically Generated Hate Speech Dataset](https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset),
the [Hateful Memes Dataset](https://huggingface.co/datasets/limjiayi/hateful_memes_expanded)
(without the images), and a 2,000-instance subset of the
[Open Subtitles](https://opus.nlpl.eu/OpenSubtitles-v2018.php)
(English) Dataset. While the first three are hate speech datasets with
labels for whether each instance is hateful, the Open Subtitles dataset
contains movie subtitles that are not labeled for instances of hate.

Try comparing two different models' output for the same comparison
sentence using the tool below.
"""

__MODELS = [
    "classla/roberta-base-frenk-hate",
    "cardiffnlp/twitter-roberta-base-hate",
    "Hate-speech-CNERG/dehatebert-mono-english",
    "facebook/roberta-hate-speech-dynabench-r1-target",
    "facebook/roberta-hate-speech-dynabench-r2-target",
    "facebook/roberta-hate-speech-dynabench-r3-target",
    "facebook/roberta-hate-speech-dynabench-r4-target",
]

# Preselected example sentences used to auto-populate the tools below
autopop_ranking = [
    [
        "I don't hate black people.",
        "Black people should die.",
        "My friend likes pancakes.",
    ],
    [
        "Those dirty redskins are all drunks.",
        "I can't stand Chinese food. It's too spicy.",
        "I lived near many Chinese people and I hated every single one of them.",
    ],
    [
        "Stop saying that Mexicans don't belong in our country!",
        "We walked along the dyke down the road.",
        "Ah shit, I fucked up.",
    ],
]


def run_article():
    st.markdown("# Making a Hate Speech Detection Model")
    with st.expander("Hate Speech Detection Models", expanded=False):
        st.markdown(__HATE_DETECTION)

    # Load the HateCheck templates, placeholder terms, and category metadata
    hc_path = "posts/resources/"
    with open(hc_path + "template_placeholders.json", encoding="utf-8") as f:
        hc_pholders = json.load(f)
    with open(hc_path + "hatecheck_category_templates.json", encoding="utf-8") as f:
        hc_templates = json.load(f)
    with open(hc_path + "hatecheck_category_info.json", encoding="utf-8") as f:
        hc_info = json.load(f)
    hc_cats = [""] + list(hc_info.keys())

    st.markdown("## Testing Models' Behavior")
    with st.expander("HateCheck Examples", expanded=False):
        st.markdown(__HATECHECK)
        category = st.selectbox(
            "Select a category of examples from HateCheck", hc_cats, key="hc_cat_select"
        )
        if category:
            with st.form(key="hate_check"):
                hc_cat = hc_info[category]
                templates = []
                names = []
                for hc_temp in hc_cat:
                    templates.append(hc_temp)
                    names.append(hc_cat[hc_temp]["name"])
                selected_names = st.multiselect(
                    "Select one or more HateCheck templates to generate examples for",
                    names,
                    key="hc_temp_multiselect",
                )
                num_exs = st.number_input(
                    "Select a number of examples to generate for each selected template",
                    min_value=1,
                    max_value=5,
                    value=3,
                )
                if st.form_submit_button(label="Generate Examples"):
                    for name in selected_names:
                        index = names.index(name)
                        template = templates[index]
                        examples = generate_hc_ex(
                            hc_templates[template], hc_pholders, num_exs
                        )
                        st.header(name)
                        st.subheader("Label: " + hc_cat[template]["value"])
                        st.caption(hc_cat[template]["desc"])
                        for num in range(num_exs):
                            ex = examples[num]
                            st.write("Example #" + str(num + 1) + ": " + ex)

    st.markdown("## Model Output Ranking")
    with st.expander("Model Output Ranking Tool", expanded=False):
        st.markdown(__RANKING)
        with st.form(key="ranking"):
            model_name = st.selectbox(
                "Select a model to test",
                __MODELS,
            )
            input_1 = st.text_input(
                "Input 1",
                help="Try a phrase like 'We shouldn't let [IDENTITY] suffer.'",
            )
            input_2 = st.text_input(
                "Input 2",
                help="Try a phrase like 'I'd rather die than date [IDENTITY].'",
            )
            input_3 = st.text_input(
                "Input 3",
                help="Try a phrase like 'Good morning'",
            )
            autopop = st.checkbox(
                "Choose examples for me",
                key="rank_autopop_ckbx",
                help="Check this box to run the model with 3 preselected sentences.",
            )
            if st.form_submit_button(label="Rank inputs"):
                if autopop:
                    rank_inputs = random.choice(autopop_ranking)
                else:
                    rank_inputs = [input_1, input_2, input_3]
                # Log the inputs that were run
                sys.stderr.write("\n" + str(rank_inputs) + "\n")
                results = run_ranked(model_name, rank_inputs)
                st.dataframe(results)

    st.markdown("## Model Comparison")
    with st.expander("Model Comparison Tool", expanded=False):
        st.markdown(__COMPARISON)
        with st.form(key="comparison"):
            model_name_1 = st.selectbox(
                "Select a model to compare",
                __MODELS,
                key="compare_model_1",
            )
            model_name_2 = st.selectbox(
                "Select another model to compare",
                __MODELS,
                key="compare_model_2",
            )
            autopop = st.checkbox(
                "Choose an example for me",
                key="comp_autopop_ckbx",
                help="Check this box to compare the models with a preselected sentence.",
            )
            input_text = st.text_input("Comparison input")
            if st.form_submit_button(label="Compare models"):
                if autopop:
                    input_text = random.choice(random.choice(autopop_ranking))
                results = run_compare(model_name_1, model_name_2, input_text)
                st.write("### Showing results for: " + input_text)
                st.dataframe(results)

                # Show each model's score distribution on four reference datasets
                outside_ds = ["hatecheck", "dynabench", "hatefulmemes", "opensubtitles"]
                name_1_short = model_name_1.split("/")[1]
                name_2_short = model_name_2.split("/")[1]
                for calib_ds in outside_ds:
                    ds_loc = "posts/resources/charts/" + calib_ds + "/"
                    images, captions = [], []
                    for model in [name_1_short, name_2_short]:
                        images.append(ds_loc + model + "_" + calib_ds + ".png")
                        captions.append("Counts of dataset instances by hate score.")
                    st.write("#### Model performance comparison on " + calib_ds)
                    st.image(images, captions)


def generate_hc_ex(template, placeholders, gen_num):
    """Sample gen_num templates and fill each placeholder with a random term."""
    sampled = random.sample(template, gen_num)
    ph_cats = list(placeholders.keys())
    for index in range(len(sampled)):
        sample = sampled[index]
        for ph_cat in ph_cats:
            if ph_cat in sample:
                insert = random.choice(placeholders[ph_cat])
                sampled[index] = sample.replace(ph_cat, insert).capitalize()
    return sampled
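

# Illustrative call with hypothetical template strings and placeholder
# map (not the exact contents of the HateCheck resource files):
#
#   generate_hc_ex(
#       ["i hate [IDENTITY].", "[IDENTITY] are ruining this country."],
#       {"[IDENTITY]": ["women", "immigrants"]},
#       gen_num=2,
#   )
#   # -> e.g. ['I hate women.', 'Immigrants are ruining this country.']

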
def run_ranked(model, input_list):
    """Classify each input and tabulate the scores for every label."""
    # return_all_scores=True returns a score for every label,
    # not just the top-scoring one
    classifier = pipeline("text-classification", model=model, return_all_scores=True)
    output = {}
    results = classifier(input_list)
    for result in results:
        for index in range(len(result)):
            label = result[index]["label"]
            score = result[index]["score"]
            if label in output:
                output[label].append(score)
            else:
                output[label] = [score]
    return pd.DataFrame(output, index=input_list)
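

# For example (model name from __MODELS; scores vary):
#
#   run_ranked("classla/roberta-base-frenk-hate",
#              ["Good morning.", "I hate them."])
#
# returns a DataFrame with one row per input sentence and one column per
# label (label names vary by model), holding each label's score.

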
def run_compare(name_1, name_2, text):
    """Run two models on the same input and collect each one's top prediction."""
    classifier_1 = pipeline("text-classification", model=name_1)
    result_1 = classifier_1(text)
    out_1 = {
        "Model": name_1,
        "Label": result_1[0]["label"],
        "Score": result_1[0]["score"],
    }
    classifier_2 = pipeline("text-classification", model=name_2)
    result_2 = classifier_2(text)
    out_2 = {
        "Model": name_2,
        "Label": result_2[0]["label"],
        "Score": result_2[0]["score"],
    }
    return [out_1, out_2]
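

# For example (model names from __MODELS; labels and scores vary):
#
#   run_compare(
#       "facebook/roberta-hate-speech-dynabench-r1-target",
#       "facebook/roberta-hate-speech-dynabench-r4-target",
#       "Good morning.",
#   )
#   # -> one {'Model': ..., 'Label': ..., 'Score': ...} dict per model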