Spaces:

hf-task-exploration
/

ExploreACMnaacl

Runtime error

App Files Files Community

Yacine Jernite committed on Jul 8, 2022

Commit

7bffaaf

1 Parent(s): 89e8e87

initial commit

Browse files

Files changed (11) hide show

.gitattributes +2 -0
README.md +13 -7
app.py +75 -0
data_measurements_clusters/__init__.py +1 -0
data_measurements_clusters/clustering.py +691 -0
data_measurements_clusters/dataset_utils.py +292 -0
posts/conclusion.py +58 -0
posts/context.py +104 -0
posts/dataset_exploration.py +143 -0
posts/model_exploration.py +340 -0
posts/welcome.py +74 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,19 @@
 ---
-title: ExploringAutomaticContentModeratio
-emoji: 😻
-colorFrom: purple
-colorTo: purple
 sdk: streamlit
-sdk_version: 1.10.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Task Exploration - Automatic Content Moderation
+emoji: 🤗
+colorFrom: blue
+colorTo: red
 sdk: streamlit
 app_file: app.py
 pinned: false
 ---
+# Task Exploration
+[![Generic badge](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/aymm/Task-Exploration-Hate-Speech)
+The context and definition of hate speech detection as a modeling task.
+---
+Autogenerated using [this template](https://github.com/nateraw/spaces-template)

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import importlib
+import re
+from pathlib import Path
+import streamlit as st
+import yaml
+# Matches a `---`-delimited block (captured as group 1) and whatever follows the
+# closing delimiter (group 2); DOTALL lets `.` span newlines in the trailing part.
+# NOTE(review): presumably meant for YAML front matter; nothing in the code visible
+# here uses this pattern — confirm before relying on or removing it.
+REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r](.*)", re.DOTALL)
+def render_preview(image, title, description):
+    """Draw one post preview: the image in a narrow column, title and text beside it."""
+    with st.container():
+        # Two columns with a 1:4 width ratio: thumbnail first, text second.
+        columns = st.columns((1, 4))
+        with columns[0]:
+            st.image(image)
+        with columns[1]:
+            st.subheader(title)
+            st.write(description)
+def render_page(post_path: Path):
+    """Import the module named by `str(post_path)` and call its `run_article()`."""
+    importlib.import_module(str(post_path)).run_article()
+def get_page_data(post_path: Path):
+    """Import the module named by `str(post_path)` and collect its preview metadata.
+
+    Returns a dict with the module-level `title`, `description`, `date` and
+    `thumbnail` attributes; a missing attribute raises AttributeError.
+    """
+    post_module = importlib.import_module(str(post_path))
+    fields = ("title", "description", "date", "thumbnail")
+    return {field: getattr(post_module, field) for field in fields}
+def main():
+    """Build the page: sidebar with title, navigation menu and post previews, then the chosen post."""
+    st.set_page_config(layout="wide")
+    # Importable post module name -> label shown in the navigation menu.
+    posts = {
+        "posts.welcome": "Welcome",
+        "posts.context": "Hate Speech in ACM",
+        "posts.dataset_exploration": "ACM Datasets",
+        "posts.model_exploration": "ACM Models",
+        "posts.conclusion": "Key Takeaways",
+    }
+    with st.sidebar:
+        st.markdown(
+            """
+            <div align="center">
+                <h1>Task Exploration: Hate Speech Detection</h1>
+            </div>
+        """,
+            unsafe_allow_html=True,
+        )
+        st.markdown("---")
+        # The selectbox result is the only source of the page to show; the earlier
+        # "first key" assignment was always overwritten here and has been dropped.
+        page_to_show = st.selectbox(
+            "Navigation menu:",
+            posts,
+            format_func=lambda x: posts[x],
+        )
+        for post in posts:
+            data = get_page_data(Path(post))
+            # render_preview returns nothing, so its result is no longer bound.
+            render_preview(
+                data.get("thumbnail"), data.get("title"), data.get("description")
+            )
+    if page_to_show:
+        render_page(Path(page_to_show))
+main()

data_measurements_clusters/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .clustering import Clustering

data_measurements_clusters/clustering.py ADDED Viewed

	@@ -0,0 +1,691 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gzip
+import json
+import math
+import os
+from os.path import exists
+from os.path import join as pjoin
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import torch
+import transformers
+from datasets import load_dataset
+from huggingface_hub import HfApi
+from tqdm import tqdm
+# from .dataset_utils import prepare_clustering_dataset
+# Raise the pandas display limit for column contents to 256 characters.
+pd.options.display.max_colwidth = 256
+# First component of the cache path assembled by ClusteringBuilder.
+_CACHE_DIR = "cache_dir"
+# Default model name for ClusteringBuilder and Clustering.
+_DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"
+# Cap on how many top-scoring candidate pairs prepare_merges keeps.
+_MAX_MERGE = 20000000 # to run on 64GB RAM laptop
+def sentence_mean_pooling(model_output, attention_mask):
+    """Average token embeddings along dim 1, weighting each position by its mask value.
+
+    `model_output[0]` is taken as the token embeddings. The mask gets a trailing
+    axis and is expanded to the embeddings' size; the divisor is clamped to
+    1e-9 so a fully masked row yields zeros instead of dividing by zero.
+    """
+    # First element of model_output contains all token embeddings
+    token_embeddings = model_output[0]
+    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    weighted_sum = torch.sum(token_embeddings * mask, 1)
+    mask_total = torch.clamp(mask.sum(1), min=1e-9)
+    return weighted_sum / mask_total
+# get nearest neighbors of a centroid by dot product
+def get_examplars(example_ids, centroid, embeddings, dset, n_examplars):
+    """Return up to `n_examplars` rows of `dset` ranked by dot product with `centroid`.
+
+    Only the rows listed in `example_ids` are scored. Each result is a copy of
+    the dataset row (as a dict) with an added "score" entry, highest score first.
+    """
+    scores = torch.mv(embeddings[example_ids], centroid)
+    ranked_scores, ranked_positions = scores.sort(dim=-1, descending=True)
+    res = []
+    for position, score in zip(
+        ranked_positions[:n_examplars], ranked_scores[:n_examplars]
+    ):
+        # Sorted positions index into example_ids, not into the dataset directly.
+        row = dict(dset[example_ids[position.item()]])
+        row["score"] = score.item()
+        res.append(row)
+    return res
+# order node children so that the large ones are in the middle
+# makes visualization more balanced
+def pretty_order(nodes, node_ids):
+    """Return `node_ids` reordered: even ranks by ascending weight, then odd ranks descending."""
+    by_weight = sorted(node_ids, key=lambda nid: nodes[nid]["weight"])
+    # Every other id going up, then the remaining ids coming back down.
+    return by_weight[0::2] + by_weight[1::2][::-1]
+def make_tree_plot(node_list, root_id, max_depth=-1):
+    """Build a plotly figure of the subtree of `node_list` rooted at `root_id`.
+
+    Args:
+        node_list: indexable collection of node records; each record is read for
+            its "hover_text", "weight", "children_ids" and "depth" entries, and
+            node ids are used as positions into this collection.
+        root_id: id of the node drawn at the top (Y = 0).
+        max_depth: -1 draws the whole subtree; otherwise nodes whose depth
+            relative to the root reaches `max_depth - 1` have their children
+            shown without any descendants.
+
+    Returns:
+        A `go.Figure` with one line trace for the edges and one marker trace
+        for the nodes (hover text taken from each node's "hover_text").
+    """
+    # make plot nodes
+    # One slot per node id; slots of nodes outside the drawn subtree stay empty.
+    plot_nodes = [{} for _ in node_list]
+    root = {
+        "parent_id": -1,
+        "node_id": root_id,
+        "label": node_list[root_id]["hover_text"],
+        "weight": node_list[root_id]["weight"],
+        "num_leaves": 0,
+        "children_ids": node_list[root_id]["children_ids"],
+        "Xmin": 0,
+        "Y": 0,
+    }
+    plot_nodes[root_id] = root
+    root_depth = node_list[root_id]["depth"]
+    def rec_make_coordinates(node):
+        # Lays out `node`'s children left to right, each one level lower, and
+        # accumulates the number of drawn leaves under `node`.
+        total_weight = 0
+        # Whether the children created below keep their own children.
+        recurse = (max_depth == -1) or (
+            node_list[node["node_id"]]["depth"] - root_depth < max_depth - 1
+        )
+        for cid in node["children_ids"]:
+            plot_nodes[cid] = {
+                "parent_id": node["node_id"],
+                "node_id": cid,
+                "label": node_list[cid]["hover_text"],
+                "weight": node_list[cid]["weight"],
+                "children_ids": node_list[cid]["children_ids"] if recurse else [],
+                "Xmin": node["Xmin"] + total_weight,
+                "Y": node["Y"] - 1,
+            }
+            # A drawn node without children counts as one leaf of width 1.
+            plot_nodes[cid]["num_leaves"] = 1 if len(plot_nodes[cid]["children_ids"]) == 0 else 0
+            rec_make_coordinates(plot_nodes[cid])
+            # Despite its name, total_weight counts leaves placed so far, not node weights.
+            total_weight += plot_nodes[cid]["num_leaves"]
+            node["num_leaves"] += plot_nodes[cid]["num_leaves"]
+        node["Xmax"] = node["Xmin"] + node["num_leaves"]
+        # Center the node over the horizontal span of its leaves.
+        node["X"] = node["Xmin"] + (node["num_leaves"] / 2)
+    rec_make_coordinates(root)
+    # Keep only the nodes that were actually laid out.
+    subtree_nodes = [node for node in plot_nodes if len(node) > 0]
+    # Original node id -> position within subtree_nodes.
+    nid_map = dict([(node["node_id"], nid) for nid, node in enumerate(subtree_nodes)])
+    labels = [node["label"] for node in subtree_nodes]
+    # NOTE(review): E is filled below but never read afterwards in this function.
+    E = []  # list of edges
+    Xn = []
+    Yn = []
+    Xe = []
+    Ye = []
+    for nid, node in enumerate(subtree_nodes):
+        Xn += [node["X"]]
+        Yn += [node["Y"]]
+        for cid in node["children_ids"]:
+            child = plot_nodes[cid]
+            E += [(nid, nid_map[child["node_id"]])]
+            # A None entry after each parent/child pair separates the line segments.
+            Xe += [node["X"], child["X"], None]
+            Ye += [node["Y"], child["Y"], None]
+    # make figure
+    fig = go.Figure()
+    fig.add_trace(
+        go.Scatter(
+            x=Xe,
+            y=Ye,
+            mode="lines",
+            name="",
+            line=dict(color="rgb(210,210,210)", width=1),
+            hoverinfo="none",
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=Xn,
+            y=Yn,
+            mode="markers",
+            name="nodes",
+            marker=dict(
+                symbol="circle-dot",
+                size=18,
+                color="#6175c1",
+                line=dict(color="rgb(50,50,50)", width=1)
+                # '#DB4551',
+            ),
+            text=labels,
+            hoverinfo="text",
+            opacity=0.8,
+        )
+    )
+    fig.layout.showlegend = False
+    return fig
+class ClusteringBuilder:
+    """Embeds dataset items, builds a cluster tree over them and can upload the result."""
+    def __init__(
+        self,
+        dataset_name,
+        config_name,
+        split_name,
+        input_field_path,
+        label_name,
+        num_rows,
+        model_name=_DEFAULT_MODEL,
+    ):
+        """Item embeddings and clustering
+
+        Stores the dataset/model identifiers and derives the cache location
+        from them; no model or data is loaded here.
+
+        Args:
+            dataset_name: dataset id; "/" is replaced by "---" in the cache path.
+            config_name: dataset configuration; None is cached as "default".
+            split_name: dataset split; None is cached as "train".
+            input_field_path: sequence of strings, joined with "->" in the cache path.
+            label_name: name of the label field; None is accepted (build_tree
+                then skips label statistics).
+            num_rows: row count, used in the cache path and passed on when
+                the dataset is prepared.
+            model_name: name of the embedding model to load in set_model.
+        """
+        self.dataset_name = dataset_name
+        self.config_name = config_name
+        self.split_name = split_name
+        self.input_field_path = input_field_path
+        self.label_name = label_name
+        self.num_rows = num_rows
+        # One path component per setting, so each configuration gets its own cache directory.
+        self.cache_path_list = [
+            _CACHE_DIR,
+            dataset_name.replace("/", "---"),
+            f"{'default' if config_name is None else config_name}",
+            f"{'train' if split_name is None else split_name}",
+            f"field-{'->'.join(input_field_path)}-label-{label_name}",
+            f"{num_rows}_rows",
+            model_name.replace("/", "---"),
+        ]
+        self.cache_path = pjoin(*self.cache_path_list)
+        # Use the first CUDA device when one is available, otherwise the CPU.
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.model_name = model_name
+    # prepare embeddings for the dataset
+    def set_model(self):
+        """Load the tokenizer and model for `self.model_name`; the model is moved to `self.device`."""
+        name = self.model_name
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(name)
+        encoder = transformers.AutoModel.from_pretrained(name)
+        self.model = encoder.to(self.device)
+    def set_features_dataset(self, use_streaming, use_auth_token, use_dataset):
+        """Prepare the dataset to embed and store it as `self.features_dset`.
+
+        The three arguments are forwarded unchanged to
+        `prepare_clustering_dataset` together with the identifiers stored on
+        this instance.
+        """
+        # The module-level import of this helper is commented out, so calling it
+        # raised NameError. Importing it here fixes that without changing what
+        # happens when the module itself is imported.
+        # NOTE(review): assumes dataset_utils defines prepare_clustering_dataset
+        # with these keyword parameters — its definition is not visible here.
+        from .dataset_utils import prepare_clustering_dataset
+
+        # The helper returns a pair; only the dataset is kept.
+        dset, _ = prepare_clustering_dataset(
+            dataset_name=self.dataset_name,
+            input_field_path=self.input_field_path,
+            label_name=self.label_name,
+            config_name=self.config_name,
+            split_name=self.split_name,
+            num_rows=self.num_rows,
+            use_streaming=use_streaming,
+            use_auth_token=use_auth_token,
+            use_dataset=use_dataset,
+        )
+        self.features_dset = dset
+    def compute_feature_embeddings(self, sentences):
+        """Tokenize `sentences`, run the model without gradients and return unit-norm pooled embeddings."""
+        encoded = self.tokenizer(
+            sentences, padding=True, truncation=True, return_tensors="pt"
+        )
+        batch = {name: tensor.to(self.device) for name, tensor in encoded.items()}
+        with torch.no_grad():
+            pooled = sentence_mean_pooling(
+                self.model(**batch), batch["attention_mask"]
+            )
+            # Scale each row to unit length along the last dimension.
+            norms = pooled.norm(dim=-1, keepdim=True)
+            return pooled / norms
+    def set_embeddings_dataset(self):
+        """Add an "embedding" column to the features dataset and store it as `self.embeddings_dset`.
+
+        The mapped dataset is cached in a file named "embeddings_dset" inside
+        `self.cache_path`, which is created first if needed.
+        """
+        def batch_embed(examples):
+            # Embed one batch of the "field" column and return plain lists.
+            return {
+                "embedding": [
+                    embed.tolist()
+                    for embed in self.compute_feature_embeddings(examples["field"])
+                ]
+            }
+        # cache_path is several directories deep; os.mkdir fails when any parent
+        # is missing, so create the whole chain and tolerate an existing one.
+        os.makedirs(self.cache_path, exist_ok=True)
+        self.embeddings_dset = self.features_dset.map(
+            batch_embed,
+            batched=True,
+            batch_size=32,
+            cache_file_name=pjoin(self.cache_path, "embeddings_dset"),
+        )
+    def prepare_embeddings(
+        self,
+        use_streaming=True,
+        use_auth_token=None,
+        use_dataset=None,
+    ):
+        """Run the embedding pipeline: load the model, prepare the dataset, embed it."""
+        self.set_model()
+        self.set_features_dataset(
+            use_streaming=use_streaming,
+            use_auth_token=use_auth_token,
+            use_dataset=use_dataset,
+        )
+        self.set_embeddings_dataset()
+    # make cluster tree
+    def prepare_merges(self, batch_size, low_thres):
+        """Collect candidate item pairs whose embedding dot product exceeds `low_thres`.
+
+        Sets `self.embeddings`, `self.sorted_scores` (descending) and
+        `self.sorted_indices` (one row of two item indices per kept score),
+        both truncated to the `_MAX_MERGE` best pairs.
+
+        Args:
+            batch_size: number of rows per block when comparing embeddings.
+            low_thres: pairs scoring at or below this value are discarded.
+        """
+        self.embeddings = torch.Tensor(self.embeddings_dset["embedding"])
+        # Empty accumulators: pair indices with two columns, and their scores.
+        all_indices = torch.LongTensor(torch.Size([0, 2]))
+        all_scores = torch.Tensor(torch.Size([0]))
+        n_batches = math.ceil(self.embeddings_dset.num_rows / batch_size)
+        # Visit each unordered pair of blocks once (b starts at a).
+        for a in range(n_batches):
+            for b in tqdm(range(a, n_batches)):
+                # Named cos_scores; this is a plain dot product, which equals the
+                # cosine only if the stored embeddings are unit-norm — TODO confirm.
+                cos_scores = torch.mm(
+                    self.embeddings[a * batch_size : (a + 1) * batch_size],
+                    self.embeddings[b * batch_size : (b + 1) * batch_size].t(),
+                )
+                if a == b:
+                    # Within one block keep only entries strictly above the diagonal,
+                    # dropping self-pairs and mirrored duplicates.
+                    # NOTE(review): the zeroed entries are filtered out below only
+                    # when low_thres >= 0.
+                    cos_scores = cos_scores.triu(diagonal=1)
+                merge_indices = torch.nonzero(cos_scores > low_thres)
+                # Shift block-local positions to dataset-wide row indices.
+                merge_indices[:, 0] += a * batch_size
+                merge_indices[:, 1] += b * batch_size
+                merge_scores = cos_scores[cos_scores > low_thres]
+                all_indices = torch.cat([all_indices, merge_indices], dim=0)
+                all_scores = torch.cat([all_scores, merge_scores], dim=0)
+        self.sorted_scores, sorted_score_ids = all_scores.sort(dim=0, descending=True)
+        self.sorted_scores = self.sorted_scores[:_MAX_MERGE]
+        sorted_score_ids = sorted_score_ids[:_MAX_MERGE]
+        self.sorted_indices = all_indices[sorted_score_ids]
+    def make_starting_nodes(self, identical_threshold):
+        """Group near-identical items and create one leaf node per remaining item.
+
+        Pairs scoring at least `identical_threshold` are treated as duplicates:
+        the second item of each pair is attached (via `self.parents`) to the
+        representative of the first. Sets `self.parents`, `self.duplicates`
+        and `self.nodes`.
+        """
+        identical_indices = self.sorted_indices[
+            self.sorted_scores >= identical_threshold
+        ]
+        # Two stable sorts (second column, then first) order pairs by (first, second).
+        identical_inter = identical_indices[
+            identical_indices[:, 1].sort(stable=True).indices
+        ]
+        identical_sorted = identical_inter[
+            identical_inter[:, 0].sort(stable=True).indices
+        ]
+        # Maps an item/node id to the id it has been attached to.
+        self.parents = {}
+        for a_pre, b_pre in identical_sorted:
+            a = a_pre.item()
+            b = b_pre.item()
+            # Follow a's chain to its representative before attaching b to it.
+            while self.parents.get(a, -1) != -1:
+                a = self.parents[a]
+            self.parents[b] = a
+        # Inverse view: representative id -> list of ids attached to it.
+        self.duplicates = {}
+        for a, b in self.parents.items():
+            self.duplicates[b] = self.duplicates.get(b, []) + [a]
+        self.nodes = {}
+        for node_id in range(self.features_dset.num_rows):
+            # Items attached to a representative do not get a node of their own.
+            if node_id in self.parents:
+                continue
+            else:
+                self.nodes[node_id] = {
+                    "node_id": node_id,
+                    "parent_id": -1,
+                    "children": [],
+                    "children_ids": [],
+                    "example_ids": [node_id],
+                    "weight": 1,
+                    # NOTE(review): fixed at 0.98 regardless of identical_threshold — confirm intent.
+                    "merge_threshold": 0.98,
+                    "depth": 0,
+                }
+    def make_merge_nodes(self, identical_threshold, thres_step):
+        """Merge nodes bottom-up following the candidate pairs, best score first.
+
+        Only pairs scoring below `identical_threshold` are processed here. For
+        each pair, both items are resolved to their current top-level nodes;
+        if these differ they are either folded together (when the larger-id
+        node is already a merge node at the current threshold) or joined under
+        a newly created node.
+
+        Args:
+            identical_threshold: starting value of the running threshold.
+            thres_step: amount subtracted from the running threshold each time
+                a pair scores below it.
+
+        Returns:
+            The next unused node id (one past the last node created).
+        """
+        new_node_id = self.features_dset.num_rows
+        current_thres = identical_threshold
+        below_identical = self.sorted_scores < identical_threshold
+        merge_ids = self.sorted_indices[below_identical]
+        merge_scores = self.sorted_scores[below_identical]
+        for (node_id_a, node_id_b), merge_score in tqdm(
+            zip(merge_ids, merge_scores), total=len(merge_ids)
+        ):
+            if merge_score.item() < current_thres:
+                current_thres -= thres_step
+            # Work with plain ints: the previous code used the 0-dim tensors as
+            # keys of self.parents, which hash by identity, so those shortcut
+            # entries were never found again and only accumulated in the dict.
+            leaf_a = node_id_a.item()
+            leaf_b = node_id_b.item()
+            merge_a = leaf_a
+            while self.parents.get(merge_a, -1) != -1:
+                merge_a = self.parents[merge_a]
+            if merge_a != leaf_a:
+                # Shortcut the item straight to its current top-level node.
+                self.parents[leaf_a] = merge_a
+            merge_b = leaf_b
+            while self.parents.get(merge_b, -1) != -1:
+                merge_b = self.parents[merge_b]
+            if merge_b != leaf_b:
+                self.parents[leaf_b] = merge_b
+            if merge_a == merge_b:
+                # Both items already sit under the same top-level node.
+                continue
+            # After this swap node_a is always the node with the larger id.
+            merge_b, merge_a = sorted([merge_a, merge_b])
+            node_a = self.nodes[merge_a]
+            node_b = self.nodes[merge_b]
+            if (node_a["depth"]) > 0 and min(
+                node_a["merge_threshold"], node_b["merge_threshold"]
+            ) == current_thres:
+                # Fold node_b into the existing merge node node_a: a leaf is
+                # added as a child, a merge node contributes its children.
+                node_a["depth"] = max(node_a["depth"], node_b["depth"])
+                node_a["weight"] += node_b["weight"]
+                node_a["children_ids"] += (
+                    node_b["children_ids"]
+                    if node_b["depth"] > 0
+                    else [node_b["node_id"]]
+                )
+                for cid in node_b["children_ids"]:
+                    self.nodes[cid]["parent_id"] = node_a["node_id"]
+                    self.parents[cid] = node_a["node_id"]
+                node_b["parent_id"] = node_a["node_id"]
+                self.parents[node_b["node_id"]] = node_a["node_id"]
+            else:
+                # Join the two nodes under a fresh parent.
+                new_nid = new_node_id
+                new_node_id += 1
+                new_node = {
+                    "node_id": new_nid,
+                    "parent_id": -1,
+                    "children_ids": [node_a["node_id"], node_b["node_id"]],
+                    "example_ids": [],
+                    "weight": node_a["weight"] + node_b["weight"],
+                    "merge_threshold": current_thres,
+                    "depth": max(node_a["depth"], node_b["depth"]) + 1,
+                }
+                node_a["parent_id"] = new_nid
+                node_b["parent_id"] = new_nid
+                self.parents[node_a["node_id"]] = new_nid
+                self.parents[node_b["node_id"]] = new_nid
+                # Point the two items at the new top-level node as well.
+                self.parents[leaf_a] = new_nid
+                self.parents[leaf_b] = new_nid
+                self.nodes[new_nid] = new_node
+        return new_node_id
+    def collapse_nodes(self, node, min_weight):
+        """Recursively simplify the subtree under `node`, in place, and return `node`.
+
+        Children lighter than `min_weight` are dissolved into the node's
+        "extras"; a node left with a single heavy child takes over that
+        child's children. Each visited node gains "extras", "example_ids"
+        (all items below it) and "embedding_sum" (sum of their embeddings).
+
+        Raises:
+            AssertionError: if the number of collected example ids differs
+                from the node's "weight".
+        """
+        # Heavy children survive as nodes (after being collapsed themselves).
+        children = [
+            self.collapse_nodes(self.nodes[cid], min_weight)
+            for cid in node["children_ids"]
+            if self.nodes[cid]["weight"] >= min_weight
+        ]
+        # Items from light children, plus the items held directly by this node.
+        extras = [
+            lid
+            for cid in node["children_ids"]
+            if self.nodes[cid]["weight"] < min_weight
+            for lid in self.collapse_nodes(self.nodes[cid], min_weight)["example_ids"]
+        ] + node["example_ids"]
+        # Sum of the extras' embeddings, or a zero vector when there are none.
+        extras_embed = (
+            torch.cat(
+                [self.embeddings[eid][None, :] for eid in extras],
+                dim=0,
+            ).sum(dim=0)
+            if len(extras) > 0
+            else torch.zeros(self.embeddings.shape[-1])
+        )
+        if len(children) == 0:
+            # No heavy child: the node becomes a leaf holding all its items.
+            node["extras"] = extras
+            node["children_ids"] = []
+            node["example_ids"] = extras
+            node["embedding_sum"] = extras_embed
+        elif len(children) == 1:
+            # Single heavy child: skip that level by adopting its children and extras.
+            node["extras"] = extras + children[0]["extras"]
+            node["children_ids"] = children[0]["children_ids"]
+            node["example_ids"] = extras + children[0]["example_ids"]
+            node["embedding_sum"] = extras_embed + children[0]["embedding_sum"]
+        else:
+            # Several heavy children: keep them and aggregate their items and sums.
+            node["extras"] = extras
+            node["children_ids"] = [child["node_id"] for child in children]
+            node["example_ids"] = extras + [
+                eid for child in children for eid in child["example_ids"]
+            ]
+            node["embedding_sum"] = (
+                extras_embed
+                + torch.cat(
+                    [child["embedding_sum"][None, :] for child in children],
+                    dim=0,
+                ).sum(dim=0)
+            )
+        # Consistency check between the collected items and the recorded weight.
+        assert (
+            len(node["example_ids"]) == node["weight"]
+        ), f"stuck at {node['node_id']} - {len(node['example_ids'])} - {node['weight']}"
+        return node
+    def finalize_node(self, node, parent_id, n_examplars, with_labels):
+        """Append an export record for `node` (and, recursively, its subtree) to `self.tree_node_list`.
+
+        Ids in the output are positions in `self.tree_node_list`, so a parent
+        always precedes its children. Reads the "embedding_sum", "example_ids",
+        "extras" and "children_ids" entries that collapse_nodes sets.
+
+        Args:
+            node: node dict from `self.nodes`.
+            parent_id: output id of the parent, or -1 for the root.
+            n_examplars: maximum number of examplars stored per record.
+            with_labels: whether to count the "label" values of the node's items.
+
+        Returns:
+            The output id assigned to `node`.
+        """
+        new_node_id = len(self.tree_node_list)
+        new_node = {
+            "node_id": new_node_id,
+            "parent_id": parent_id,
+            "depth": 0
+            if parent_id == -1
+            else self.tree_node_list[parent_id]["depth"] + 1,
+            "merged_at": node["merge_threshold"],
+            "weight": node["weight"],
+            "is_extra": False,
+        }
+        # Register the record before recursing so children can look up its depth.
+        self.tree_node_list += [new_node]
+        # Unit-length direction of the summed embeddings of the node's items.
+        centroid = node["embedding_sum"] / node["embedding_sum"].norm()
+        new_node["centroid"] = centroid.tolist()
+        new_node["examplars"] = get_examplars(
+            node["example_ids"],
+            centroid,
+            self.embeddings,
+            self.features_dset,
+            n_examplars,
+        )
+        label_counts = {}
+        if with_labels:
+            for eid in node["example_ids"]:
+                label = self.features_dset[eid]["label"]
+                label_counts[label] = label_counts.get(label, 0) + 1
+        # (label, count) pairs, most frequent first; empty when with_labels is False.
+        new_node["label_counts"] = sorted(
+            label_counts.items(), key=lambda x: x[1], reverse=True
+        )
+        if len(node["children_ids"]) == 0:
+            new_node["children_ids"] = []
+        else:
+            children = [
+                self.nodes[cid]
+                for cid in pretty_order(self.nodes, node["children_ids"])
+            ]
+            children_ids = [
+                self.finalize_node(child, new_node_id, n_examplars, with_labels)
+                for child in children
+            ]
+            new_node["children_ids"] = children_ids
+            if len(node["extras"]) > 0:
+                # Items held directly by an internal node are exposed as one
+                # additional childless record that reuses the parent's centroid.
+                extra_node = {
+                    "node_id": len(self.tree_node_list),
+                    "parent_id": new_node_id,
+                    "depth": new_node["depth"] + 1,
+                    "merged_at": node["merge_threshold"],
+                    "weight": len(node["extras"]),
+                    "is_extra": True,
+                    "centroid": new_node["centroid"],
+                    "examplars": get_examplars(
+                        node["extras"],
+                        centroid,
+                        self.embeddings,
+                        self.features_dset,
+                        n_examplars,
+                    ),
+                }
+                self.tree_node_list += [extra_node]
+                label_counts = {}
+                if with_labels:
+                    for eid in node["extras"]:
+                        label = self.features_dset[eid]["label"]
+                        label_counts[label] = label_counts.get(label, 0) + 1
+                extra_node["label_counts"] = sorted(
+                    label_counts.items(), key=lambda x: x[1], reverse=True
+                )
+                extra_node["children_ids"] = []
+                new_node["children_ids"] += [extra_node["node_id"]]
+        return new_node_id
+    def make_hover_text(self, num_examples=5, text_width=64, with_labels=False):
+        """Set "hover_text" on every record of `self.tree_node_list`.
+
+        The text is a "<br>"-joined summary: a header line, up to
+        `num_examples` examplars (text cut to `text_width` characters) and,
+        when `with_labels` is true, each examplar's label plus the label counts.
+        """
+        for nid, node in enumerate(self.tree_node_list):
+            header = f"Node {nid:3d} - {node['weight']:6d} items - Linking threshold: {node['merged_at']:.2f}"
+            line_list = [header]
+            for examplar in node["examplars"][:num_examples]:
+                entry = f"{examplar['ids']:6d}:{examplar['score']:.2f} - {examplar['field'][:text_width]}"
+                if with_labels:
+                    entry += f" - {examplar['label']}"
+                line_list.append(entry)
+            if with_labels:
+                line_list.append("Label distribution")
+                line_list.extend(
+                    f" - label: {label} - {count} items"
+                    for label, count in node["label_counts"]
+                )
+            node["hover_text"] = "<br>".join(line_list)
+    def build_tree(
+        self,
+        batch_size=10000,
+        low_thres=0.5,
+        identical_threshold=0.95,
+        thres_step=0.05,
+        min_weight=10,
+        n_examplars=25,
+        hover_examples=5,
+        hover_text_width=64,
+    ):
+        """Run the full clustering pipeline and fill `self.tree_node_list`.
+
+        Steps, in order: collect candidate pairs, create leaf nodes, merge
+        them, join all top-level nodes under one root, collapse light nodes,
+        export the records and attach hover texts. Requires
+        `self.embeddings_dset` and `self.features_dset` to be set.
+
+        Args:
+            batch_size: block size used by prepare_merges.
+            low_thres: minimum score for a pair to be considered.
+            identical_threshold: score from which two items count as duplicates.
+            thres_step: step of the running merge threshold.
+            min_weight: nodes lighter than this are dissolved into their parent.
+            n_examplars: examplars stored per exported record.
+            hover_examples: examplars shown in each hover text.
+            hover_text_width: maximum characters of item text in hover texts.
+        """
+        self.prepare_merges(batch_size, low_thres)
+        self.make_starting_nodes(identical_threshold)
+        # make a root to join all trees
+        # make_merge_nodes returns the next free node id, which the root takes.
+        root_node_id = self.make_merge_nodes(identical_threshold, thres_step)
+        top_nodes = [node for node in self.nodes.values() if node["parent_id"] == -1]
+        root_node = {
+            "node_id": root_node_id,
+            "parent_id": -1,
+            "children_ids": [node["node_id"] for node in top_nodes],
+            "example_ids": [],
+            "weight": sum([node["weight"] for node in top_nodes]),
+            "merge_threshold": -1.0,
+            "depth": 1 + max([node["depth"] for node in top_nodes]),
+        }
+        for node in top_nodes:
+            node["parent_id"] = root_node_id
+        self.nodes[root_node_id] = root_node
+        # Called for its in-place effect on the node dicts; the return value is the root itself.
+        _ = self.collapse_nodes(root_node, min_weight)
+        self.tree_node_list = []
+        # Label statistics are produced only when a label field was configured.
+        self.finalize_node(
+            root_node,
+            -1,
+            n_examplars,
+            with_labels=(self.label_name is not None),
+        )
+        self.make_hover_text(
+            num_examples=hover_examples,
+            text_width=hover_text_width,
+            with_labels=(self.label_name is not None),
+        )
+    def push_to_hub(self, use_auth_token=None, file_name=None):
+        """Write the tree as gzipped JSON lines into the cache directory and upload it.
+
+        Args:
+            use_auth_token: token passed to the upload call.
+            file_name: base name of the file (without extension); "tree" when None.
+
+        Returns:
+            Whatever `HfApi.upload_file` returns for the uploaded file.
+        """
+        path_list = self.cache_path_list
+        name = "tree" if file_name is None else file_name
+        tree_file = pjoin(pjoin(*path_list), f"{name}.jsonl.gz")
+        # The with-block closes (and flushes) the gzip stream even if a node
+        # fails to serialize; previously the handle leaked in that case.
+        with gzip.open(tree_file, "w") as fout:
+            for node in tqdm(self.tree_node_list):
+                fout.write((json.dumps(node) + "\n").encode("utf-8"))
+        api = HfApi()
+        # The path inside the repo drops the leading local cache directory component.
+        file_loc = api.upload_file(
+            path_or_fileobj=tree_file,
+            path_in_repo=pjoin(pjoin(*path_list[1:]), f"{name}.jsonl.gz"),
+            repo_id="yjernite/datasets_clusters",
+            token=use_auth_token,
+            repo_type="dataset",
+        )
+        return file_loc
+class Clustering:
+    """Read-only view over a stored cluster tree, with per-node cached plots and tables."""
+    def __init__(
+        self,
+        dataset_name,
+        config_name,
+        split_name,
+        input_field_path,
+        label_name,
+        num_rows,
+        n_examplars=10,
+        model_name=_DEFAULT_MODEL,
+        file_name=None,
+        max_depth_subtree=3,
+    ):
+        """Load the node list of a stored tree from the "yjernite/datasets_clusters" dataset repo.
+
+        The first six arguments and `model_name` are used to build the path of
+        the tree file inside the repo.
+
+        Args:
+            n_examplars: maximum number of examplars returned per node.
+            file_name: base name of the tree file; "tree" when None.
+            max_depth_subtree: depth limit passed to make_tree_plot for subtrees.
+        """
+        self.dataset_name = dataset_name
+        self.config_name = config_name
+        self.split_name = split_name
+        self.input_field_path = input_field_path
+        self.label_name = label_name
+        self.num_rows = num_rows
+        self.model_name = model_name
+        self.n_examplars = n_examplars
+        self.file_name = "tree" if file_name is None else file_name
+        # Path components of the tree file within the repo.
+        self.repo_path_list = [
+            dataset_name.replace("/", "---"),
+            f"{'default' if config_name is None else config_name}",
+            f"{'train' if split_name is None else split_name}",
+            f"field-{'->'.join(input_field_path)}-label-{label_name}",
+            f"{num_rows}_rows",
+            model_name.replace("/", "---"),
+            f"{self.file_name}.jsonl.gz",
+        ]
+        self.repo_path = pjoin(*self.repo_path_list)
+        self.node_list = load_dataset(
+            "yjernite/datasets_clusters", data_files=[self.repo_path]
+        )["train"]
+        # One dict per node, filled by the set_* methods with that node's cached artifacts.
+        self.node_reps = [{} for node in self.node_list]
+        self.max_depth_subtree = max_depth_subtree
+    def set_full_tree(self):
+        """Build the plot of the whole tree (rooted at node 0) once and cache it."""
+        root_reps = self.node_reps[0]
+        # dict.get(key, default) evaluates its default eagerly, so the old code
+        # rebuilt the figure on every call; build it only when it is missing.
+        if "tree" not in root_reps:
+            root_reps["tree"] = make_tree_plot(
+                self.node_list,
+                0,
+            )
+    def get_full_tree(self):
+        """Return the cached plot of the whole tree, building it on first use."""
+        self.set_full_tree()
+        return self.node_reps[0]["tree"]
+    def set_node_subtree(self, node_id):
+        """Build the depth-limited plot of the subtree under `node_id` once and cache it."""
+        reps = self.node_reps[node_id]
+        # dict.get(key, default) evaluates its default eagerly, so the old code
+        # rebuilt the figure on every call; build it only when it is missing.
+        if "subtree" not in reps:
+            reps["subtree"] = make_tree_plot(
+                self.node_list,
+                node_id,
+                self.max_depth_subtree,
+            )
+    def get_node_subtree(self, node_id):
+        """Return the cached subtree plot for `node_id`, building it on first use."""
+        self.set_node_subtree(node_id)
+        return self.node_reps[node_id]["subtree"]
+    def set_node_examplars(self, node_id):
+        """Build the examplar table for `node_id` once and cache it.
+
+        The table has columns "id", "score", "field" and "label" ("N/A" when
+        an examplar has no label) and at most `self.n_examplars` rows.
+        """
+        reps = self.node_reps[node_id]
+        # dict.get(key, default) evaluates its default eagerly, so the old code
+        # rebuilt the DataFrame on every call; build it only when it is missing.
+        if "examplars" not in reps:
+            reps["examplars"] = pd.DataFrame(
+                [
+                    {
+                        "id": exple["ids"],
+                        "score": exple["score"],
+                        "field": exple["field"],
+                        "label": exple.get("label", "N/A"),
+                    }
+                    for exple in self.node_list[node_id]["examplars"]
+                ][: self.n_examplars]
+            )
+    def get_node_examplars(self, node_id):
+        """Return the cached examplar table for `node_id`, building it on first use."""
+        self.set_node_examplars(node_id)
+        return self.node_reps[node_id]["examplars"]
+    def set_node_label_chart(self, node_id):
+        """Build the pie chart of label counts for `node_id` once and cache it."""
+        reps = self.node_reps[node_id]
+        # dict.get(key, default) evaluates its default eagerly, so the old code
+        # rebuilt the chart on every call; build it only when it is missing.
+        if "label_chart" not in reps:
+            reps["label_chart"] = px.pie(
+                values=[ct for lab, ct in self.node_list[node_id]["label_counts"]],
+                names=[
+                    f"Label {lab}"
+                    for lab, ct in self.node_list[node_id]["label_counts"]
+                ],
+                color_discrete_sequence=px.colors.sequential.Rainbow,
+                width=400,
+                height=400,
+            )
+    def get_node_label_chart(self, node_id):
+        """Return the cached label pie chart for `node_id`, building it on first use."""
+        self.set_node_label_chart(node_id)
+        return self.node_reps[node_id]["label_chart"]

data_measurements_clusters/dataset_utils.py ADDED Viewed

	@@ -0,0 +1,292 @@

+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+from os.path import exists
+from os.path import join as pjoin
+from datasets import Dataset, load_dataset, load_from_disk
+from tqdm import tqdm
+_CACHE_DIR = "cache_dir"
+# grab first N rows of a dataset from the hub
+def load_truncated_dataset(
+    dataset_name,
+    config_name=None,
+    split_name=None,
+    num_rows=0,
+    use_streaming=True,
+    use_auth_token=None,
+    use_dataset=None,
+):
+    """
+    This function loads the first `num_rows` items of a dataset for a
+    given `config_name` and `split_name`.
+    When the dataset is streamable, we iterate through the first
+    `num_rows` examples in streaming mode, write them to a jsonl file,
+    then create a new dataset from the json.
+    This is the most direct way to make a Dataset from an IterableDataset
+    as of datasets version 1.6.1.
+    Otherwise, we download the full dataset and select the first
+    `num_rows` items
+    Args:
+        dataset_name (string):
+            dataset id in the dataset library
+        config_name (string):
+            dataset configuration
+        split_name (string):
+            optional split name, defaults to `train`
+        num_rows (int):
+            number of rows to truncate the dataset to, <= 0  means no truncation
+        use_streaming (bool):
+            whether to use streaming when the dataset supports it
+        use_auth_token (string):
+            HF authentication token to access private datasets
+        use_dataset (Dataset):
+            use existing dataset instead of getting one from the hub
+    Returns:
+        Dataset:
+            the truncated dataset as a Dataset object
+    """
+    split_name = "train" if split_name is None else split_name
+    cache_name = f"{dataset_name.replace('/', '---')}_{'default' if config_name is None else config_name}_{split_name}_{num_rows}"
+    if use_streaming:
+        if not exists(pjoin(_CACHE_DIR, "tmp", f"{cache_name}.jsonl")):
+            iterable_dataset = (
+                load_dataset(
+                    dataset_name,
+                    name=config_name,
+                    split=split_name,
+                    cache_dir=pjoin(_CACHE_DIR, "tmp", cache_name + "_temp"),
+                    streaming=True,
+                    use_auth_token=use_auth_token,
+                )
+                if use_dataset is None
+                else use_dataset
+            )
+            if num_rows > 0:
+                iterable_dataset = iterable_dataset.take(num_rows)
+            f = open(
+                pjoin(_CACHE_DIR, "tmp", f"{cache_name}.jsonl"), "w", encoding="utf-8"
+            )
+            for row in tqdm(iterable_dataset):
+                _ = f.write(json.dumps(row) + "\n")
+            f.close()
+        dataset = Dataset.from_json(
+            pjoin(_CACHE_DIR, "tmp", f"{cache_name}.jsonl"),
+            cache_dir=pjoin(_CACHE_DIR, "tmp", cache_name + "_jsonl"),
+        )
+    else:
+        full_dataset = (
+            load_dataset(
+                dataset_name,
+                name=config_name,
+                split=split_name,
+                use_auth_token=use_auth_token,
+                cache_dir=pjoin(_CACHE_DIR, "tmp", cache_name + "_temp"),
+            )
+            if use_dataset is None
+            else use_dataset
+        )
+        if num_rows > 0:
+            dataset = full_dataset.select(range(num_rows))
+        else:
+            dataset = full_dataset
+    return dataset
+# get all instances of a specific field in a dataset with indices and labels
+def extract_features(examples, indices, input_field_path, label_name=None):
+    """
+    This function prepares examples for further processing by:
+        - returning an "unrolled" list of all the fields denoted by input_field_path
+        - with the indices corresponding to the example the field item came from
+        - optionally, the corresponding label is also returned with each field item
+    Args:
+        examples (dict):
+            a dictionary of lists, provided dataset.map with batched=True
+        indices (list):
+            a list of indices, provided dataset.map with with_indices=True
+        input_field_path (tuple):
+            a tuple indicating the field we want to extract. Can be a singleton
+            for top-level features (e.g. `("text",)`) or a full path for nested
+            features (e.g. `("answers", "text")`) to get all answer strings in
+            SQuAD
+        label_name (string):
+            optionally used to align the field items with labels. Currently,
+            returns the top-most field that has this name, which may fail in some
+            edge cases
+            TODO: make it so the label is specified through a full path
+    Returns:
+        Dict:
+            a dictionary of lists, used by dataset.map with batched=True.
+            labels are all None if label_name!=None but label_name is not found
+            TODO: raised an error if label_name is specified but not found
+    """
+    top_name = input_field_path[0]
+    if label_name is not None and label_name in examples:
+        item_list = [
+            {"index": i, "label": label, "items": items}
+            for i, items, label in zip(
+                indices, examples[top_name], examples[label_name]
+            )
+        ]
+    else:
+        item_list = [
+            {"index": i, "label": None, "items": items}
+            for i, items in zip(indices, examples[top_name])
+        ]
+    for field_name in input_field_path[1:]:
+        new_item_list = []
+        for dct in item_list:
+            if label_name is not None and label_name in dct["items"]:
+                if isinstance(dct["items"][field_name], list):
+                    new_item_list += [
+                        {"index": dct["index"], "label": label, "items": next_item}
+                        for next_item, label in zip(
+                            dct["items"][field_name], dct["items"][label_name]
+                        )
+                    ]
+                else:
+                    new_item_list += [
+                        {
+                            "index": dct["index"],
+                            "label": dct["items"][label_name],
+                            "items": dct["items"][field_name],
+                        }
+                    ]
+            else:
+                if isinstance(dct["items"][field_name], list):
+                    new_item_list += [
+                        {
+                            "index": dct["index"],
+                            "label": dct["label"],
+                            "items": next_item,
+                        }
+                        for next_item in dct["items"][field_name]
+                    ]
+                else:
+                    new_item_list += [
+                        {
+                            "index": dct["index"],
+                            "label": dct["label"],
+                            "items": dct["items"][field_name],
+                        }
+                    ]
+        item_list = new_item_list
+    res = (
+        {
+            "ids": [dct["index"] for dct in item_list],
+            "field": [dct["items"] for dct in item_list],
+        }
+        if label_name is None
+        else {
+            "ids": [dct["index"] for dct in item_list],
+            "field": [dct["items"] for dct in item_list],
+            "label": [dct["label"] for dct in item_list],
+        }
+    )
+    return res
+# grab some examples and extract interesting fields
+def prepare_clustering_dataset(
+    dataset_name,
+    input_field_path,
+    label_name=None,
+    config_name=None,
+    split_name=None,
+    num_rows=0,
+    use_streaming=True,
+    use_auth_token=None,
+    cache_dir=_CACHE_DIR,
+    use_dataset=None,
+):
+    """
+    This function loads the first `num_rows` items of a dataset for a
+    given `config_name` and `split_name`, and extracts all instances of a field
+    of interest denoted by `input_field_path` along with the indices of the
+    examples the instances came from and optionall their labels (`label_name`)
+    in the original dataset
+    Args:
+        dataset_name (string):
+            dataset id in the dataset library
+        input_field_path (tuple):
+            a tuple indicating the field we want to extract. Can be a singleton
+            for top-level features (e.g. `("text",)`) or a full path for nested
+            features (e.g. `("answers", "text")`) to get all answer strings in
+            SQuAD
+        label_name (string):
+            optionally used to align the field items with labels. Currently,
+            returns the top-most field that has this name, which fails in edge cases
+        config_name (string):
+            dataset configuration
+        split_name (string):
+            optional split name, defaults to `train`
+        num_rows (int):
+            number of rows to truncate the dataset to, <= 0  means no truncation
+        use_streaming (bool):
+            whether to use streaming when the dataset supports it
+        use_auth_token (string):
+            HF authentication token to access private datasets
+        use_dataset (Dataset):
+            use existing dataset instead of getting one from the hub
+    Returns:
+        Dataset:
+            the extracted dataset as a Dataset object. Note that if there is more
+            than one instance of the field per example in the original dataset
+            (e.g. multiple answers per QA example), the returned dataset will
+            have more than `num_rows` rows
+        string:
+            the path to the newsly created dataset directory
+    """
+    cache_path = [
+        cache_dir,
+        dataset_name.replace("/", "---"),
+        f"{'default' if config_name is None else config_name}",
+        f"{'train' if split_name is None else split_name}",
+        f"field-{'->'.join(input_field_path)}-label-{label_name}",
+        f"{num_rows}_rows",
+        "features_dset",
+    ]
+    if exists(pjoin(*cache_path)):
+        pre_clustering_dset = load_from_disk(pjoin(*cache_path))
+    else:
+        truncated_dset = load_truncated_dataset(
+            dataset_name,
+            config_name,
+            split_name,
+            num_rows,
+            use_streaming,
+            use_auth_token,
+            use_dataset,
+        )
+        def batch_func(examples, indices):
+            return extract_features(examples, indices, input_field_path, label_name)
+        pre_clustering_dset = truncated_dset.map(
+            batch_func,
+            remove_columns=truncated_dset.features,
+            batched=True,
+            with_indices=True,
+        )
+        for i in range(1, len(cache_path) - 1):
+            if not exists(pjoin(*cache_path[:i])):
+                os.mkdir(pjoin(*cache_path[:i]))
+        pre_clustering_dset.save_to_disk(pjoin(*cache_path))
+    return pre_clustering_dset, pjoin(*cache_path)

posts/conclusion.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import streamlit as st
+# Post metadata.
+# NOTE(review): presumably read by the app when listing posts - confirm in app.py
+title = "Key Takeaways"
+description = "Review of the information from previous pages."
+date = "2022-01-26"
+thumbnail = "images/raised_hand.png"
+# Markdown shown at the top of the page by run_article, above the questions
+__KEY_TAKEAWAYS = """
+# Key Takeaways and Review
+Here are some of the main ideas we have conveyed in this exploration:
+- Defining hate speech is hard and changes depending on your context and goals.
+- Capturing a snapshot of what you've defined to be hate speech in a dataset is hard.
+- Models learn lots of different things based on the data it sees, and that can include things you didn't intend for them to learn.
+Next, please answer the following questions about the information presented in this demo:
+"""
+def run_article():
+    st.markdown(__KEY_TAKEAWAYS)
+    st.text_area(
+        "Did you click on any of the links provided in the **Hate Speech in ACM** page? If so, which one did you find most surprising?"
+    )
+    st.text_area(
+        "Of the datasets presented in the **Dataset Exploration** page, which one did you think best represented content that should be moderated? Which worst?"
+    )
+    st.text_area(
+        "Of the models presented in the **Model Exploration** page, which one did you think performed best? Which worst?"
+    )
+    st.text_area(
+        "Any additional comments about the materials?"
+    )
+    # from paper
+    st.text_area(
+        "How would you describe your role? E.g. model developer, dataset developer, domain expert, policy maker, platform manager, community advocate, platform user, student"
+    )
+    st.text_area(
+        "Why are you interested in content moderation?"
+    )
+    st.text_area(
+        "Which modules did you use the most?"
+    )
+    st.text_area(
+        "Which module did you find the most informative?"
+    )
+    st.text_area(
+        "Which application were you most interested in learning more about?"
+    )
+    st.text_area(
+        "What surprised you most about the datasets?"
+    )
+    st.text_area(
+        "Which models are you most concerned about as a user?"
+    )
+    st.text_area(
+        "Do you have any comments or suggestions?"
+    )

posts/context.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import streamlit as st
+# Post metadata.
+# NOTE(review): presumably read by the app when listing posts - confirm in app.py
+title = "Hate Speech in ACM"
+description = "The history and development of hate speech detection as a modeling task"
+date = "2022-01-26"
+thumbnail = "images/prohibited.png"
+# Markdown body of the "ACM definition" expander
+__ACM_SECTION = """
+Content moderation is a collection of interventions used by online platforms to partially obscure
+or remove entirely from user-facing view content that is objectionable based on the company's values
+or community guidelines, which vary from platform to platform.
+[Sarah T. Roberts (2014)](https://yalebooks.yale.edu/book/9780300261479/behind-the-screen/) describes
+content moderation as "the organized practice of screening user-generated content (UGC)
+posted to Internet sites, social media, and other online outlets" (p. 12).
+[Tarleton Gillespie (2021)](https://yalebooks.yale.edu/book/9780300261431/custodians-internet/) writes
+that platforms moderate content "both to protect one user from another,
+or one group from its antagonists, and to remove the offensive, vile, or illegal.''
+While there are a variety of approaches to this problem, in this tool, we focus on automated content moderation,
+which is the application of algorithms to the classification of problematic content.
+Content that is subject to moderation can be user-directed (e.g. targeted harassment of a particular user
+in comments or direct messages) or posted to a personal account (e.g. user-created posts that contain hateful
+remarks against a particular social group).
+"""
+# Markdown body of the "Current Approaches" expander
+__CURRENT_APPROACHES = """
+Automated content moderation has relied both on analysis of the media itself (e.g. using methods from natural
+language processing and computer vision) as well as user dynamics (e.g. whether the user sending the content
+to another user shares followers with the recipient, or whether the user posting the content is a relatively new account).
+Often, the ACM pipeline is fed by user-reported content. Within the realm of text-based ACM, approaches vary
+from wordlist-based approaches to data-driven, machine learning models. Common datasets used for training and
+evaluating hate speech detectors can be found at [https://hatespeechdata.com/](https://hatespeechdata.com/).
+"""
+# Markdown body of the "Current Challenges" expander
+__CURRENT_CHALLENGES = """
+Combating hateful content on the Internet continues to be a challenge. A 2021 survey of respondents
+in the United States, conducted by Anti-Defamation League, found an increase in online hate & harassment
+directed at LGBTQ+, Asian American, Jewish, and African American individuals.
+### Technical challenges for data-driven systems
+With respect to models that are based on training data, datasets encode worldviews, and so a common challenge
+lies in having insufficient data or data that only reflects a limited worldview. For example, a recent
+study found that Tweets posted by drag queens were more often rated by an automated system as toxic than
+Tweets posted by white supremacists.
+This may be due, in part, to the labeling schemes and choices made for the data used in training the model,
+as well as particular company policies that are invoked when making these labeling choices.
+(This all needs to be spelled out better!)
+### Context matters for content moderation.
+*Counterspeech* is "any direct response to hateful or harmful speech which seeks to undermine it"
+(from [Dangerous Speech Project](https://dangerousspeech.org/counterspeech/)). Counterspeech has been shown
+to be an important community self-moderation tool for reducing instances of hate speech (see
+[Hangartner et al. 2021](https://www.pnas.org/doi/10.1073/pnas.2116310118)), but counterspeech is often
+incorrectly categorized as hate speech by automatic systems due to the counterspeech making direct reference
+to or quoting the original hate speech. Such system behavior silences those who are trying to push back against
+hateful and toxis speech, and, if the flagged content is hidden automatically, prevents others from seeing the
+counterspeech.
+See [van Aken et al. 2018](https://aclanthology.org/W18-5105.pdf) for a detailed list of examples that
+automatic systems frequently misclassify.
+"""
+# Markdown link list shown in the "In their own Words" column
+__SELF_EXAMPLES = """
+- [**(FB)(TOU)** - *Facebook Community Standards*](https://transparency.fb.com/policies/community-standards/)
+- [**(FB)(Blog)** - *What is Hate Speech? (2017)*](https://about.fb.com/news/2017/06/hard-questions-hate-speech/)
+- [**(NYT)(Blog)** - * New York Times on their partnership with JigSaw*](https://open.nytimes.com/to-apply-machine-learning-responsibly-we-use-it-in-moderation-d001f49e0644)
+- [**(NYT)(FAQ)** - *New York Times on their moderation policy*](https://help.nytimes.com/hc/en-us/articles/115014792387-Comments)
+- [**(Reddit)(TOU)** - *Reddit General Content Policies*](https://www.redditinc.com/policies/content-policy)
+- [**(Reddit)(Blog)** - *AutoMod - help scale moderation without ML*](https://mods.reddithelp.com/hc/en-us/articles/360008425592-Moderation-Tools-overview)
+- [**(Google)(Blog)** - *Google Search Results Moderation*](https://blog.google/products/search/when-and-why-we-remove-content-google-search-results/)
+- [**(Google)(Blog)** - *JigSaw Case Studies*](https://www.perspectiveapi.com/case-studies/)
+- [**(YouTube)(TOU)** - *YouTube Community Guidelines*](https://www.youtube.com/howyoutubeworks/policies/community-guidelines/)
+"""
+# Markdown link list shown in the "Critical Writings" column
+__CRITIC_EXAMPLES = """
+- [Social Media and Extremism - Questions about January 6th 2021](https://thehill.com/policy/technology/589651-jan-6-panel-subpoenas-facebook-twitter-reddit-and-alphabet/)
+- [Over-Moderation of LGBTQ content on YouTube](https://www.gaystarnews.com/article/youtube-lgbti-content/)
+- [Disparate Impacts of Moderation](https://www.aclu.org/news/free-speech/time-and-again-social-media-giants-get-content-moderation-wrong-silencing-speech-about-al-aqsa-mosque-is-just-the-latest-example/)
+- [Calls for Transparency](https://santaclaraprinciples.org/)
+- [Income Loss from Failures of Moderation](https://foundation.mozilla.org/de/blog/facebook-delivers-a-serious-blow-to-tunisias-music-scene/)
+- [Fighting Hate Speech, Silencing Drag Queens?](https://link.springer.com/article/10.1007/s12119-020-09790-w)
+- [Reddit Self Reflection on Lack of Content Policy](https://www.reddit.com/r/announcements/comments/gxas21/upcoming_changes_to_our_content_policy_our_board/)
+"""
+def run_article():
+    st.markdown("## Automatic Content Moderation (ACM)")
+    with st.expander("ACM definition", expanded=False):
+        st.markdown(__ACM_SECTION, unsafe_allow_html=True)
+    st.markdown("## Current approaches to ACM")
+    with st.expander("Current Approaches"):
+        st.markdown(__CURRENT_APPROACHES, unsafe_allow_html=True)
+    st.markdown("## Current challenges in ACM")
+    with st.expander("Current Challenges"):
+        st.markdown(__CURRENT_CHALLENGES, unsafe_allow_html=True)
+    st.markdown("## Examples of ACM in Use: in the Press and in their own Words")
+    col1, col2 = st.columns([4, 5])
+    with col1.expander("In their own Words"):
+        st.markdown(__SELF_EXAMPLES, unsafe_allow_html=True)
+    with col2.expander("Critical Writings"):
+        st.markdown(__CRITIC_EXAMPLES, unsafe_allow_html=True)

posts/dataset_exploration.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import logging
+from os import mkdir
+from os.path import isdir
+from os.path import join as pjoin
+from pathlib import Path
+import streamlit as st
+from data_measurements_clusters import Clustering
+# Post metadata.
+# NOTE(review): presumably read by the app when listing posts - confirm in app.py
+title = "Dataset Exploration"
+description = "Comparison of hate speech detection datasets"
+date = "2022-01-26"
+thumbnail = "images/books.png"
+# Markdown body of the "Collection" expander
+__COLLECT = """
+In order to turn observations of the world into data, choices must be made
+about what counts as data, where to collect data, and how to collect data.
+When collecting language data, this often means selecting websites that allow
+for easily collecting samples of text, and hate speech data is frequently
+collected from social media platforms like Twitter or forums like Wikipedia.
+Each of these decisions results in a specific sample of all the possible
+observations.
+"""
+# Markdown body of the "Annotation" expander
+__ANNOTATE = """
+Once the data is collected, further decisions must be made about how to
+label the data if the data is being used to train a classification system,
+as is common in hate speech detection. These labels must be defined in order
+for the dataset to be consistently labeled, which helps the classification
+model produce more consistent output. This labeling process, called
+*annotation*, can be done by the data collectors, by a set of trained
+annotators with relevant expert knowledge, or by online crowdworkers. Who
+is doing the annotating has a significant effect on the resulting set of
+labels ([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
+"""
+# Markdown body of the "Standardization" expander
+__STANDARDIZE = """
+As a relatively new task in NLP, the definitions that are used across
+different projects vary. Some projects target just hate speech, but others
+may label their data for ‘toxic’, ‘offensive’, or ‘abusive’ language. Still
+others may address related problems such as bullying and harassment.
+This variation makes it difficult to compare across datasets and their
+respective models. As these modeling paradigms become more established,
+definitions grounded in relevant sociological research will need to be
+agreed upon in order for datasets and models in ACM to appropriately
+capture the problems in the world that they set out to address. For more
+on this discussion, see
+[Madukwe et al 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
+[Fortuna et al 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
+"""
+# Markdown body of the "How to use the tool" expander
+__HOW_TO = """
+To use the tool, select a dataset. The tool will then show clusters of
+examples in the dataset that have been automatically determined to be similar
+to one another. Below that, you can see specific examples within the cluster,
+the labels for those examples, and the distribution of labels within the
+cluster. Note that cluster 0 will always be the full dataset.
+"""
+# Nested lookup table of the available clustering trees. The top-level keys
+# are the dataset names offered in the selection box; the nesting then goes
+# config -> split -> field path -> label -> num_rows -> model -> file name.
+# The innermost dict (the one holding 'dataset_name') is the set of keyword
+# arguments that run_article hands to download_tree, i.e. Clustering(**args).
+DSET_OPTIONS = {'classla/FRENK-hate-en': {'binary': {'train': {('text',): {'label': {100000: {
+       'sentence-transformers/all-mpnet-base-v2': {'tree': {'dataset_name': 'classla/FRENK-hate-en',
+         'config_name': 'binary',
+         'split_name': 'train',
+         'input_field_path': ('text',),
+         'label_name': 'label',
+         'num_rows': 100000,
+         'model_name': 'sentence-transformers/all-mpnet-base-v2',
+         'file_name': 'tree'}}}}}}}},
+ 'tweets_hate_speech_detection': {'default': {'train': {('tweet',): {'label': {100000: {
+       'sentence-transformers/all-mpnet-base-v2': {'tree': {'dataset_name': 'tweets_hate_speech_detection',
+         'config_name': 'default',
+         'split_name': 'train',
+         'input_field_path': ('tweet',),
+         'label_name': 'label',
+         'num_rows': 100000,
+         'model_name': 'sentence-transformers/all-mpnet-base-v2',
+         'file_name': 'tree'}}}}}}}},
+ 'ucberkeley-dlab/measuring-hate-speech': {'default': {'train': {('text',): {'hatespeech': {100000: {
+       'sentence-transformers/all-mpnet-base-v2': {'tree': {'dataset_name': 'ucberkeley-dlab/measuring-hate-speech',
+         'config_name': 'default',
+         'split_name': 'train',
+         'input_field_path': ('text',),
+         'label_name': 'hatespeech',
+         'num_rows': 100000,
+         'model_name': 'sentence-transformers/all-mpnet-base-v2',
+         'file_name': 'tree'}}}}}}}},
+}
+@st.cache(allow_output_mutation=True)
+def download_tree(args):
+    clusters = Clustering(**args)
+    return clusters
+def run_article():
+    st.markdown("# Making a Hate Speech Dataset")
+    st.markdown("## Collecting observations of the world")
+    with st.expander("Collection"):
+        st.markdown(__COLLECT, unsafe_allow_html=True)
+    st.markdown("## Annotating observations with task labels")
+    with st.expander("Annotation"):
+        st.markdown(__ANNOTATE, unsafe_allow_html=True)
+    st.markdown("## Standardizing the task")
+    with st.expander("Standardization"):
+        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
+    st.markdown("# Exploring datasets")
+    with st.expander("How to use the tool"):
+        st.markdown(__HOW_TO, unsafe_allow_html=True)
+    choose_dset = st.selectbox(
+        "Select dataset to visualize",
+        DSET_OPTIONS,
+    )
+    pre_args = DSET_OPTIONS[choose_dset]
+    args = pre_args
+    while not 'dataset_name' in args:
+        args = list(args.values())[0]
+    clustering = download_tree(args)
+    st.markdown("---\n")
+    full_tree_fig = clustering.get_full_tree()
+    st.plotly_chart(full_tree_fig, use_container_width=True)
+    st.markdown("---\n")
+    show_node = st.selectbox(
+        "Visualize cluster node:",
+        range(len(clustering.node_list)),
+    )
+    st.markdown(f"Node {show_node} has {clustering.node_list[show_node]['weight']} examples.")
+    st.markdown(f"Node {show_node} was merged at {clustering.node_list[show_node]['merged_at']:.2f}.")
+    examplars = clustering.get_node_examplars(show_node)
+    st.markdown("---\n")
+    label_fig = clustering.get_node_label_chart(show_node)
+    examplars_col, labels_col = st.columns([2, 1])
+    examplars_col.markdown("#### Node cluster examplars")
+    examplars_col.table(examplars)
+    labels_col.markdown("#### Node cluster labels")
+    labels_col.plotly_chart(label_fig, use_container_width=True)

posts/model_exploration.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import streamlit as st
+import json
+import random
+import sys
+import numpy as np
+import pandas as pd
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import pipeline
+# Post metadata.
+# NOTE(review): presumably read by the app when listing posts - confirm in app.py
+title = "Model Exploration"
+description = "Comparison of hate speech detection models"
+date = "2022-01-26"
+thumbnail = "images/robot.png"
+# Markdown body of the "Hate Speech Detection Models" expander
+__HATE_DETECTION = """
+Once the data has been collected using the definitions identified for the
+task, you can start training your model. At training, the model takes in
+the data with labels and learns the associated context in the input data
+for each label. Depending on the task design, the labels may be binary like
+'hateful' and 'non-hateful' or multiclass like 'neutral', 'offensive', and
+'attack'.
+When presented with a new input string, the model then predicts the
+likelihood that the input is classified as each of the available labels and
+returns the label with the highest likelihood as well as how confident the
+model is in its selection using a score from 0 to 1.
+Neural models such as transformers are frequently trained as general
+language models and then fine-tuned on specific classification tasks.
+These models can vary in their architecture and the optimization
+algorithms, sometimes resulting in very different output for the same
+input text.
+The models used below include:
+- [RoBERTa trained on FRENK dataset](https://huggingface.co/classla/roberta-base-frenk-hate)
+- [RoBERTa trained on Twitter Hate Speech](https://huggingface.co/cardiffnlp/twitter-roberta-base-hate)
+- [DeHateBERT model (trained on Twitter and StormFront)](https://huggingface.co/Hate-speech-CNERG/dehatebert-mono-english)
+- [RoBERTa trained on 11 English hate speech datasets](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r1-target)
+- [RoBERTa trained on 11 English hate speech datasets and Round 1 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r2-target)
+- [RoBERTa trained on 11 English hate speech datasets and Rounds 1 and 2 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r3-target)
+- [RoBERTa trained on 11 English hate speech datasets and Rounds 1, 2, and 3 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target)
+"""
+# Markdown body of the "HateCheck Examples" expander
+__HATECHECK = """
+[Röttinger et al. (2021)](https://aclanthology.org/2021.acl-long.4.pdf)
+developed a list of 3,901 test cases for hate speech detection models called
+HateCheck. HateCheck provides a number of templates long with placeholders for
+identity categories and hateful terms along with labels indicating whether a
+model should or should not categorize the instance as hate speech. For each
+case, they created several examples with different
+identity attributes to test models' abilities to detect hate speech towards
+a range of groups of people. Additionally, they used more difficult
+linguistic contexts such as adding negation or more nuanced words to try to fool the
+model. See some of there examples using the button or try to make
+your own examples to test the models in the tools below.
+*** Warning: these examples may include hateful and violent content as
+well as slurs and other offensive languages ***
+"""
+# Markdown introducing the ranking of several inputs under a single model.
+# NOTE(review): where it is rendered is outside this view - confirm
+__RANKING = """
+When models process a given input, they calculate the probability of
+that input being labeled with each of the possible labels (in binary
+cases for example, either 'hateful' or 'not hateful'). The label with
+the highest probably is returned. If we test multiple input sentences
+for a given model, we can see which input sentences have the
+highest probabilities, indicating which examples the model is most
+confident in classifying.
+Try comparing different input sentences for a given model
+using the tool below.
+"""
+# Markdown introducing the comparison of two models on the same input.
+# NOTE(review): where it is rendered is outside this view - confirm
+__COMPARISON = """
+Depending on their training data and parameters, models can return very
+different outputs for the same input. Knowing how models differ in
+their behavior can help with choosing an appropriate model for your
+given use case.
+Additionally, models trained on one kind of data can perform very
+differently when tested on novel data. To show the models' performance
+in a variety of settings, we also show the results of each model on
+four datasets: the entirety of the [HateCheck Dataset](https://github.com/paul-rottger/hatecheck-data),
+the [Dynamically Generated Hate Speech Dataset](https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset),
+the [Hateful Memes Dataset](https://huggingface.co/datasets/limjiayi/hateful_memes_expanded)
+(without the images), and a 2000-instance subsection of the
+[Open Subtitles](https://opus.nlpl.eu/OpenSubtitles-v2018.php)
+(English) Dataset. While the first three are hate speech datasets and have
+labels for whether or not each instance is hateful, the Open Subtitles dataset
+contains movie subtitles and is not labeled for instances of hate.
+Try comparing two different models' output for the same comparison
+sentence using the tool below.
+"""
+# Hub ids of the models; these match the links listed in __HATE_DETECTION
+__MODELS = [
+        "classla/roberta-base-frenk-hate",
+        "cardiffnlp/twitter-roberta-base-hate",
+        "Hate-speech-CNERG/dehatebert-mono-english",
+        "facebook/roberta-hate-speech-dynabench-r1-target",
+        "facebook/roberta-hate-speech-dynabench-r2-target",
+        "facebook/roberta-hate-speech-dynabench-r3-target",
+        "facebook/roberta-hate-speech-dynabench-r4-target"
+    ]
+# Three groups of three example sentences each.
+# NOTE(review): usage is outside this view; the name suggests they pre-fill
+# the ranking tool's inputs - confirm
+autopop_ranking = [
+    [
+        "I don't hate black people.",
+        "Black people should die.",
+        "My friend likes pancakes."
+    ],
+    [
+        "Those dirty redskins are all drunks.",
+        "I can't stand Chinese food. It's too spicy.",
+        "I lived near many Chinese people and I hated every single one of them."
+    ],
+    [
+        "Stop saying that Mexicans don't belong in our country!",
+        "We walked along the dyke down the road.",
+        "Ah shit, I fucked up."
+    ]
+]
+# Creates the forms for receiving multiple inputs to compare for a single
+# model or one input to compare for two models
+def run_article():
+    st.markdown("# Making a Hate Speech Detection Model")
+    with st.expander("Hate Speech Detection Models", expanded=False):
+        st.markdown(__HATE_DETECTION)
+    hc_path = "posts/resources/"
+    hc_pholders = json.load(
+        open(
+            hc_path + "template_placeholders.json",
+            encoding="utf-8"
+        )
+    )
+    hc_templates = json.load(
+        open(
+            hc_path + "hatecheck_category_templates.json",
+            encoding="utf-8"
+        )
+    )
+    hc_info = json.load(
+        open(
+            hc_path + "hatecheck_category_info.json",
+            encoding="utf-8"
+        )
+    )
+    hc_cats = [""] + list(hc_info.keys())
+    st.markdown("## Testing Models' Behavior")
+    with st.expander("HateCheck Examples", expanded=False):
+        st.markdown(__HATECHECK)
+        category = st.selectbox(
+                "Select a category of examples from HateCheck",
+                hc_cats,
+                key="hc_cat_select"
+            )
+        if category:
+            with st.form(key="hate_check"):
+                hc_cat = hc_info[category]
+                templates = []
+                names = []
+                for hc_temp in hc_cat:
+                    templates.append(hc_temp)
+                    names.append(hc_cat[hc_temp]["name"])
+                selected_names = st.multiselect(
+                        "Select one or more HateCheck templates to generate examples for",
+                        names,
+                        key="hc_temp_multiselect"
+                )
+                num_exs = st.number_input(
+                        "Select a number of examples to generate for each selected template",
+                        min_value = 1,
+                        max_value = 5,
+                        value = 3
+                        )
+                if st.form_submit_button(label="Generate Examples"):
+                    for name in selected_names:
+                        index = names.index(name)
+                        template = templates[index]
+                        examples = generate_hc_ex(
+                            hc_templates[template],
+                            hc_pholders,
+                            num_exs
+                        )
+                        st.header(name)
+                        st.subheader("Label: " + hc_cat[template]["value"])
+                        st.caption(hc_cat[template]["desc"])
+                        for num in range(num_exs):
+                            ex = examples[num]
+                            st.write("Example #" + str(num + 1) + ": " + ex)
+    st.markdown("## Model Output Ranking")
+    with st.expander("Model Output Ranking Tool", expanded=False):
+        st.markdown(__RANKING)
+        with st.form(key="ranking"):
+            model_name = st.selectbox(
+                "Select a model to test",
+                __MODELS,
+            )
+            # the placeholder key functionality was added in v1.2 of streamlit
+            # and versions on Spaces currently goes up to v1.0
+            input_1 = st.text_input(
+                "Input 1",
+                help="Try a phrase like 'We shouldn't let [IDENTITY] suffer.'",
+                # placeholder="We shouldn't let [IDENTITY] suffer."
+            )
+            input_2 = st.text_input(
+                "Input 2",
+                help="Try a phrase like 'I'd rather die than date [IDENTITY].'",
+                # placeholder="I'd rather die than date [IDENTITY]."
+            )
+            input_3 = st.text_input(
+                "Input 3",
+                help="Try a phrase like 'Good morning'",
+                #placeholder="Good morning."
+            )
+            autopop = st.checkbox(
+                'Choose examples for me',
+                key="rank_autopop_ckbx",
+                help="Check this box to run the model with 3 preselected sentences."
+            )
+            if st.form_submit_button(label="Rank inputs"):
+                if autopop:
+                    rank_inputs = random.choice(autopop_ranking)
+                else:
+                    rank_inputs = [input_1, input_2, input_3]
+                sys.stderr.write("\n" + str(rank_inputs) + "\n")
+                results = run_ranked(model_name, rank_inputs)
+                st.dataframe(results)
+    st.markdown("## Model Comparison")
+    with st.expander("Model Comparison Tool", expanded=False):
+        st.markdown(__COMPARISON)
+        with st.form(key="comparison"):
+            model_name_1 = st.selectbox(
+                "Select a model to compare",
+                __MODELS,
+                key="compare_model_1",
+            )
+            model_name_2 = st.selectbox(
+                "Select another model to compare",
+                __MODELS,
+                key="compare_model_2",
+            )
+            autopop = st.checkbox(
+                'Choose an example for me',
+                key="comp_autopop_ckbx",
+                help="Check this box to compare the models with a preselected sentence."
+            )
+            input_text = st.text_input("Comparison input")
+            if st.form_submit_button(label="Compare models"):
+                if autopop:
+                    input_text = random.choice(random.choice(autopop_ranking))
+                results = run_compare(model_name_1, model_name_2, input_text)
+                st.write("### Showing results for: " + input_text)
+                st.dataframe(results)
+                outside_ds = [
+                              "hatecheck",
+                              "dynabench",
+                              "hatefulmemes",
+                              "opensubtitles"
+                       ]
+                name_1_short = model_name_1.split("/")[1]
+                name_2_short = model_name_2.split("/")[1]
+                for calib_ds in outside_ds:
+                    ds_loc = "posts/resources/charts/" + calib_ds + "/"
+                    images, captions = [], []
+                    for model in [name_1_short, name_2_short]:
+                        images.append(ds_loc + model + "_" + calib_ds + ".png")
+                        captions.append("Counts of dataset instances by hate score.")
+                    st.write("#### Model performance comparison on " + calib_ds)
+                    st.image(images, captions)
+#                if model_name_1 == "Hate-speech-CNERG/dehatebert-mono-english":
+#                    st.image("posts/resources/dehatebert-mono-english_calibration.png")
+#                elif model_name_1 == "cardiffnlp/twitter-roberta-base-hate":
+#                    st.image("posts/resources/twitter-roberta-base-hate_calibration.png")
+#                st.write("Calibration of Model 2")
+#                if model_name_2 == "Hate-speech-CNERG/dehatebert-mono-english":
+#                    st.image("posts/resources/dehatebert-mono-english_calibration.png")
+#                elif model_name_2 == "cardiffnlp/twitter-roberta-base-hate":
+#                    st.image("posts/resources/twitter-roberta-base-hate_calibration.png")
+# Takes in a Hate Check template and placeholders and generates the given
+# number of random examples from the template, inserting a random instance of
+# an identity category if there is a placeholder in the template
+def generate_hc_ex(template, placeholders, gen_num):
+    sampled = random.sample(template, gen_num)
+    ph_cats = list(placeholders.keys())
+    for index in range(len(sampled)):
+        sample = sampled[index]
+        for ph_cat in ph_cats:
+            if ph_cat in sample:
+                insert = random.choice(placeholders[ph_cat])
+                sampled[index] = sample.replace(ph_cat, insert).capitalize()
+    return sampled
+# Runs the received input strings through the given model and returns the
+# all scores for all possible labels as a DataFrame
+def run_ranked(model, input_list):
+    classifier = pipeline(
+        "text-classification",
+        model=model,
+        return_all_scores=True
+    )
+    output = {}
+    results = classifier(input_list)
+    for result in results:
+        for index in range(len(result)):
+            label = result[index]["label"]
+            score = result[index]["score"]
+            if label in output:
+                output[label].append(score)
+            else:
+                new_out = [score]
+                output[label] = new_out
+    return pd.DataFrame(output, index=input_list)
+# Takes in two model names and returns the output of both models for that
+# given input string
+def run_compare(name_1, name_2, text):
+    classifier_1 = pipeline("text-classification", model=name_1)
+    result_1 = classifier_1(text)
+    out_1 = {}
+    out_1["Model"] = name_1
+    out_1["Label"] = result_1[0]["label"]
+    out_1["Score"] = result_1[0]["score"]
+    classifier_2 = pipeline("text-classification", model=name_2)
+    result_2 = classifier_2(text)
+    out_2 = {}
+    out_2["Model"] = name_2
+    out_2["Label"] = result_2[0]["label"]
+    out_2["Score"] = result_2[0]["score"]
+    return [out_1, out_2]

posts/welcome.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import streamlit as st
+title = "Welcome Page"
+description = "Introduction"
+date = "2022-01-26"
+thumbnail = "images/waving_hand.png"
+__INTRO_TEXT = """
+Welcome to the Task Exploration Activity for hate speech detection!
+In this series of modules, you'll learn about the history of hate speech detection as a task in
+the larger pipeline of automatic content moderation (ACM).
+You'll also be able to interact with and compare datasets and models built for this task.
+The goal of this exploration is to share the design considerations and challenges faced when using algorithms to detect hate speech.
+"""
+__DEF_HATE_SPEECH = """
+Hate speech is hard to define, with definitions shifting across time and location.
+In 2019, the United Nations defined hate speech as "any kind of communication in speech,
+writing or behaviour, that attacks or uses pejorative or discriminatory language with
+reference to a person or a group on the basis of who they are, in other words, based on their religion,
+ethnicity, nationality, race, colour, descent, gender or other identity factor."
+"""
+__DEF_CONTENT = """
+Different platforms have different guidelines about what
+content is sanctioned on the platform. For example, many US-based platforms prohibit posting threats of violence,
+nudity, and hate speech. We discuss hate speech below.
+"""
+__CONTENT_WARNING = """
+These modules contain examples of hateful, abusive, and offensive language that have be collected in datasets and
+reproduced by models. These examples are meant to illustrate the variety of content that may be subject to
+moderation.
+"""
+__DATASET_LIST = """
+- [FRENK hate speech dataset](https://huggingface.co/datasets/classla/FRENK-hate-en)
+- [Twitter Hate Speech dataset](https://huggingface.co/datasets/tweets_hate_speech_detection)
+- [UC Berkley Measuring Hate Speech](https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech)
+- [Dynamically Generated Hate Speech Dataset](https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset)
+- [HateCheck](https://github.com/paul-rottger/hatecheck-data)
+- [Hateful Memes Dataset](https://huggingface.co/datasets/limjiayi/hateful_memes_expanded)
+- [Open Subtitles English Dataset](https://opus.nlpl.eu/OpenSubtitles-v2018.php)
+"""
+__MODEL_LIST = """
+- [RoBERTa trained on FRENK dataset](https://huggingface.co/classla/roberta-base-frenk-hate)
+- [RoBERTa trained on Twitter Hate Speech](https://huggingface.co/cardiffnlp/twitter-roberta-base-hate)
+- [DeHateBERT model (trained on Twitter and StormFront)](https://huggingface.co/Hate-speech-CNERG/dehatebert-mono-english)
+- [RoBERTa trained on 11 English hate speech datasets](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r1-target)
+- [RoBERTa trained on 11 English hate speech datasets and Round 1 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r2-target)
+- [RoBERTa trained on 11 English hate speech datasets and Rounds 1 and 2 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r3-target)
+- [RoBERTa trained on 11 English hate speech datasets and Rounds 1, 2, and 3 of the Dynamically Generated Hate Speech Dataset](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target)
+"""
+def run_article():
+    st.markdown("# Welcome!")
+    st.markdown(__INTRO_TEXT)
+    st.markdown("### What is hate speech?")
+    st.markdown(__DEF_HATE_SPEECH)
+    st.markdown("### What kind of content is subject to moderation?")
+    st.markdown(__DEF_CONTENT)
+    st.markdown("### Content Warning")
+    st.markdown(__CONTENT_WARNING)
+    st.markdown("---\n\n## Featured datasets and models")
+    col_1, col_2, _ = st.columns(3)
+    with col_1:
+        st.markdown("### Datasets")
+        st.markdown(__DATASET_LIST, unsafe_allow_html=True)
+    with col_2:
+        st.markdown("### Models")
+        st.markdown(__MODEL_LIST, unsafe_allow_html=True)