yourusername committed on
Commit 9b51db9
1 Parent(s): b9430ed

:tada: init

data_measurements/__init__.py ADDED
File without changes
data_measurements/dataset_statistics.py ADDED
@@ -0,0 +1,980 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import logging
17
+ import statistics
18
+ from os import mkdir
19
+ from os.path import exists, isdir
20
+ from os.path import join as pjoin
21
+
22
+ import nltk
23
+ import numpy as np
24
+ import pandas as pd
25
+ import plotly.express as px
26
+ import plotly.figure_factory as ff
27
+ import plotly.graph_objects as go
28
+ import pyarrow.feather as feather
29
+ from datasets import load_from_disk
30
+ from nltk.corpus import stopwords
31
+ from sklearn.feature_extraction.text import CountVectorizer
32
+
33
+ from .dataset_utils import (
34
+ CNT,
35
+ DEDUP_TOT,
36
+ EMBEDDING_FIELD,
37
+ LENGTH_FIELD,
38
+ OUR_LABEL_FIELD,
39
+ OUR_TEXT_FIELD,
40
+ PROP,
41
+ TEXT_NAN_CNT,
42
+ TOKENIZED_FIELD,
43
+ TXT_LEN,
44
+ VOCAB,
45
+ WORD,
46
+ extract_field,
47
+ load_truncated_dataset,
48
+ )
49
+ from .embeddings import Embeddings
50
+ from .npmi import nPMI
51
+ from .zipf import Zipf
52
+
53
+ pd.options.display.float_format = "{:,.3f}".format
54
+
55
+ logs = logging.getLogger(__name__)
56
+ logs.setLevel(logging.WARNING)
57
+ logs.propagate = False
58
+
59
+ if not logs.handlers:
60
+
61
+ # Logging info to log file
62
+ file = logging.FileHandler("./log_files/dataset_statistics.log")
63
+ fileformat = logging.Formatter("%(asctime)s:%(message)s")
64
+ file.setLevel(logging.INFO)
65
+ file.setFormatter(fileformat)
66
+
67
+ # Logging debug messages to stream
68
+ stream = logging.StreamHandler()
69
+ streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
70
+ stream.setLevel(logging.WARNING)
71
+ stream.setFormatter(streamformat)
72
+
73
+ logs.addHandler(file)
74
+ logs.addHandler(stream)
75
+
76
+
77
+ # TODO: Read this in depending on chosen language / expand beyond english
78
+ nltk.download("stopwords")
79
+ _CLOSED_CLASS = (
80
+ stopwords.words("english")
81
+ + [
82
+ "t",
83
+ "n",
84
+ "ll",
85
+ "d",
86
+ "wasn",
87
+ "weren",
88
+ "won",
89
+ "aren",
90
+ "wouldn",
91
+ "shouldn",
92
+ "didn",
93
+ "don",
94
+ "hasn",
95
+ "ain",
96
+ "couldn",
97
+ "doesn",
98
+ "hadn",
99
+ "haven",
100
+ "isn",
101
+ "mightn",
102
+ "mustn",
103
+ "needn",
104
+ "shan",
105
+ "would",
106
+ "could",
107
+ "dont",
108
+ "u",
109
+ ]
110
+ + [str(i) for i in range(0, 21)]
111
+ )
112
+ _IDENTITY_TERMS = [
113
+ "man",
114
+ "woman",
115
+ "non-binary",
116
+ "gay",
117
+ "lesbian",
118
+ "queer",
119
+ "trans",
120
+ "straight",
121
+ "cis",
122
+ "she",
123
+ "her",
124
+ "hers",
125
+ "he",
126
+ "him",
127
+ "his",
128
+ "they",
129
+ "them",
130
+ "their",
131
+ "theirs",
132
+ "himself",
133
+ "herself",
134
+ ]
135
+ # treating inf values as NaN as well
136
+ pd.set_option("use_inf_as_na", True)
137
+
138
+ _MIN_VOCAB_COUNT = 10
139
+ _TREE_DEPTH = 12
140
+ _TREE_MIN_NODES = 250
141
+ # as long as we're using sklearn - already pushing the resources
142
+ _MAX_CLUSTER_EXAMPLES = 5000
143
+ _NUM_VOCAB_BATCHES = 2000
144
+
145
+
146
+ _CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
147
+
148
+ num_rows = 200000
149
+
150
+
151
+ class DatasetStatisticsCacheClass:
152
+ def __init__(
153
+ self,
154
+ cache_dir,
155
+ dset_name,
156
+ dset_config,
157
+ split_name,
158
+ text_field,
159
+ label_field,
160
+ label_names,
161
+ calculation=None,
162
+ ):
163
+ # This is only used for standalone runs for each kind of measurement.
164
+ self.calculation = calculation
165
+ self.our_text_field = OUR_TEXT_FIELD
166
+ self.our_length_field = LENGTH_FIELD
167
+ self.our_label_field = OUR_LABEL_FIELD
168
+ self.our_tokenized_field = TOKENIZED_FIELD
169
+ self.our_embedding_field = EMBEDDING_FIELD
170
+ self.cache_dir = cache_dir
171
+ ### What are we analyzing?
172
+ # name of the Hugging Face dataset
173
+ self.dset_name = dset_name
174
+ # name of the dataset config
175
+ self.dset_config = dset_config
176
+ # name of the split to analyze
177
+ self.split_name = split_name
178
+ # which text fields are we analysing?
179
+ self.text_field = text_field
180
+ # which label fields are we analysing?
181
+ self.label_field = label_field
182
+ # what are the names of the classes?
183
+ self.label_names = label_names
184
+ ## Hugging Face dataset objects
185
+ self.dset = None # original dataset
186
+ # HF dataset with all of the self.text_field instances in self.dset
187
+ self.text_dset = None
188
+ # HF dataset with text embeddings in the same order as self.text_dset
189
+ self.embeddings_dset = None
190
+ # HF dataset with all of the self.label_field instances in self.dset
191
+ self.label_dset = None
192
+ ## Data frames
193
+ # Tokenized text
194
+ self.tokenized_df = []
195
+ # save sentence length histogram in the class so it doesn't get re-computed
196
+ self.fig_tok_length = None
197
+ # Data Frame version of self.label_dset
198
+ self.label_df = None
199
+ # save label pie chart in the class so it doesn't get re-computed
200
+ self.fig_labels = None
201
+ # Vocabulary with word counts in the dataset
202
+ self.vocab_counts_df = None
203
+ # Vocabulary filtered to remove stopwords
204
+ self.vocab_counts_filtered_df = None
205
+ ## General statistics and duplicates
206
+ # Number of NaN values (NOT empty strings)
207
+ self.text_nan_count = 0
208
+ # Number of text items that appear more than once in the dataset
209
+ self.dedup_total = 0
210
+ # Duplicated text items along with their number of occurrences ("count")
211
+ self.text_dup_counts_df = None
212
+ self.avg_length = None
213
+ self.std_length = None
214
+ self.general_stats_dict = None
215
+ # clustering text by embeddings
216
+ # the hierarchical clustering tree is represented as a list of nodes,
217
+ # the first is the root
218
+ self.node_list = []
219
+ # save tree figure in the class so it doesn't get re-computed
220
+ self.fig_tree = None
221
+ # keep Embeddings object around to explore clusters
222
+ self.embeddings = None
223
+ # nPMI
224
+ # Holds a nPMIStatisticsCacheClass object
225
+ self.npmi_stats = None
226
+ # TODO: Users ideally can type in whatever words they want.
227
+ self.termlist = _IDENTITY_TERMS
228
+ # termlist terms that occur at least _MIN_VOCAB_COUNT times
229
+ self.available_terms = _IDENTITY_TERMS
230
+ # TODO: Have lowercase be an option for a user to set.
231
+ self.to_lowercase = True
232
+ # The minimum number of times a word must occur to be included in
233
+ # word-count-based calculations (currently just relevant to nPMI)
234
+ self.min_vocab_count = _MIN_VOCAB_COUNT
235
+ # zipf
236
+ self.z = None
237
+ self.zipf_fig = None
238
+ self.cvec = _CVEC
239
+ # File definitions
240
+ # path to the directory used for caching
241
+ if not isinstance(text_field, str):
242
+ text_field = "-".join(text_field)
243
+ if isinstance(label_field, str):
244
+ label_field = label_field
245
+ else:
246
+ label_field = "-".join(label_field)
247
+ self.cache_path = pjoin(
248
+ self.cache_dir,
249
+ f"{dset_name}_{dset_config}_{split_name}_{text_field}_{label_field}",
250
+ )
251
+ if not isdir(self.cache_path):
252
+ logs.warning("Creating cache directory %s." % self.cache_path)
253
+ mkdir(self.cache_path)
254
+ self.dset_fid = pjoin(self.cache_path, "base_dset")
255
+ self.text_dset_fid = pjoin(self.cache_path, "text_dset")
256
+ self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
257
+ self.label_dset_fid = pjoin(self.cache_path, "label_dset")
258
+ self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
259
+ self.general_stats_fid = pjoin(self.cache_path, "general_stats.json")
260
+ self.text_duplicate_counts_df_fid = pjoin(
261
+ self.cache_path, "text_dup_counts_df.feather"
262
+ )
263
+ self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
264
+
265
+ def get_base_dataset(self):
266
+ """Gets a pointer to the truncated base dataset object."""
267
+ if not self.dset:
268
+ self.dset = load_truncated_dataset(
269
+ self.dset_name,
270
+ self.dset_config,
271
+ self.split_name,
272
+ cache_name=self.dset_fid,
273
+ use_cache=True,
274
+ use_streaming=True,
275
+ )
276
+
277
+ def get_dataset_peek(self):
278
+ self.get_base_dataset()
279
+ return self.dset[:100]
280
+
281
+ def load_or_prepare_general_stats(self, use_cache=False):
282
+ """Data structures used in calculating general statistics and duplicates"""
283
+
284
+ # TODO: These probably don't need to be feather files, could be csv.
285
+ # General statistics
286
+ if (
287
+ use_cache
288
+ and exists(self.general_stats_fid)
289
+ and exists(self.text_duplicate_counts_df_fid)
290
+ ):
291
+ self.load_general_stats(
292
+ self.general_stats_fid, self.text_duplicate_counts_df_fid
293
+ )
294
+ else:
295
+ (
296
+ self.text_nan_count,
297
+ self.dedup_total,
298
+ self.text_dup_counts_df,
299
+ ) = self.prepare_general_text_stats()
300
+ self.general_stats_dict = {
301
+ TEXT_NAN_CNT: self.text_nan_count,
302
+ DEDUP_TOT: self.dedup_total,
303
+ }
304
+ write_df(self.text_dup_counts_df, self.text_duplicate_counts_df_fid)
305
+ write_json(self.general_stats_dict, self.general_stats_fid)
306
+
307
+ def load_or_prepare_text_lengths(self, use_cache=False):
308
+ if len(self.tokenized_df) == 0:
309
+ self.tokenized_df = self.do_tokenization()
310
+ self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(len)
311
+ self.avg_length = round(
312
+ sum(self.tokenized_df[self.our_length_field])
313
+ / len(self.tokenized_df[self.our_length_field]),
314
+ 1,
315
+ )
316
+ self.std_length = round(
317
+ statistics.stdev(self.tokenized_df[self.our_length_field]), 1
318
+ )
319
+ self.fig_tok_length = make_fig_lengths(self.tokenized_df, self.our_length_field)
320
+
321
+ def load_or_prepare_embeddings(self, use_cache=False):
322
+ self.embeddings = Embeddings(self, use_cache=use_cache)
323
+ self.embeddings.make_hierarchical_clustering()
324
+ self.fig_tree = self.embeddings.fig_tree
325
+ self.node_list = self.embeddings.node_list
326
+
327
+ # get vocab with word counts
328
+ def load_or_prepare_vocab(self, use_cache=True, save=True):
329
+ """
330
+ Calculates the vocabulary count from the tokenized text.
331
+ The resulting dataframes may be used in nPMI calculations, zipf, etc.
332
+ :param use_cache:
333
+ :return:
334
+ """
335
+ if (
336
+ use_cache
337
+ and exists(self.vocab_counts_df_fid)
338
+ ):
339
+ logs.info("Reading vocab from cache")
340
+ self.load_vocab()
341
+ self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
342
+ else:
343
+ logs.info("Calculating vocab afresh")
344
+ if len(self.tokenized_df) == 0:
345
+ self.tokenized_df = self.do_tokenization()
346
+ if save:
347
+ logs.info("Writing out.")
348
+ write_df(self.tokenized_df, self.tokenized_df_fid)
349
+ word_count_df = count_vocab_frequencies(self.tokenized_df)
350
+ logs.info("Making dfs with proportion.")
351
+ self.vocab_counts_df = calc_p_word(word_count_df)
352
+ self.vocab_counts_filtered_df = filter_vocab(self.vocab_counts_df)
353
+ if save:
354
+ logs.info("Writing out.")
355
+ write_df(self.vocab_counts_df, self.vocab_counts_df_fid)
356
+ logs.info("unfiltered vocab")
357
+ logs.info(self.vocab_counts_df)
358
+ logs.info("filtered vocab")
359
+ logs.info(self.vocab_counts_filtered_df)
360
+
361
+ def load_or_prepare_npmi_terms(self, use_cache=False):
362
+ self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=use_cache)
363
+ self.npmi_stats.load_or_prepare_npmi_terms()
364
+
365
+ def load_or_prepare_zipf(self, use_cache=False):
366
+ if use_cache and exists(self.zipf_fid):
367
+ # TODO: Read zipf data so that the vocab is there.
368
+ with open(self.zipf_fid, "r") as f:
369
+ zipf_dict = json.load(f)
370
+ self.z = Zipf()
371
+ self.z.load(zipf_dict)
372
+ else:
373
+ self.z = Zipf(self.vocab_counts_df)
374
+ write_zipf_data(self.z, self.zipf_fid)
375
+ self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
376
+
377
+ def prepare_general_text_stats(self):
378
+ text_nan_count = int(self.tokenized_df.isnull().sum().sum())
379
+ dup_df = self.tokenized_df[self.tokenized_df.duplicated([self.our_text_field])]
380
+ dedup_df = pd.DataFrame(
381
+ dup_df.pivot_table(
382
+ columns=[self.our_text_field], aggfunc="size"
383
+ ).sort_values(ascending=False),
384
+ columns=[CNT],
385
+ )
386
+ dedup_df.index = dedup_df.index.map(str)
387
+ dedup_df[OUR_TEXT_FIELD] = dedup_df.index
388
+ dedup_total = sum(dedup_df[CNT])
389
+ return text_nan_count, dedup_total, dedup_df
390
+
391
+ def load_general_stats(self, general_stats_fid, text_duplicate_counts_df_fid):
392
+ general_stats = json.load(open(general_stats_fid, encoding="utf-8"))
393
+ self.text_nan_count = general_stats[TEXT_NAN_CNT]
394
+ self.dedup_total = general_stats[DEDUP_TOT]
395
+ with open(text_duplicate_counts_df_fid, "rb") as f:
396
+ self.text_dup_counts_df = feather.read_feather(f)
397
+
398
+ def load_or_prepare_dataset(self, use_cache=True, use_df=False, save=True):
399
+ """
400
+ Prepares the HF datasets and data frames containing the untokenized and tokenized
401
+ text as well as the label values. If not loaded from cache, the prepared datasets are also written to disk (when save=True).
402
+ :param use_cache:
403
+ :param use_df: Whether to use stored dataframes rather than dset files
404
+ :return:
405
+ """
406
+ ## Raw text first, then tokenization.
407
+ # Use what has been previously stored in DataFrame form or Dataset form.
408
+ if (
409
+ use_cache
410
+ and use_df
411
+ and exists(self.tokenized_df_fid)
412
+ ):
413
+ self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
414
+ elif (
415
+ use_cache and exists(self.text_dset_fid)):
416
+ # load extracted text
417
+ self.text_dset = load_from_disk(self.text_dset_fid)
418
+ logs.warning("Loaded dataset from disk")
419
+ logs.info(self.text_dset)
420
+ # ...Or load it from the server and store it anew
421
+ else:
422
+ self.get_base_dataset()
423
+ # extract all text instances
424
+ self.text_dset = self.dset.map(
425
+ lambda examples: extract_field(
426
+ examples, self.text_field, OUR_TEXT_FIELD
427
+ ),
428
+ batched=True,
429
+ remove_columns=list(self.dset.features),
430
+ )
431
+ if save:
432
+ # save extracted text instances
433
+ logs.warning("Saving dataset to disk")
434
+ self.text_dset.save_to_disk(self.text_dset_fid)
435
+ # tokenize all text instances
436
+ self.tokenized_df = self.do_tokenization()
437
+ if save:
438
+ # save tokenized text
439
+ write_df(self.tokenized_df, self.tokenized_df_fid)
440
+
441
+ def do_tokenization(self):
442
+ """
443
+ Tokenizes the dataset
444
+ :return:
445
+ """
446
+ sent_tokenizer = self.cvec.build_tokenizer()
447
+
448
+ def tokenize_batch(examples):
449
+ # TODO: lowercase should be an option
450
+ res = {
451
+ TOKENIZED_FIELD: [
452
+ tuple(sent_tokenizer(text.lower()))
453
+ for text in examples[OUR_TEXT_FIELD]
454
+ ]
455
+ }
456
+ res[LENGTH_FIELD] = [len(tok_text) for tok_text in res[TOKENIZED_FIELD]]
457
+ return res
458
+
459
+ tokenized_dset = self.text_dset.map(
460
+ tokenize_batch,
461
+ batched=True,
462
+ # remove_columns=[OUR_TEXT_FIELD], keep around to print
463
+ )
464
+ tokenized_df = pd.DataFrame(tokenized_dset)
465
+ return tokenized_df
466
+
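For reference, a minimal standalone sketch (assuming scikit-learn is installed) of what the regexp tokenizer built above returns; lowercasing is applied separately via text.lower() inside tokenize_batch:

from sklearn.feature_extraction.text import CountVectorizer

cvec = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)
tokenize = cvec.build_tokenizer()
# build_tokenizer only applies the token_pattern regex; punctuation is dropped
print(tokenize("The quick brown fox, the lazy dog."))
# ['The', 'quick', 'brown', 'fox', 'the', 'lazy', 'dog']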
467
+ def set_label_field(self, label_field="label"):
468
+ """
469
+ Setter for label_field. Used in the CLI when a user asks for information
470
+ about labels, but does not specify the field;
471
+ 'label' is assumed as a default.
472
+ """
473
+ self.label_field = label_field
474
+
475
+ def load_or_prepare_labels(self, use_cache=False, save=True):
476
+ """
477
+ Extracts labels from the Dataset
478
+ :param use_cache:
479
+ :return:
480
+ """
481
+ # extracted labels
482
+ if len(self.label_field) > 0:
483
+ if use_cache and exists(self.label_dset_fid):
484
+ # load extracted labels
485
+ self.label_dset = load_from_disk(self.label_dset_fid)
486
+ else:
487
+ self.get_base_dataset()
488
+ self.label_dset = self.dset.map(
489
+ lambda examples: extract_field(
490
+ examples, self.label_field, OUR_LABEL_FIELD
491
+ ),
492
+ batched=True,
493
+ remove_columns=list(self.dset.features),
494
+ )
495
+ if save:
496
+ # save extracted label instances
497
+ self.label_dset.save_to_disk(self.label_dset_fid)
498
+ self.label_df = self.label_dset.to_pandas()
499
+
500
+ self.fig_labels = make_fig_labels(
501
+ self.label_df, self.label_names, OUR_LABEL_FIELD
502
+ )
503
+
504
+ def load_vocab(self):
505
+ with open(self.vocab_counts_df_fid, "rb") as f:
506
+ self.vocab_counts_df = feather.read_feather(f)
507
+ # Handling for changes in how the index is saved.
508
+ self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)
509
+
510
+ def _set_idx_col_names(self, input_vocab_df):
511
+ if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:
512
+ input_vocab_df = input_vocab_df.set_index([VOCAB])
513
+ input_vocab_df[VOCAB] = input_vocab_df.index
514
+ return input_vocab_df
515
+
516
+
517
+ class nPMIStatisticsCacheClass:
518
+ """ "Class to interface between the app and the nPMI class
519
+ by calling the nPMI class with the user's selections."""
520
+
521
+ def __init__(self, dataset_stats, use_cache=False):
522
+ self.dstats = dataset_stats
523
+ self.pmi_cache_path = pjoin(self.dstats.cache_path, "pmi_files")
524
+ if not isdir(self.pmi_cache_path):
525
+ logs.warning("Creating pmi cache directory %s." % self.pmi_cache_path)
526
+ # We need to preprocess everything.
527
+ mkdir(self.pmi_cache_path)
528
+ self.joint_npmi_df_dict = {}
529
+ self.termlist = self.dstats.termlist
530
+ logs.info(self.termlist)
531
+ self.use_cache = use_cache
532
+ # TODO: Let users specify
533
+ self.open_class_only = True
534
+ self.min_vocab_count = self.dstats.min_vocab_count
535
+ self.subgroup_files = {}
536
+ self.npmi_terms_fid = pjoin(self.dstats.cache_path, "npmi_terms.json")
537
+ self.available_terms = self.dstats.available_terms
538
+ logs.info(self.available_terms)
539
+
540
+ def load_or_prepare_npmi_terms(self, use_cache=False):
541
+ """
542
+ Figures out what identity terms the user can select, based on whether
543
+ they occur at least self.min_vocab_count times
544
+ :param use_cache:
545
+ :return: Identity terms occurring at least self.min_vocab_count times.
546
+ """
547
+ # TODO: Add the user's ability to select subgroups.
548
+ # TODO: Make min_vocab_count here value selectable by the user.
549
+ if (
550
+ use_cache
551
+ and exists(self.npmi_terms_fid)
552
+ and json.load(open(self.npmi_terms_fid))["available terms"] != []
553
+ ):
554
+ available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
555
+ else:
556
+ true_false = [
557
+ term in self.dstats.vocab_counts_df.index for term in self.termlist
558
+ ]
559
+ word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
560
+ true_false_counts = [
561
+ self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
562
+ for word in word_list_tmp
563
+ ]
564
+ available_terms = [
565
+ word for word, y in zip(word_list_tmp, true_false_counts) if y
566
+ ]
567
+ logs.info(available_terms)
568
+ with open(self.npmi_terms_fid, "w+") as f:
569
+ json.dump({"available terms": available_terms}, f)
570
+ self.available_terms = available_terms
571
+ return available_terms
572
+
573
+ def load_or_prepare_joint_npmi(self, subgroup_pair, use_cache=True):
574
+ """
575
+ Run on the fly, while the app is already open,
576
+ as it depends on the subgroup terms that the user chooses
577
+ :param subgroup_pair:
578
+ :return:
579
+ """
580
+ # Canonical ordering for subgroup_list
581
+ subgroup_pair = sorted(subgroup_pair)
582
+ subgroups_str = "-".join(subgroup_pair)
583
+ if not isdir(self.pmi_cache_path):
584
+ logs.warning("Creating cache")
585
+ # We need to preprocess everything.
586
+ # This should eventually all go into a prepare_dataset CLI
587
+ mkdir(self.pmi_cache_path)
588
+ joint_npmi_fid = pjoin(self.pmi_cache_path, subgroups_str + "_npmi.csv")
589
+ subgroup_files = define_subgroup_files(subgroup_pair, self.pmi_cache_path)
590
+ # Defines the filenames for the cache files from the selected subgroups.
591
+ # Get as much precomputed data as we can.
592
+ if use_cache and exists(joint_npmi_fid):
593
+ # When everything is already computed for the selected subgroups.
594
+ logs.info("Loading cached joint npmi")
595
+ joint_npmi_df = self.load_joint_npmi_df(joint_npmi_fid)
596
+ # When maybe some things have been computed for the selected subgroups.
597
+ else:
598
+ logs.info("Preparing new joint npmi")
599
+ joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
600
+ subgroup_pair, subgroup_files
601
+ )
602
+ # Cache new results
603
+ logs.info("Writing out.")
604
+ for subgroup in subgroup_pair:
605
+ write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
606
+ with open(joint_npmi_fid, "w+") as f:
607
+ joint_npmi_df.to_csv(f)
608
+ logs.info("The joint npmi df is")
609
+ logs.info(joint_npmi_df)
610
+ return joint_npmi_df
611
+
612
+ def load_joint_npmi_df(self, joint_npmi_fid):
613
+ """
614
+ Reads in a saved dataframe with all of the paired results.
615
+ :param joint_npmi_fid:
616
+ :return: paired results
617
+ """
618
+ with open(joint_npmi_fid, "rb") as f:
619
+ joint_npmi_df = pd.read_csv(f)
620
+ joint_npmi_df = self._set_idx_cols_from_cache(joint_npmi_df)
621
+ return joint_npmi_df.dropna()
622
+
623
+ def prepare_joint_npmi_df(self, subgroup_pair, subgroup_files):
624
+ """
625
+ Computes the npmi bias based on the given subgroups.
626
+ Handles cases where some of the selected subgroups have cached nPMI
627
+ computations, but others don't, computing everything afresh if there
628
+ are no cached files.
629
+ :param subgroup_pair:
630
+ :return: Dataframe with nPMI for the words, nPMI bias between the words.
631
+ """
632
+ subgroup_dict = {}
633
+ # When npmi is computed for some (but not all) of subgroup_list
634
+ for subgroup in subgroup_pair:
635
+ logs.info("Load or failing...")
636
+ # When subgroup npmi has been computed in a prior session.
637
+ cached_results = self.load_or_fail_cached_npmi_scores(
638
+ subgroup, subgroup_files[subgroup]
639
+ )
640
+ # If the cached results were found (i.e., not False), use them.
641
+ if cached_results:
642
+ # FYI: subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = cached_results
643
+ # Holds the previous sessions' data for use in this session.
644
+ subgroup_dict[subgroup] = cached_results
645
+ logs.info("Calculating for subgroup list")
646
+ joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
647
+ return joint_npmi_df.dropna(), subgroup_dict
648
+
649
+ # TODO: Update pairwise assumption
650
+ def do_npmi(self, subgroup_pair, subgroup_dict):
651
+ """
652
+ Calculates nPMI for given identity terms and the nPMI bias between.
653
+ :param subgroup_pair: List of identity terms to calculate the bias for
654
+ :return: Subset of data for the UI
655
+ :return: Selected identity term's co-occurrence counts with
656
+ other words, pmi per word, and nPMI per word.
657
+ """
658
+ logs.info("Initializing npmi class")
659
+ npmi_obj = self.set_npmi_obj()
660
+ # Canonical ordering used
661
+ subgroup_pair = tuple(sorted(subgroup_pair))
662
+ # Calculating nPMI statistics
663
+ for subgroup in subgroup_pair:
664
+ # If the subgroup data is already computed, grab it.
665
+ # TODO: Should we set idx and column names similarly to how we set them for cached files?
666
+ if subgroup not in subgroup_dict:
667
+ logs.info("Calculating statistics for %s" % subgroup)
668
+ vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
669
+ # Store the nPMI information for the current subgroups
670
+ subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
671
+ # Pair the subgroups together, indexed by all words that
672
+ # co-occur between them.
673
+ logs.info("Computing pairwise npmi bias")
674
+ paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
675
+ UI_results = make_npmi_fig(paired_results, subgroup_pair)
676
+ return UI_results, subgroup_dict
677
+
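The metric itself is computed in the separate .npmi module, which is not part of this diff; as a rough reference only, the standard PMI / normalized PMI definitions it is assumed to follow look like this sketch:

import math

def npmi_sketch(p_word, p_subgroup, p_joint):
    # Illustrative only -- not the nPMI class's actual implementation.
    pmi = math.log(p_joint / (p_word * p_subgroup))
    # Normalizing by -log p(word, subgroup) bounds the score to [-1, 1].
    return pmi / -math.log(p_joint)

# The "npmi-bias" used below is then the per-word difference between two
# subgroups' nPMI scores, e.g. npmi(w, subgroup1) - npmi(w, subgroup2).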
678
+ def set_npmi_obj(self):
679
+ """
680
+ Initializes the nPMI class with the given words and tokenized sentences.
681
+ :return:
682
+ """
683
+ npmi_obj = nPMI(self.dstats.vocab_counts_df, self.dstats.tokenized_df)
684
+ return npmi_obj
685
+
686
+ def load_or_fail_cached_npmi_scores(self, subgroup, subgroup_fids):
687
+ """
688
+ Reads cached scores from the specified subgroup files
689
+ :param subgroup: string of the selected identity term
690
+ :return:
691
+ """
692
+ # TODO: Ordering of npmi, pmi, vocab triple should be consistent
693
+ subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
694
+ if (
695
+ exists(subgroup_npmi_fid)
696
+ and exists(subgroup_pmi_fid)
697
+ and exists(subgroup_cooc_fid)
698
+ ):
699
+ logs.info("Reading in pmi data....")
700
+ with open(subgroup_cooc_fid, "rb") as f:
701
+ subgroup_cooc_df = pd.read_csv(f)
702
+ logs.info("pmi")
703
+ with open(subgroup_pmi_fid, "rb") as f:
704
+ subgroup_pmi_df = pd.read_csv(f)
705
+ logs.info("npmi")
706
+ with open(subgroup_npmi_fid, "rb") as f:
707
+ subgroup_npmi_df = pd.read_csv(f)
708
+ subgroup_cooc_df = self._set_idx_cols_from_cache(
709
+ subgroup_cooc_df, subgroup, "count"
710
+ )
711
+ subgroup_pmi_df = self._set_idx_cols_from_cache(
712
+ subgroup_pmi_df, subgroup, "pmi"
713
+ )
714
+ subgroup_npmi_df = self._set_idx_cols_from_cache(
715
+ subgroup_npmi_df, subgroup, "npmi"
716
+ )
717
+ return subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df
718
+ return False
719
+
720
+ def _set_idx_cols_from_cache(self, csv_df, subgroup=None, calc_str=None):
721
+ """
722
+ Helps make sure all of the read-in files can be accessed within code
723
+ via standardized indices and column names.
724
+ :param csv_df:
725
+ :param subgroup:
726
+ :param calc_str:
727
+ :return:
728
+ """
729
+ # The csv saves with this column instead of the index, so that's weird.
730
+ if "Unnamed: 0" in csv_df.columns:
731
+ csv_df = csv_df.set_index("Unnamed: 0")
732
+ csv_df.index.name = WORD
733
+ elif WORD in csv_df.columns:
734
+ csv_df = csv_df.set_index(WORD)
735
+ csv_df.index.name = WORD
736
+ elif VOCAB in csv_df.columns:
737
+ csv_df = csv_df.set_index(VOCAB)
738
+ csv_df.index.name = WORD
739
+ if subgroup and calc_str:
740
+ csv_df.columns = [subgroup + "-" + calc_str]
741
+ elif subgroup:
742
+ csv_df.columns = [subgroup]
743
+ elif calc_str:
744
+ csv_df.columns = [calc_str]
745
+ return csv_df
746
+
747
+ def get_available_terms(self, use_cache=False):
748
+ return self.load_or_prepare_npmi_terms(use_cache=use_cache)
749
+
750
+ def dummy(doc):
751
+ return doc
752
+
753
+ def count_vocab_frequencies(tokenized_df):
754
+ """
755
+ Based on an input pandas DataFrame with a 'text' column,
756
+ this function will count the occurrences of all words.
757
+ :return: [num_words x num_sentences] DataFrame with the rows corresponding to the
758
+ different vocabulary words and the column to the presence (0 or 1) of that word.
759
+ """
760
+
761
+ cvec = CountVectorizer(
762
+ tokenizer=dummy,
763
+ preprocessor=dummy,
764
+ )
765
+ # We do this to calculate per-word statistics
766
+ # Fast calculation of single word counts
767
+ logs.info("Fitting dummy tokenization to make matrix using the previous tokenization")
768
+ cvec.fit(tokenized_df[TOKENIZED_FIELD])
769
+ document_matrix = cvec.transform(tokenized_df[TOKENIZED_FIELD])
770
+ batches = np.linspace(0, tokenized_df.shape[0], _NUM_VOCAB_BATCHES).astype(int)
771
+ i = 0
772
+ tf = []
773
+ while i < len(batches) - 1:
774
+ logs.info("%s of %s vocab batches" % (str(i), str(len(batches))))
775
+ batch_result = np.sum(
776
+ document_matrix[batches[i] : batches[i + 1]].toarray(), axis=0
777
+ )
778
+ tf.append(batch_result)
779
+ i += 1
780
+ word_count_df = pd.DataFrame(
781
+ [np.sum(tf, axis=0)], columns=cvec.get_feature_names()
782
+ ).transpose()
783
+ # Now organize everything into the dataframes
784
+ word_count_df.columns = [CNT]
785
+ word_count_df.index.name = WORD
786
+ return word_count_df
787
+
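A toy call showing the expected input and output (illustrative only; it assumes an older scikit-learn where get_feature_names, used above, still exists):

import pandas as pd

toy_df = pd.DataFrame({TOKENIZED_FIELD: [("the", "cat"), ("the", "dog")]})
counts = count_vocab_frequencies(toy_df)
# counts is indexed by word with a single "count" column:
#   the -> 2, cat -> 1, dog -> 1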
788
+ def calc_p_word(word_count_df):
789
+ # p(word)
790
+ word_count_df[PROP] = word_count_df[CNT] / float(sum(word_count_df[CNT]))
791
+ vocab_counts_df = pd.DataFrame(word_count_df.sort_values(by=CNT, ascending=False))
792
+ vocab_counts_df[VOCAB] = vocab_counts_df.index
793
+ return vocab_counts_df
794
+
795
+
796
+ def filter_vocab(vocab_counts_df):
797
+ # TODO: Add warnings (which words are missing) to log file?
798
+ filtered_vocab_counts_df = vocab_counts_df.drop(_CLOSED_CLASS,
799
+ errors="ignore")
800
+ filtered_count = filtered_vocab_counts_df[CNT]
801
+ filtered_count_denom = float(sum(filtered_vocab_counts_df[CNT]))
802
+ filtered_vocab_counts_df[PROP] = filtered_count / filtered_count_denom
803
+ return filtered_vocab_counts_df
804
+
805
+
806
+ ## Figures ##
807
+
808
+
809
+ def make_fig_lengths(tokenized_df, length_field):
810
+ fig_tok_length = px.histogram(
811
+ tokenized_df, x=length_field, marginal="rug", hover_data=[length_field]
812
+ )
813
+ return fig_tok_length
814
+
815
+
816
+ def make_fig_labels(label_df, label_names, label_field):
817
+ labels = label_df[label_field].unique()
818
+ label_sums = [len(label_df[label_df[label_field] == label]) for label in labels]
819
+ fig_labels = px.pie(label_df, values=label_sums, names=label_names)
820
+ return fig_labels
821
+
822
+
823
+ def make_zipf_fig_ranked_word_list(vocab_df, unique_counts, unique_ranks):
824
+ ranked_words = {}
825
+ for count, rank in zip(unique_counts, unique_ranks):
826
+ vocab_df.loc[vocab_df[CNT] == count, "rank"] = rank
827
+ ranked_words[rank] = ",".join(
828
+ vocab_df[vocab_df[CNT] == count].index.astype(str)
829
+ ) # Use the hovertext kw argument for hover text
830
+ ranked_words_list = [wrds for rank, wrds in sorted(ranked_words.items())]
831
+ return ranked_words_list
832
+
833
+
834
+ def make_npmi_fig(paired_results, subgroup_pair):
835
+ subgroup1, subgroup2 = subgroup_pair
836
+ UI_results = pd.DataFrame()
837
+ if "npmi-bias" in paired_results:
838
+ UI_results["npmi-bias"] = paired_results["npmi-bias"].astype(float)
839
+ UI_results[subgroup1 + "-npmi"] = paired_results["npmi"][
840
+ subgroup1 + "-npmi"
841
+ ].astype(float)
842
+ UI_results[subgroup1 + "-count"] = paired_results["count"][
843
+ subgroup1 + "-count"
844
+ ].astype(int)
845
+ if subgroup1 != subgroup2:
846
+ UI_results[subgroup2 + "-npmi"] = paired_results["npmi"][
847
+ subgroup2 + "-npmi"
848
+ ].astype(float)
849
+ UI_results[subgroup2 + "-count"] = paired_results["count"][
850
+ subgroup2 + "-count"
851
+ ].astype(int)
852
+ return UI_results.sort_values(by="npmi-bias", ascending=True)
853
+
854
+
855
+ def make_zipf_fig(vocab_counts_df, z):
856
+ zipf_counts = z.calc_zipf_counts(vocab_counts_df)
857
+ unique_counts = z.uniq_counts
858
+ unique_ranks = z.uniq_ranks
859
+ ranked_words_list = make_zipf_fig_ranked_word_list(
860
+ vocab_counts_df, unique_counts, unique_ranks
861
+ )
862
+ zmin = z.get_xmin()
863
+ logs.info("zipf counts is")
864
+ logs.info(zipf_counts)
865
+ layout = go.Layout(xaxis=dict(range=[0, 100]))
866
+ fig = go.Figure(
867
+ data=[
868
+ go.Bar(
869
+ x=z.uniq_ranks,
870
+ y=z.uniq_counts,
871
+ hovertext=ranked_words_list,
872
+ name="Word Rank Frequency",
873
+ )
874
+ ],
875
+ layout=layout,
876
+ )
877
+ fig.add_trace(
878
+ go.Scatter(
879
+ x=z.uniq_ranks[zmin : len(z.uniq_ranks)],
880
+ y=zipf_counts[zmin : len(z.uniq_ranks)],
881
+ hovertext=ranked_words_list[zmin : len(z.uniq_ranks)],
882
+ line=go.scatter.Line(color="crimson", width=3),
883
+ name="Zipf Predicted Frequency",
884
+ )
885
+ )
886
+ # Customize aspect
887
+ # fig.update_traces(marker_color='limegreen',
888
+ # marker_line_width=1.5, opacity=0.6)
889
+ fig.update_layout(title_text="Word Counts, Observed and Predicted by Zipf")
890
+ fig.update_layout(xaxis_title="Word Rank")
891
+ fig.update_layout(yaxis_title="Frequency")
892
+ fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.10))
893
+ return fig
894
+
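As a reminder of what the fitted line above represents (the fit itself lives in the separate .zipf module, not shown in this commit), the predicted counts are assumed to follow a power law in rank:

# predicted_count(rank) ~ C * rank ** (-alpha)
# with alpha estimated from the data and xmin marking where the fit starts.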
895
+
896
+ ## Input/Output ##
897
+
898
+
899
+ def define_subgroup_files(subgroup_list, pmi_cache_path):
900
+ """
901
+ Sets the file ids for the input identity terms
902
+ :param subgroup_list: List of identity terms
903
+ :return:
904
+ """
905
+ subgroup_files = {}
906
+ for subgroup in subgroup_list:
907
+ # TODO: Should the pmi, npmi, and count just be one file?
908
+ subgroup_npmi_fid = pjoin(pmi_cache_path, subgroup + "_npmi.csv")
909
+ subgroup_pmi_fid = pjoin(pmi_cache_path, subgroup + "_pmi.csv")
910
+ subgroup_cooc_fid = pjoin(pmi_cache_path, subgroup + "_vocab_cooc.csv")
911
+ subgroup_files[subgroup] = (
912
+ subgroup_npmi_fid,
913
+ subgroup_pmi_fid,
914
+ subgroup_cooc_fid,
915
+ )
916
+ return subgroup_files
917
+
918
+
919
+ ## Input/Output ##
920
+
921
+
922
+ def intersect_dfs(df_dict):
923
+ started = 0
924
+ new_df = None
925
+ for key, df in df_dict.items():
926
+ if df is None:
927
+ continue
928
+ for key2, df2 in df_dict.items():
929
+ if df2 is None:
930
+ continue
931
+ if key == key2:
932
+ continue
933
+ if started:
934
+ new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2")
935
+ else:
936
+ new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2")
937
+ started = 1
938
+ return new_df.copy()
939
+
940
+
941
+ def write_df(df, df_fid):
942
+ feather.write_feather(df, df_fid)
943
+
944
+
945
+ def write_json(json_dict, json_fid):
946
+ with open(json_fid, "w", encoding="utf-8") as f:
947
+ json.dump(json_dict, f)
948
+
949
+
950
+ def write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files):
951
+ """
952
+ Saves the calculated nPMI statistics to their output files.
953
+ Includes the npmi scores for each identity term, the pmi scores, and the
954
+ co-occurrence counts of the identity term with all the other words
955
+ :param subgroup: Identity term
956
+ :return:
957
+ """
958
+ subgroup_fids = subgroup_files[subgroup]
959
+ subgroup_npmi_fid, subgroup_pmi_fid, subgroup_cooc_fid = subgroup_fids
960
+ subgroup_dfs = subgroup_dict[subgroup]
961
+ subgroup_cooc_df, subgroup_pmi_df, subgroup_npmi_df = subgroup_dfs
962
+ with open(subgroup_npmi_fid, "w+") as f:
963
+ subgroup_npmi_df.to_csv(f)
964
+ with open(subgroup_pmi_fid, "w+") as f:
965
+ subgroup_pmi_df.to_csv(f)
966
+ with open(subgroup_cooc_fid, "w+") as f:
967
+ subgroup_cooc_df.to_csv(f)
968
+
969
+
970
+ def write_zipf_data(z, zipf_fid):
971
+ zipf_dict = {}
972
+ zipf_dict["xmin"] = int(z.xmin)
973
+ zipf_dict["xmax"] = int(z.xmax)
974
+ zipf_dict["alpha"] = float(z.alpha)
975
+ zipf_dict["ks_distance"] = float(z.distance)
976
+ zipf_dict["p-value"] = float(z.ks_test.pvalue)
977
+ zipf_dict["uniq_counts"] = [int(count) for count in z.uniq_counts]
978
+ zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
979
+ with open(zipf_fid, "w+", encoding="utf-8") as f:
980
+ json.dump(zipf_dict, f)
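Taken together, the class above is presumably driven roughly as follows from the app layer; the dataset, fields, and paths here are illustrative, and the cache directory (plus ./log_files/ for the module-level logger) is assumed to already exist:

from data_measurements.dataset_statistics import DatasetStatisticsCacheClass

dstats = DatasetStatisticsCacheClass(
    cache_dir="./cache_dir",        # hypothetical, assumed to exist already
    dset_name="imdb",
    dset_config="plain_text",
    split_name="train",
    text_field=["text"],
    label_field=["label"],
    label_names=["neg", "pos"],
)
dstats.load_or_prepare_dataset(use_cache=True)        # text + tokenized dataframes
dstats.load_or_prepare_vocab(use_cache=True)          # word counts / proportions
dstats.load_or_prepare_general_stats(use_cache=True)  # NaNs and duplicates
dstats.load_or_prepare_zipf(use_cache=True)           # Zipf fit and figure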
data_measurements/dataset_utils.py ADDED
@@ -0,0 +1,292 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ from dataclasses import asdict
17
+ from os.path import exists
18
+
19
+ import pandas as pd
20
+ from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk
21
+
22
+ # treating inf values as NaN as well
23
+ pd.set_option("use_inf_as_na", True)
24
+
25
+ ## String names used in Hugging Face dataset configs.
26
+ HF_FEATURE_FIELD = "features"
27
+ HF_LABEL_FIELD = "label"
28
+ HF_DESC_FIELD = "description"
29
+
30
+ CACHE_DIR = "cache_dir"
31
+ ## String names we are using within this code.
32
+ # These are not coming from the stored dataset nor HF config,
33
+ # but rather used as identifiers in our dicts and dataframes.
34
+ OUR_TEXT_FIELD = "text"
35
+ OUR_LABEL_FIELD = "label"
36
+ TOKENIZED_FIELD = "tokenized_text"
37
+ EMBEDDING_FIELD = "embedding"
38
+ LENGTH_FIELD = "length"
39
+ VOCAB = "vocab"
40
+ WORD = "word"
41
+ CNT = "count"
42
+ PROP = "proportion"
43
+ TEXT_NAN_CNT = "text_nan_count"
44
+ TXT_LEN = "text lengths"
45
+ DEDUP_TOT = "dedup_total"
46
+
47
+ _DATASET_LIST = [
48
+ "c4",
49
+ "squad",
50
+ "squad_v2",
51
+ "hate_speech18",
52
+ "hate_speech_offensive",
53
+ "glue",
54
+ "super_glue",
55
+ "wikitext",
56
+ "imdb",
57
+ ]
58
+
59
+ _STREAMABLE_DATASET_LIST = [
60
+ "c4",
61
+ "wikitext",
62
+ ]
63
+
64
+ _MAX_ROWS = 200000
65
+
66
+
67
+ def load_truncated_dataset(
68
+ dataset_name,
69
+ config_name,
70
+ split_name,
71
+ num_rows=_MAX_ROWS,
72
+ cache_name=None,
73
+ use_cache=True,
74
+ use_streaming=True,
75
+ ):
76
+ """
77
+ This function loads the first `num_rows` items of a dataset for a
78
+ given `config_name` and `split_name`.
79
+ If `cache_name` exists, the truncated dataset is loaded from `cache_name`.
80
+ Otherwise, a new truncated dataset is created and immediately saved
81
+ to `cache_name`.
82
+ When the dataset is streamable, we iterate through the first
83
+ `num_rows` examples in streaming mode, write them to a jsonl file,
84
+ then create a new dataset from the json.
85
+ This is the most direct way to make a Dataset from an IterableDataset
86
+ as of datasets version 1.6.1.
87
+ Otherwise, we download the full dataset and select the first
88
+ `num_rows` items
89
+ Args:
90
+ dataset_name (string):
91
+ dataset id in the dataset library
92
+ config_name (string):
93
+ dataset configuration
94
+ split_name (string):
95
+ split name
96
+ num_rows (int):
97
+ number of rows to truncate the dataset to
98
+ cache_name (string):
99
+ name of the cache directory
100
+ use_cache (bool):
101
+ whether to load from the cache if it exists
102
+ use_streaming (bool):
103
+ whether to use streaming when the dataset supports it
104
+ Returns:
105
+ Dataset: the truncated dataset as a Dataset object
106
+ """
107
+ if cache_name is None:
108
+ cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}"
109
+ if exists(cache_name):
110
+ dataset = load_from_disk(cache_name)
111
+ else:
112
+ if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST:
113
+ iterable_dataset = load_dataset(
114
+ dataset_name,
115
+ name=config_name,
116
+ split=split_name,
117
+ streaming=True,
118
+ ).take(num_rows)
119
+ rows = list(iterable_dataset)
120
+ f = open("temp.jsonl", "w", encoding="utf-8")
121
+ for row in rows:
122
+ _ = f.write(json.dumps(row) + "\n")
123
+ f.close()
124
+ dataset = Dataset.from_json(
125
+ "temp.jsonl", features=iterable_dataset.features, split=split_name
126
+ )
127
+ else:
128
+ full_dataset = load_dataset(
129
+ dataset_name,
130
+ name=config_name,
131
+ split=split_name,
132
+ )
133
+ dataset = full_dataset.select(range(num_rows))
134
+ dataset.save_to_disk(cache_name)
135
+ return dataset
136
+
137
+
138
+ def intersect_dfs(df_dict):
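For example (the dataset choice is arbitrary), loading the head of a split looks like this; "wikitext" takes the streaming path above, while non-streamable datasets fall back to a full download plus select:

head = load_truncated_dataset("wikitext", "wikitext-103-v1", "train", num_rows=1000)
print(len(head))  # 1000, assuming the split has at least that many rows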
139
+ started = 0
140
+ new_df = None
141
+ for key, df in df_dict.items():
142
+ if df is None:
143
+ continue
144
+ for key2, df2 in df_dict.items():
145
+ if df2 is None:
146
+ continue
147
+ if key == key2:
148
+ continue
149
+ if started:
150
+ new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2")
151
+ else:
152
+ new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2")
153
+ started = 1
154
+ return new_df.copy()
155
+
156
+
157
+ def get_typed_features(features, ftype="string", parents=None):
158
+ """
159
+ Recursively get a list of all features of a certain dtype
160
+ :param features:
161
+ :param ftype:
162
+ :param parents:
163
+ :return: a list of tuples > e.g. ('A', 'B', 'C') for feature example['A']['B']['C']
164
+ """
165
+ if parents is None:
166
+ parents = []
167
+ typed_features = []
168
+ for name, feat in features.items():
169
+ if isinstance(feat, dict):
170
+ if feat.get("dtype", None) == ftype or feat.get("feature", {}).get(
171
+ ("dtype", None) == ftype
172
+ ):
173
+ typed_features += [tuple(parents + [name])]
174
+ elif "feature" in feat:
175
+ if feat["feature"].get("dtype", None) == ftype:
176
+ typed_features += [tuple(parents + [name])]
177
+ elif isinstance(feat["feature"], dict):
178
+ typed_features += get_typed_features(
179
+ feat["feature"], ftype, parents + [name]
180
+ )
181
+ else:
182
+ for k, v in feat.items():
183
+ if isinstance(v, dict):
184
+ typed_features += get_typed_features(
185
+ v, ftype, parents + [name, k]
186
+ )
187
+ elif name == "dtype" and feat == ftype:
188
+ typed_features += [tuple(parents)]
189
+ return typed_features
190
+
191
+
192
+ def get_label_features(features, parents=None):
193
+ """
194
+ Recursively get a list of all features that are ClassLabels
195
+ :param features:
196
+ :param parents:
197
+ :return: pairs of tuples as above and the list of class names
198
+ """
199
+ if parents is None:
200
+ parents = []
201
+ label_features = []
202
+ for name, feat in features.items():
203
+ if isinstance(feat, dict):
204
+ if "names" in feat:
205
+ label_features += [(tuple(parents + [name]), feat["names"])]
206
+ elif "feature" in feat:
207
+ if "names" in feat:
208
+ label_features += [
209
+ (tuple(parents + [name]), feat["feature"]["names"])
210
+ ]
211
+ elif isinstance(feat["feature"], dict):
212
+ label_features += get_label_features(
213
+ feat["feature"], parents + [name]
214
+ )
215
+ else:
216
+ for k, v in feat.items():
217
+ if isinstance(v, dict):
218
+ label_features += get_label_features(v, parents + [name, k])
219
+ elif name == "names":
220
+ label_features += [(tuple(parents), feat)]
221
+ return label_features
222
+
223
+
224
+ # get the info we need for the app sidebar in dict format
225
+ def dictionarize_info(dset_info):
226
+ info_dict = asdict(dset_info)
227
+ res = {
228
+ "config_name": info_dict["config_name"],
229
+ "splits": {
230
+ spl: spl_info["num_examples"]
231
+ for spl, spl_info in info_dict["splits"].items()
232
+ },
233
+ "features": {
234
+ "string": get_typed_features(info_dict["features"], "string"),
235
+ "int32": get_typed_features(info_dict["features"], "int32"),
236
+ "float32": get_typed_features(info_dict["features"], "float32"),
237
+ "label": get_label_features(info_dict["features"]),
238
+ },
239
+ "description": dset_info.description,
240
+ }
241
+ return res
242
+
243
+
244
+ def get_dataset_info_dicts(dataset_id=None):
245
+ """
246
+ Creates a dict from dataset configs.
247
+ Uses the datasets lib's get_dataset_infos
248
+ :return: Dictionary mapping dataset names to their configurations
249
+ """
250
+ if dataset_id is not None:
251
+ ds_name_to_conf_dict = {
252
+ dataset_id: {
253
+ config_name: dictionarize_info(config_info)
254
+ for config_name, config_info in get_dataset_infos(dataset_id).items()
255
+ }
256
+ }
257
+ else:
258
+ ds_name_to_conf_dict = {
259
+ ds_id: {
260
+ config_name: dictionarize_info(config_info)
261
+ for config_name, config_info in get_dataset_infos(ds_id).items()
262
+ }
263
+ for ds_id in _DATASET_LIST
264
+ }
265
+ return ds_name_to_conf_dict
266
+
267
+
268
+ # get all instances of a specific field in a dataset
269
+ def extract_field(examples, field_path, new_field_name=None):
270
+ if new_field_name is None:
271
+ new_field_name = "_".join(field_path)
272
+ field_list = []
273
+ # TODO: Breaks the CLI if this isn't checked.
274
+ if isinstance(field_path, str):
275
+ field_path = [field_path]
276
+ item_list = examples[field_path[0]]
277
+ for field_name in field_path[1:]:
278
+ item_list = [
279
+ next_item
280
+ for item in item_list
281
+ for next_item in (
282
+ item[field_name]
283
+ if isinstance(item[field_name], list)
284
+ else [item[field_name]]
285
+ )
286
+ ]
287
+ field_list += [
288
+ field
289
+ for item in item_list
290
+ for field in (item if isinstance(item, list) else [item])
291
+ ]
292
+ return {new_field_name: field_list}
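As a concrete illustration of the nested-field handling, using a made-up batch shaped like SQuAD's answers field:

batch = {"answers": [{"text": ["Denver Broncos"]}, {"text": ["Santa Clara", "Levi's Stadium"]}]}
extract_field(batch, ["answers", "text"], OUR_TEXT_FIELD)
# {"text": ["Denver Broncos", "Santa Clara", "Levi's Stadium"]}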
data_measurements/embeddings.py ADDED
@@ -0,0 +1,448 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from os.path import exists
17
+ from os.path import join as pjoin
18
+
19
+ import plotly.graph_objects as go
20
+ import torch
21
+ import transformers
22
+ from datasets import load_from_disk
23
+ from tqdm import tqdm
24
+
25
+ from .dataset_utils import EMBEDDING_FIELD, OUR_TEXT_FIELD
26
+
27
+
28
+ def sentence_mean_pooling(model_output, attention_mask):
29
+ token_embeddings = model_output[
30
+ 0
31
+ ] # First element of model_output contains all token embeddings
32
+ input_mask_expanded = (
33
+ attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
34
+ )
35
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
36
+ input_mask_expanded.sum(1), min=1e-9
37
+ )
38
+
39
+
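A quick shape check of the pooling helper with toy tensors (illustrative only):

import torch

token_embeddings = torch.randn(2, 3, 4)                # batch of 2, 3 tokens, hidden size 4
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])  # second sentence has one pad token
pooled = sentence_mean_pooling((token_embeddings,), attention_mask)
print(pooled.shape)  # torch.Size([2, 4]) -- padded positions are masked out of the mean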
40
+ class Embeddings:
41
+ def __init__(self, dstats, use_cache=False):
42
+ """Item embeddings and clustering"""
43
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
44
+ self.node_list = None
45
+ self.nid_map = None
46
+ self.embeddings_dset = None
47
+ self.fig_tree = None
48
+ self.cached_clusters = {}
49
+ self.dstats = dstats
50
+ self.cache_path = dstats.cache_path
51
+ self.node_list_fid = pjoin(self.cache_path, "node_list.th")
52
+ self.use_cache = use_cache
53
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(
54
+ "sentence-transformers/all-mpnet-base-v2"
55
+ )
56
+ self.model = transformers.AutoModel.from_pretrained(
57
+ "sentence-transformers/all-mpnet-base-v2"
58
+ ).to(self.device)
59
+
60
+ def make_text_embeddings(self):
61
+ embeddings_dset_fid = pjoin(self.cache_path, "embeddings_dset")
62
+ if self.use_cache and exists(embeddings_dset_fid):
63
+ self.embeddings_dset = load_from_disk(embeddings_dset_fid)
64
+ else:
65
+ self.embeddings_dset = self.make_embeddings()
66
+ self.embeddings_dset.save_to_disk(embeddings_dset_fid)
67
+
68
+ def make_hierarchical_clustering(self):
69
+ if self.use_cache and exists(self.node_list_fid):
70
+ self.node_list = torch.load(self.node_list_fid)
71
+ else:
72
+ self.make_text_embeddings()
73
+ self.node_list = self.fast_cluster(self.embeddings_dset, EMBEDDING_FIELD)
74
+ torch.save(self.node_list, self.node_list_fid)
75
+ self.nid_map = dict(
76
+ [(node["nid"], nid) for nid, node in enumerate(self.node_list)]
77
+ )
78
+ self.fig_tree = make_tree_plot(self.node_list, self.dstats.text_dset)
79
+
80
+ def compute_sentence_embeddings(self, sentences):
81
+ batch = self.tokenizer(
82
+ sentences, padding=True, truncation=True, return_tensors="pt"
83
+ )
84
+ batch = {k: v.to(self.device) for k, v in batch.items()}
85
+ with torch.no_grad():
86
+ model_output = self.model(**batch)
87
+ sentence_embeds = sentence_mean_pooling(
88
+ model_output, batch["attention_mask"]
89
+ )
90
+ sentence_embeds /= sentence_embeds.norm(dim=-1, keepdim=True)
91
+ return sentence_embeds
92
+
93
+ def make_embeddings(self):
94
+ def batch_embed_sentences(sentences):
95
+ return {
96
+ EMBEDDING_FIELD: [
97
+ embed.tolist()
98
+ for embed in self.compute_sentence_embeddings(
99
+ sentences[OUR_TEXT_FIELD]
100
+ )
101
+ ]
102
+ }
103
+
104
+ text_dset_embeds = self.dstats.text_dset.map(
105
+ batch_embed_sentences,
106
+ batched=True,
107
+ batch_size=32,
108
+ remove_columns=[self.dstats.our_text_field],
109
+ )
110
+
111
+ return text_dset_embeds
112
+
113
+ @staticmethod
114
+ def prepare_merges(embeddings, batch_size, low_thres=0.5):
115
+ top_idx_pre = torch.cat(
116
+ [torch.LongTensor(range(embeddings.shape[0]))[:, None]] * batch_size, dim=1
117
+ )
118
+ top_val_all = torch.Tensor(0, batch_size)
119
+ top_idx_all = torch.LongTensor(0, batch_size)
120
+ n_batches = math.ceil(len(embeddings) / batch_size)
121
+ for b in tqdm(range(n_batches)):
122
+ cos_scores = torch.mm(
123
+ embeddings[b * batch_size : (b + 1) * batch_size], embeddings.t()
124
+ )
125
+ for i in range(cos_scores.shape[0]):
126
+ cos_scores[i, (b * batch_size) + i :] = -1
127
+ top_val_large, top_idx_large = cos_scores.topk(
128
+ k=batch_size, dim=-1, largest=True
129
+ )
130
+ top_val_all = torch.cat([top_val_all, top_val_large], dim=0)
131
+ top_idx_all = torch.cat([top_idx_all, top_idx_large], dim=0)
132
+
133
+ all_merges = torch.cat(
134
+ [
135
+ top_idx_pre[top_val_all > low_thres][:, None],
136
+ top_idx_all[top_val_all > low_thres][:, None],
137
+ ],
138
+ dim=1,
139
+ )
140
+ all_merge_scores = top_val_all[top_val_all > low_thres]
141
+ return (all_merges, all_merge_scores)
142
+
143
+ @staticmethod
144
+ def merge_nodes(nodes, current_thres, previous_thres, all_merges, all_merge_scores):
145
+ merge_ids = (all_merge_scores <= previous_thres) * (
146
+ all_merge_scores > current_thres
147
+ )
148
+ merges = all_merges[merge_ids]
149
+ for a, b in merges.tolist():
150
+ node_a = nodes[a]
151
+ while node_a["parent_id"] != -1:
152
+ node_a = nodes[node_a["parent_id"]]
153
+ node_b = nodes[b]
154
+ while node_b["parent_id"] != -1:
155
+ node_b = nodes[node_b["parent_id"]]
156
+ if node_a["nid"] == node_b["nid"]:
157
+ continue
158
+ else:
159
+ # merge if threshold allows
160
+ if (node_a["depth"] + node_b["depth"]) > 0 and min(
161
+ node_a["merge_threshold"], node_b["merge_threshold"]
162
+ ) == current_thres:
163
+ merge_to = None
164
+ merge_from = None
165
+ if node_a["nid"] < node_b["nid"]:
166
+ merge_from = node_a
167
+ merge_to = node_b
168
+ if node_a["nid"] > node_b["nid"]:
169
+ merge_from = node_b
170
+ merge_to = node_a
171
+ merge_to["depth"] = max(merge_to["depth"], merge_from["depth"])
172
+ merge_to["weight"] += merge_from["weight"]
173
+ merge_to["children_ids"] += (
174
+ merge_from["children_ids"]
175
+ if merge_from["depth"] > 0
176
+ else [merge_from["nid"]]
177
+ )
178
+ for cid in merge_from["children_ids"]:
179
+ nodes[cid]["parent_id"] = merge_to["nid"]
180
+ merge_from["parent_id"] = merge_to["nid"]
181
+ # else new node
182
+ else:
183
+ new_nid = len(nodes)
184
+ new_node = {
185
+ "nid": new_nid,
186
+ "parent_id": -1,
187
+ "depth": max(node_a["depth"], node_b["depth"]) + 1,
188
+ "weight": node_a["weight"] + node_b["weight"],
189
+ "children": [],
190
+ "children_ids": [node_a["nid"], node_b["nid"]],
191
+ "example_ids": [],
192
+ "merge_threshold": current_thres,
193
+ }
194
+ node_a["parent_id"] = new_nid
195
+ node_b["parent_id"] = new_nid
196
+ nodes += [new_node]
197
+ return nodes
198
+
199
+ def finalize_node(self, node, nodes, min_cluster_size):
200
+ node["children"] = sorted(
201
+ [
202
+ self.finalize_node(nodes[cid], nodes, min_cluster_size)
203
+ for cid in node["children_ids"]
204
+ ],
205
+ key=lambda x: x["weight"],
206
+ reverse=True,
207
+ )
208
+ if node["depth"] > 0:
209
+ node["example_ids"] = [
210
+ eid for child in node["children"] for eid in child["example_ids"]
211
+ ]
212
+ node["children"] = [
213
+ child for child in node["children"] if child["weight"] >= min_cluster_size
214
+ ]
215
+ assert node["weight"] == len(node["example_ids"]), print(node)
216
+ return node
217
+
218
+ def fast_cluster(
219
+ self,
220
+ text_dset_embeds,
221
+ embedding_field,
222
+ batch_size=1000,
223
+ min_cluster_size=10,
224
+ low_thres=0.5,
225
+ ):
226
+ embeddings = torch.Tensor(text_dset_embeds[embedding_field])
227
+ batch_size = min(embeddings.shape[0], batch_size)
228
+ all_merges, all_merge_scores = self.prepare_merges(
229
+ embeddings, batch_size, low_thres
230
+ )
231
+ # prepare leaves
232
+ nodes = [
233
+ {
234
+ "nid": nid,
235
+ "parent_id": -1,
236
+ "depth": 0,
237
+ "weight": 1,
238
+ "children": [],
239
+ "children_ids": [],
240
+ "example_ids": [nid],
241
+ "merge_threshold": 1.0,
242
+ }
243
+ for nid in range(embeddings.shape[0])
244
+ ]
245
+ # one level per threshold range
246
+ for i in range(10):
247
+ p_thres = 1 - i * 0.05
248
+ c_thres = 0.95 - i * 0.05
249
+ nodes = self.merge_nodes(
250
+ nodes, c_thres, p_thres, all_merges, all_merge_scores
251
+ )
252
+ # make root
253
+ root_children = [
254
+ node
255
+ for node in nodes
256
+ if node["parent_id"] == -1 and node["weight"] >= min_cluster_size
257
+ ]
258
+ root = {
259
+ "nid": len(nodes),
260
+ "parent_id": -1,
261
+ "depth": max([node["depth"] for node in root_children]) + 1,
262
+ "weight": sum([node["weight"] for node in root_children]),
263
+ "children": [],
264
+ "children_ids": [node["nid"] for node in root_children],
265
+ "example_ids": [],
266
+ "merge_threshold": -1.0,
267
+ }
268
+ nodes += [root]
269
+ for node in root_children:
270
+ node["parent_id"] = root["nid"]
271
+ # finalize tree
272
+ tree = self.finalize_node(root, nodes, min_cluster_size)
273
+ node_list = []
274
+
275
+ def rec_map_nodes(node, node_list):
276
+ node_list += [node]
277
+ for child in node["children"]:
278
+ rec_map_nodes(child, node_list)
279
+
280
+ rec_map_nodes(tree, node_list)
281
+ # get centroids and distances
282
+ for node in node_list:
283
+ node_embeds = embeddings[node["example_ids"]]
284
+ node["centroid"] = node_embeds.sum(dim=0)
285
+ node["centroid"] /= node["centroid"].norm()
286
+ node["centroid_dot_prods"] = torch.mv(node_embeds, node["centroid"])
287
+ node["sorted_examples_centroid"] = sorted(
288
+ [
289
+ (eid, edp.item())
290
+ for eid, edp in zip(node["example_ids"], node["centroid_dot_prods"])
291
+ ],
292
+ key=lambda x: x[1],
293
+ reverse=True,
294
+ )
295
+ return node_list
296
+
297
+ def find_cluster_beam(self, sentence, beam_size=20):
298
+ """
299
+ This function finds the `beam_size` leaf clusters that are closest to the
300
+ proposed sentence and returns the full path from the root to the cluster
301
+ along with the dot product between the sentence embedding and the
302
+ cluster centroid
303
+ Args:
304
+ sentence (string): input sentence for which to find clusters
305
+ beam_size (int): width of the beam used to explore the tree
306
+ Returns:
307
+ [([int], float)]: list of (path_from_root, score) sorted by score
308
+ """
309
+ embed = self.compute_sentence_embeddings([sentence])[0].to("cpu")
310
+ active_paths = [([0], torch.dot(embed, self.node_list[0]["centroid"]).item())]
311
+ finished_paths = []
312
+ children_ids_list = [
313
+ [
314
+ self.nid_map[nid]
315
+ for nid in self.node_list[path[-1]]["children_ids"]
316
+ if nid in self.nid_map
317
+ ]
318
+ for path, score in active_paths
319
+ ]
320
+ while len(active_paths) > 0:
321
+ next_ids = sorted(
322
+ [
323
+ (
324
+ beam_id,
325
+ nid,
326
+ torch.dot(embed, self.node_list[nid]["centroid"]).item(),
327
+ )
328
+ for beam_id, children_ids in enumerate(children_ids_list)
329
+ for nid in children_ids
330
+ ],
331
+ key=lambda x: x[2],
332
+ reverse=True,
333
+ )[:beam_size]
334
+ paths = [
335
+ (active_paths[beam_id][0] + [next_id], score)
336
+ for beam_id, next_id, score in next_ids
337
+ ]
338
+ active_paths = []
339
+ for path, score in paths:
340
+ if (
341
+ len(
342
+ [
343
+ nid
344
+ for nid in self.node_list[path[-1]]["children_ids"]
345
+ if nid in self.nid_map
346
+ ]
347
+ )
348
+ > 0
349
+ ):
350
+ active_paths += [(path, score)]
351
+ else:
352
+ finished_paths += [(path, score)]
353
+ children_ids_list = [
354
+ [
355
+ self.nid_map[nid]
356
+ for nid in self.node_list[path[-1]]["children_ids"]
357
+ if nid in self.nid_map
358
+ ]
359
+ for path, score in active_paths
360
+ ]
361
+ return sorted(
362
+ finished_paths,
363
+ key=lambda x: x[-1],
364
+ reverse=True,
365
+ )[:beam_size]
366
+
367
+
368
+ def make_tree_plot(node_list, text_dset):
369
+ nid_map = dict([(node["nid"], nid) for nid, node in enumerate(node_list)])
370
+
371
+ for nid, node in enumerate(node_list):
372
+ node["label"] = node.get(
373
+ "label",
374
+ f"{nid:2d} - {node['weight']:5d} items <br>"
375
+ + "<br>".join(
376
+ [
377
+ "> " + txt[:64] + ("..." if len(txt) >= 63 else "")
378
+ for txt in list(
379
+ set(text_dset.select(node["example_ids"])[OUR_TEXT_FIELD])
380
+ )[:5]
381
+ ]
382
+ ),
383
+ )
384
+
385
+ # make plot nodes
386
+ # TODO: something more efficient than set to remove duplicates
387
+ labels = [node["label"] for node in node_list]
388
+
389
+ root = node_list[0]
390
+ root["X"] = 0
391
+ root["Y"] = 0
392
+
393
+ def rec_make_coordinates(node):
394
+ total_weight = 0
395
+ add_weight = len(node["example_ids"]) - sum(
396
+ [child["weight"] for child in node["children"]]
397
+ )
398
+ for child in node["children"]:
399
+ child["X"] = node["X"] + total_weight
400
+ child["Y"] = node["Y"] - 1
401
+ total_weight += child["weight"] + add_weight / len(node["children"])
402
+ rec_make_coordinates(child)
403
+
404
+ rec_make_coordinates(root)
405
+
406
+ E = [] # list of edges
407
+ Xn = []
408
+ Yn = []
409
+ Xe = []
410
+ Ye = []
411
+ for nid, node in enumerate(node_list):
412
+ Xn += [node["X"]]
413
+ Yn += [node["Y"]]
414
+ for child in node["children"]:
415
+ E += [(nid, nid_map[child["nid"]])]
416
+ Xe += [node["X"], child["X"], None]
417
+ Ye += [node["Y"], child["Y"], None]
418
+
419
+ # make figure
420
+ fig = go.Figure()
421
+ fig.add_trace(
422
+ go.Scatter(
423
+ x=Xe,
424
+ y=Ye,
425
+ mode="lines",
426
+ line=dict(color="rgb(210,210,210)", width=1),
427
+ hoverinfo="none",
428
+ )
429
+ )
430
+ fig.add_trace(
431
+ go.Scatter(
432
+ x=Xn,
433
+ y=Yn,
434
+ mode="markers",
435
+ name="nodes",
436
+ marker=dict(
437
+ symbol="circle-dot",
438
+ size=18,
439
+ color="#6175c1",
440
+ line=dict(color="rgb(50,50,50)", width=1)
441
+ # '#DB4551',
442
+ ),
443
+ text=labels,
444
+ hoverinfo="text",
445
+ opacity=0.8,
446
+ )
447
+ )
448
+ return fig
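Note on the clustering above: the hierarchy is built by sweeping a similarity threshold from 0.95 down to 0.5 and merging any pair of normalized embeddings whose dot product clears the current level. Below is a minimal sketch of that idea on toy data, using a plain union-find instead of the module's node dictionaries; the random embeddings and all names are illustrative only, not part of the tool.

import numpy as np

rng = np.random.default_rng(0)
embeds = rng.normal(size=(6, 4))
embeds /= np.linalg.norm(embeds, axis=1, keepdims=True)  # unit-normalize

sims = embeds @ embeds.T           # dot products = cosine similarities
parent = list(range(len(embeds)))  # union-find over example ids

def find(i):
    while parent[i] != i:
        parent[i] = parent[parent[i]]
        i = parent[i]
    return i

# Sweep thresholds from high to low; pairs merge at the first level they clear.
for thres in np.arange(0.95, 0.45, -0.05):
    for a in range(len(embeds)):
        for b in range(a + 1, len(embeds)):
            if sims[a, b] > thres:
                parent[find(a)] = find(b)
    clusters = {}
    for i in range(len(embeds)):
        clusters.setdefault(find(i), []).append(i)
    print(f"threshold {thres:.2f}: clusters {sorted(clusters.values())}")

The real implementation additionally tracks per-node weights, depths, and merge thresholds so intermediate levels can be drawn as a tree (make_tree_plot) and searched with find_cluster_beam.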
data_measurements/npmi.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import warnings
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ from sklearn.preprocessing import MultiLabelBinarizer
21
+
22
+ # Might be nice to print to log instead? Happens when we drop closed class.
23
+ warnings.filterwarnings(action="ignore", category=UserWarning)
24
+ # When we divide by 0 in log
25
+ np.seterr(divide="ignore")
26
+
27
+ # treating inf values as NaN as well
28
+ pd.set_option("use_inf_as_na", True)
29
+
30
+ logs = logging.getLogger(__name__)
31
+ logs.setLevel(logging.INFO)
32
+ logs.propagate = False
33
+
34
+ if not logs.handlers:
35
+
36
+ # Logging info to log file
37
+ file = logging.FileHandler("./log_files/npmi.log")
38
+ fileformat = logging.Formatter("%(asctime)s:%(message)s")
39
+ file.setLevel(logging.INFO)
40
+ file.setFormatter(fileformat)
41
+
42
+ # Logging debug messages to stream
43
+ stream = logging.StreamHandler()
44
+ streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
45
+ stream.setLevel(logging.WARNING)
46
+ stream.setFormatter(streamformat)
47
+
48
+ logs.addHandler(file)
49
+ logs.addHandler(stream)
50
+
51
+ _NUM_BATCHES = 500
52
+
53
+
54
+ class nPMI:
55
+ # TODO: Expand beyond pairwise
56
+ def __init__(
57
+ self,
58
+ vocab_counts_df,
59
+ tokenized_df,
60
+ tokenized_col_name="tokenized_text",
61
+ num_batches=_NUM_BATCHES,
62
+ ):
63
+ logs.info("Initiating npmi class.")
64
+ logs.info("vocab is")
65
+ logs.info(vocab_counts_df)
66
+ self.vocab_counts_df = vocab_counts_df
67
+ logs.info("tokenized is")
68
+ self.tokenized_df = tokenized_df
69
+ logs.info(self.tokenized_df)
70
+ self.tokenized_col_name = tokenized_col_name
71
+ # self.mlb_list holds num batches x num_sentences
72
+ self.mlb_list = []
73
+
74
+ def binarize_words_in_sentence(self):
75
+ logs.info("Creating co-occurrence matrix for PMI calculations.")
76
+ batches = np.linspace(0, self.tokenized_df.shape[0], _NUM_BATCHES).astype(int)
77
+ i = 0
78
+ # Creates list of size (# batches x # sentences)
79
+ while i < len(batches) - 1:
80
+ # Makes a sparse matrix (shape: # sentences x # words),
81
+ # with the occurrence of each word per sentence.
82
+ mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
83
+ logs.info(
84
+ "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
85
+ )
86
+ # Returns series: batch size x num_words
87
+ mlb_series = mlb.fit_transform(
88
+ self.tokenized_df[self.tokenized_col_name][batches[i] : batches[i + 1]]
89
+ )
90
+ i += 1
91
+ self.mlb_list.append(mlb_series)
92
+
93
+ def calc_cooccurrences(self, subgroup, subgroup_idx):
94
+ initialize = True
95
+ coo_df = None
96
+ # Big computation here! Should only happen once.
97
+ logs.info(
98
+ "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
99
+ )
100
+ if not self.mlb_list:
101
+ self.binarize_words_in_sentence()
102
+ for batch_id in range(len(self.mlb_list)):
103
+ logs.info(
104
+ "%s of %s co-occurrence count batches"
105
+ % (str(batch_id), str(len(self.mlb_list)))
106
+ )
107
+ # List of all the sentences (list of vocab) in that batch
108
+ batch_sentence_row = self.mlb_list[batch_id]
109
+ # Dataframe of # sentences in batch x vocabulary size
110
+ sent_batch_df = pd.DataFrame(batch_sentence_row)
111
+ # logs.info('sent batch df is')
112
+ # logs.info(sent_batch_df)
113
+ # Subgroup counts per-sentence for the given batch
114
+ subgroup_df = sent_batch_df[subgroup_idx]
115
+ subgroup_df.columns = [subgroup]
116
+ # Remove the sentences where the count of the subgroup is 0.
117
+ # This way we have less computation & resources needs.
118
+ subgroup_df = subgroup_df[subgroup_df > 0]
119
+ logs.info("Removing 0 counts, subgroup_df is")
120
+ logs.info(subgroup_df)
121
+ mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
122
+ logs.info("mlb subgroup only is")
123
+ logs.info(mlb_subgroup_only)
124
+ # Create cooccurrence matrix for the given subgroup and all words.
125
+ logs.info("Now we do the T.dot approach for co-occurrences")
126
+ batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
127
+
128
+ # Creates a batch-sized dataframe of co-occurrence counts.
129
+ # Note these could just be summed rather than be batch size.
130
+ if initialize:
131
+ coo_df = batch_coo_df
132
+ else:
133
+ coo_df = coo_df.add(batch_coo_df, fill_value=0)
134
+ logs.info("coo_df is")
135
+ logs.info(coo_df)
136
+ initialize = False
137
+ logs.info("Returning co-occurrence matrix")
138
+ logs.info(coo_df)
139
+ return pd.DataFrame(coo_df)
140
+
141
+ def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
142
+ """
143
+ Calculates nPMI metrics between paired subgroups.
144
+ Special handling for a subgroup paired with itself.
145
+ :param subgroup_npmi_dict:
146
+ :return:
147
+ """
148
+ paired_results_dict = {"npmi": {}, "pmi": {}, "count": {}}
149
+ # Canonical ordering. This is done previously, but just in case...
150
+ subgroup1, subgroup2 = sorted(subgroup_pair)
151
+ vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
152
+ logs.info("vocab cooc")
153
+ logs.info(vocab_cooc_df1)
154
+ if subgroup1 == subgroup2:
155
+ shared_npmi_df = npmi_df1
156
+ shared_pmi_df = pmi_df1
157
+ shared_vocab_cooc_df = vocab_cooc_df1
158
+ else:
159
+ vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
160
+ logs.info("vocab cooc2")
161
+ logs.info(vocab_cooc_df2)
162
+ # Note that lsuffix and rsuffix should not come into play.
163
+ shared_npmi_df = npmi_df1.join(
164
+ npmi_df2, how="inner", lsuffix="1", rsuffix="2"
165
+ )
166
+ shared_pmi_df = pmi_df1.join(pmi_df2, how="inner", lsuffix="1", rsuffix="2")
167
+ shared_vocab_cooc_df = vocab_cooc_df1.join(
168
+ vocab_cooc_df2, how="inner", lsuffix="1", rsuffix="2"
169
+ )
170
+ shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
171
+ shared_vocab_cooc_df = shared_vocab_cooc_df[
172
+ shared_vocab_cooc_df.index.notnull()
173
+ ]
174
+ logs.info("shared npmi df")
175
+ logs.info(shared_npmi_df)
176
+ logs.info("shared vocab df")
177
+ logs.info(shared_vocab_cooc_df)
178
+ npmi_bias = (
179
+ shared_npmi_df[subgroup1 + "-npmi"] - shared_npmi_df[subgroup2 + "-npmi"]
180
+ )
181
+ paired_results_dict["npmi-bias"] = npmi_bias.dropna()
182
+ paired_results_dict["npmi"] = shared_npmi_df.dropna()
183
+ paired_results_dict["pmi"] = shared_pmi_df.dropna()
184
+ paired_results_dict["count"] = shared_vocab_cooc_df.dropna()
185
+ return paired_results_dict
186
+
187
+ def calc_metrics(self, subgroup):
188
+ # Index of the subgroup word in the sparse vector
189
+ subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
190
+ logs.info("Calculating co-occurrences...")
191
+ df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
192
+ vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
193
+ logs.info(vocab_cooc_df)
194
+ logs.info("Calculating PMI...")
195
+ pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
196
+ logs.info(pmi_df)
197
+ logs.info("Calculating nPMI...")
198
+ npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
199
+ logs.info(npmi_df)
200
+ return vocab_cooc_df, pmi_df, npmi_df
201
+
202
+ def set_idx_cols(self, df_coo, subgroup):
203
+ """
204
+ :param df_coo: Co-occurrence counts for subgroup, length is num_words
205
+ :return:
206
+ """
207
+ count_df = df_coo.set_index(self.vocab_counts_df.index)
208
+ count_df.columns = [subgroup + "-count"]
209
+ count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
210
+ return count_df
211
+
212
+ def calc_PMI(self, vocab_cooc_df, subgroup):
213
+ """
214
+ # PMI(x;y) = h(y) - h(y|x)
215
+ # = h(subgroup) - h(subgroup|word)
216
+ # = log (p(subgroup|word) / p(subgroup))
217
+ # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
218
+ """
219
+ # Calculation of p(subgroup)
220
+ subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
221
+ # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
222
+ # Because the inidices match (the vocab words),
223
+ # this division doesn't need to specify the index (I think?!)
224
+ p_subgroup_g_word = (
225
+ vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df["count"]
226
+ )
227
+ logs.info("p_subgroup_g_word is")
228
+ logs.info(p_subgroup_g_word)
229
+ pmi_df = pd.DataFrame()
230
+ pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
231
+ # Note: A potentially faster solution for adding count, npmi,
232
+ # can be based on this zip idea:
233
+ # df_test['size_kb'], df_test['size_mb'], df_test['size_gb'] =
234
+ # zip(*df_test['size'].apply(sizes))
235
+ return pmi_df.dropna()
236
+
237
+ def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
238
+ """
239
+ # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
240
+ # = -log(p(word|subgroup)p(word))
241
+ """
242
+ p_word_g_subgroup = vocab_cooc_df[subgroup + "-count"] / sum(
243
+ vocab_cooc_df[subgroup + "-count"]
244
+ )
245
+ p_word = pmi_df.apply(
246
+ lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
247
+ )
248
+ normalize_pmi = -np.log(p_word_g_subgroup * p_word)
249
+ npmi_df = pd.DataFrame()
250
+ npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
251
+ return npmi_df.dropna()
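For readers following the math, here is a toy walk-through of the two main steps implemented above: binarizing tokenized sentences into a sentence-by-word presence matrix, then applying the PMI and nPMI formulas for a single subgroup term. The sentences, the subgroup word "she", and all variable names are made up for illustration; the class itself operates on the tool's vocab_counts_df and tokenized_df.

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

np.seterr(divide="ignore", invalid="ignore")  # log(0) for words that never co-occur

sentences = [["she", "is", "a", "nurse"],
             ["he", "is", "a", "doctor"],
             ["she", "drives", "a", "car"]]
vocab = sorted({w for s in sentences for w in s})

# Sentence x word presence matrix (cf. binarize_words_in_sentence).
presence = pd.DataFrame(
    MultiLabelBinarizer(classes=vocab).fit_transform(sentences), columns=vocab
)

word_counts = presence.sum()              # number of sentences containing each word
p_word = word_counts / word_counts.sum()  # rough stand-in for the vocab "proportion"

subgroup = "she"
p_subgroup = p_word[subgroup]
# Co-occurrence of every word with the subgroup term (cf. calc_cooccurrences).
cooc = presence[presence[subgroup] > 0].sum()

p_subgroup_g_word = cooc / word_counts        # p(subgroup | word)
pmi = np.log(p_subgroup_g_word / p_subgroup)  # cf. calc_PMI
p_word_g_subgroup = cooc / cooc.sum()         # p(word | subgroup)
npmi = pmi / -np.log(p_word_g_subgroup * p_word)  # cf. calc_nPMI

print(pd.DataFrame({"count": cooc, "pmi": pmi, "npmi": npmi}).dropna().round(3))

Words that never co-occur with the subgroup term come out as -inf or NaN, which is why the methods above finish with dropna().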
data_measurements/streamlit_utils.py ADDED
@@ -0,0 +1,483 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import statistics
16
+
17
+ import pandas as pd
18
+ import seaborn as sns
19
+ import streamlit as st
20
+ from st_aggrid import AgGrid, GridOptionsBuilder
21
+
22
+ from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
23
+
24
+
25
+ def sidebar_header():
26
+ st.sidebar.markdown(
27
+ """
28
+ This demo showcases the [dataset metrics as we develop them](https://github.com/huggingface/DataMeasurements).
29
+ Right now this has:
30
+ - dynamic loading of datasets in the lib
31
+ - fetching config and info without downloading the dataset
32
+ - proposing the list of candidate text and label features to select
33
+ We are still working on:
34
+ - implementing all the current tools
35
+ """,
36
+ unsafe_allow_html=True,
37
+ )
38
+
39
+
40
+ def sidebar_selection(ds_name_to_dict, column_id):
41
+ ds_names = list(ds_name_to_dict.keys())
42
+ with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
43
+ # choose a dataset to analyze
44
+ ds_name = st.selectbox(
45
+ f"Choose dataset to explore{column_id}:",
46
+ ds_names,
47
+ index=ds_names.index("hate_speech18"),
48
+ )
49
+ # choose a config to analyze
50
+ ds_configs = ds_name_to_dict[ds_name]
51
+ config_names = list(ds_configs.keys())
52
+ config_name = st.selectbox(
53
+ f"Choose configuration{column_id}:",
54
+ config_names,
55
+ index=0,
56
+ )
57
+ # choose a subset of num_examples
58
+ # TODO: Handling for multiple text features
59
+ ds_config = ds_configs[config_name]
60
+ text_features = ds_config[HF_FEATURE_FIELD]["string"]
61
+ # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
62
+ text_field = st.selectbox(
63
+ f"Which text feature from the{column_id} dataset would you like to analyze?",
64
+ [("text",)]
65
+ if ds_name == "c4"
66
+ else [tp for tp in text_features if tp[0] != "id"],
67
+ )
68
+ # Choose a split and dataset size
69
+ avail_splits = list(ds_config["splits"].keys())
70
+ # 12.Nov note: Removing "test" because those should not be examined
71
+ # without discussion of pros and cons, which we haven't done yet.
72
+ if "test" in avail_splits:
73
+ avail_splits.remove("test")
74
+ split = st.selectbox(
75
+ f"Which split from the{column_id} dataset would you like to analyze?",
76
+ avail_splits,
77
+ index=0,
78
+ )
79
+ label_field, label_names = (
80
+ ds_name_to_dict[ds_name][config_name][HF_FEATURE_FIELD][HF_LABEL_FIELD][0]
81
+ if len(
82
+ ds_name_to_dict[ds_name][config_name][HF_FEATURE_FIELD][HF_LABEL_FIELD]
83
+ )
84
+ > 0
85
+ else ((), [])
86
+ )
87
+ return {
88
+ "dset_name": ds_name,
89
+ "dset_config": config_name,
90
+ "split_name": split,
91
+ "text_field": text_field,
92
+ "label_field": label_field,
93
+ "label_names": label_names,
94
+ }
95
+
96
+
97
+ def expander_header(dstats, ds_name_to_dict, column_id):
98
+ with st.expander(f"Dataset Description{column_id}"):
99
+ st.markdown(
100
+ ds_name_to_dict[dstats.dset_name][dstats.dset_config][HF_DESC_FIELD]
101
+ )
102
+ st.dataframe(dstats.get_dataset_peek())
103
+
104
+
105
+ def expander_general_stats(dstats, top_n, column_id):
106
+ with st.expander(f"General Text Statistics{column_id}"):
107
+ st.caption(
108
+ "Use this widget to check whether the terms you see most represented in the dataset make sense for the goals of the dataset."
109
+ )
110
+ st.markdown(
111
+ "There are {0} total words".format(str(len(dstats.vocab_counts_df)))
112
+ )
113
+ st.markdown(
114
+ "There are {0} words after removing closed "
115
+ "class words".format(str(len(dstats.vocab_counts_filtered_df)))
116
+ )
117
+ sorted_top_vocab_df = dstats.vocab_counts_filtered_df.sort_values(
118
+ "count", ascending=False
119
+ ).head(top_n)
120
+ st.markdown(
121
+ "The most common [open class words](https://dictionary.apa.org/open-class-words) and their counts are: "
122
+ )
123
+ st.dataframe(sorted_top_vocab_df)
124
+ st.markdown(
125
+ "There are {0} missing values in the dataset.".format(
126
+ str(dstats.text_nan_count)
127
+ )
128
+ )
129
+ st.markdown(
130
+ "There are {0} duplicate items in the dataset. For more information about the duplicates, click the 'Duplicates' tab below.".format(
131
+ str(dstats.dedup_total)
132
+ )
133
+ )
134
+
135
+
136
+ ### Show the label distribution from the datasets
137
+ def expander_label_distribution(label_df, fig_labels, column_id):
138
+ with st.expander(f"Label Distribution{column_id}", expanded=False):
139
+ st.caption(
140
+ "Use this widget to see how balanced the labels in your dataset are."
141
+ )
142
+ if label_df is not None:
143
+ st.plotly_chart(fig_labels, use_container_width=True)
144
+ else:
145
+ st.markdown("No labels were found in the dataset")
146
+
147
+
148
+ def expander_text_lengths(
149
+ tokenized_df,
150
+ fig_tok_length,
151
+ avg_length,
152
+ std_length,
153
+ text_field_name,
154
+ length_field_name,
155
+ column_id,
156
+ ):
157
+ _TEXT_LENGTH_CAPTION = (
158
+ "Use this widget to identify outliers, particularly suspiciously long outliers."
159
+ )
160
+ with st.expander(f"Text Lengths{column_id}", expanded=False):
161
+ st.caption(_TEXT_LENGTH_CAPTION)
162
+ st.markdown(
163
+ "Below, you can see how the lengths of the text instances in your dataset are distributed."
164
+ )
165
+ st.markdown(
166
+ "Any unexpected peaks or valleys in the distribution may help to identify data instances you want to remove or augment."
167
+ )
168
+ st.markdown(
169
+ "### Here is the relative frequency of different text lengths in your dataset:"
170
+ )
171
+ st.plotly_chart(fig_tok_length, use_container_width=True)
172
+ data = tokenized_df[[length_field_name, text_field_name]].sort_values(
173
+ by=["length"], ascending=True
174
+ )
175
+ st.markdown(
176
+ "The average length of text instances is **"
177
+ + str(avg_length)
178
+ + " words**, with a standard deviation of **"
179
+ + str(std_length)
180
+ + "**."
181
+ )
182
+
183
+ start_id_show_lengths = st.slider(
184
+ f"Show the shortest sentences{column_id} starting at:",
185
+ 0,
186
+ len(data["length"].unique()),
187
+ value=0,
188
+ step=1,
189
+ )
190
+ st.dataframe(data[data["length"] == start_id_show_lengths].set_index("length"))
191
+
192
+
193
+ ### Third, use a sentence embedding model
194
+ def expander_text_embeddings(
195
+ text_dset, fig_tree, node_list, embeddings, text_field, column_id
196
+ ):
197
+ with st.expander(f"Text Embedding Clusters{column_id}", expanded=False):
198
+ _EMBEDDINGS_CAPTION = """
199
+ ### Hierarchical Clustering of Text Fields
200
+ Taking in the diversity of text represented in a dataset can be
201
+ challenging when it is made up of hundreds to thousands of sentences.
202
+ Grouping these text items based on a measure of similarity can help
203
+ users gain some insights into their distribution.
204
+ The following figure shows a hierarchical clustering of the text fields
205
+ in the dataset based on a
206
+ [Sentence-Transformer](https://hf.co/sentence-transformers/all-mpnet-base-v2)
207
+ model. Clusters are merged if any of the embeddings in cluster A has a
208
+ dot product with any of the embeddings or with the centroid of cluster B
209
+ higher than a threshold (one threshold per level, from 0.5 to 0.95).
210
+ To explore the clusters, you can:
211
+ - hover over a node to see the 5 most representative examples (deduplicated)
212
+ - enter an example in the text box below to see which clusters it is most similar to
213
+ - select a cluster by ID to show all of its examples
214
+ """
215
+ st.markdown(_EMBEDDINGS_CAPTION)
216
+ st.plotly_chart(fig_tree, use_container_width=True)
217
+ st.markdown("---\n")
218
+ if st.checkbox(
219
+ label="Enter text to see nearest clusters",
220
+ key=f"search_clusters_{column_id}",
221
+ ):
222
+ compare_example = st.text_area(
223
+ label="Enter some text here to see which of the clusters in the dataset it is closest to",
224
+ key=f"search_cluster_input_{column_id}",
225
+ )
226
+ if compare_example != "":
227
+ paths_to_leaves = embeddings.cached_clusters.get(
228
+ compare_example,
229
+ embeddings.find_cluster_beam(compare_example, beam_size=50),
230
+ )
231
+ clusters_intro = ""
232
+ if paths_to_leaves[0][1] < 0.3:
233
+ clusters_intro += (
234
+ "**Warning: no close clusters found (best score < 0.3).** "
235
+ )
236
+ clusters_intro += "The closest clusters to the text entered above are:"
237
+ st.markdown(clusters_intro)
238
+ for path, score in paths_to_leaves[:5]:
239
+ example = text_dset[
240
+ node_list[path[-1]]["sorted_examples_centroid"][0][0]
241
+ ][text_field][:256]
242
+ st.write(
243
+ f"Cluster {path[-1]:5d} | Score: {score:.3f} \n Example: {example}"
244
+ )
245
+ show_node_default = paths_to_leaves[0][0][-1]
246
+ else:
247
+ show_node_default = len(node_list) // 2
248
+ else:
249
+ show_node_default = len(node_list) // 2
250
+ st.markdown("---\n")
251
+ show_node = st.selectbox(
252
+ f"Choose a leaf node to explore in the{column_id} dataset:",
253
+ range(len(node_list)),
254
+ index=show_node_default,
255
+ )
256
+ node = node_list[show_node]
257
+ start_id = st.slider(
258
+ f"Show closest sentences in cluster to the centroid{column_id} starting at index:",
259
+ 0,
260
+ len(node["sorted_examples_centroid"]) - 5,
261
+ value=0,
262
+ step=5,
263
+ )
264
+ for sid, sim in node["sorted_examples_centroid"][start_id : start_id + 5]:
265
+ # only show the first 4 lines and the first 10000 characters
266
+ show_text = text_dset[sid][text_field][:10000]
267
+ show_text = "\n".join(show_text.split("\n")[:4])
268
+ st.text(f"{sim:.3f} \t {show_text}")
269
+
270
+
271
+ ### Then, show duplicates
272
+ def expander_text_duplicates(dedup_df, column_id):
273
+ with st.expander(f"Text Duplicates{column_id}", expanded=False):
274
+ st.caption(
275
+ "Use this widget to identify text strings that appear more than once."
276
+ )
277
+ st.markdown(
278
+ "A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
279
+ )
280
+ dedup_df["count"] = dedup_df["count"] + 1
281
+ st.markdown("------")
282
+ st.write(
283
+ "### Here is the list of all the duplicated items and their counts in your dataset:"
284
+ )
285
+ # Note: 1 was added to the counts above because a count of 1 looks odd for items flagged as duplicates.
286
+ if len(dedup_df) == 0:
287
+ st.write("There are no duplicates in this dataset! 🥳")
288
+ else:
289
+ gb = GridOptionsBuilder.from_dataframe(dedup_df)
290
+ gb.configure_column(
291
+ f"text{column_id}",
292
+ wrapText=True,
293
+ resizable=True,
294
+ autoHeight=True,
295
+ min_column_width=85,
296
+ use_container_width=True,
297
+ )
298
+ go = gb.build()
299
+ AgGrid(dedup_df, gridOptions=go)
300
+
301
+
302
+ def expander_npmi_description(min_vocab):
303
+ _NPMI_CAPTION = (
304
+ "Use this widget to identify problematic biases and stereotypes in your data."
305
+ )
306
+ _NPMI_CAPTION1 = """
307
+ nPMI scores for a word help to identify potentially
308
+ problematic associations, ranked by how close the association is."""
309
+ _NPMI_CAPTION2 = """
310
+ nPMI bias scores for paired words help to identify how word
311
+ associations are skewed between the selected words
312
+ ([Aka et al., 2021](https://arxiv.org/abs/2103.03417)).
313
+ """
314
+
315
+ st.caption(_NPMI_CAPTION)
316
+ st.markdown(_NPMI_CAPTION1)
317
+ st.markdown(_NPMI_CAPTION2)
318
+ st.markdown(" ")
319
+ st.markdown(
320
+ "You can select from gender and sexual orientation "
321
+ "identity terms that appear in the dataset at least %s "
322
+ "times." % min_vocab
323
+ )
324
+ st.markdown(
325
+ "The resulting ranked words are those that co-occur with both "
326
+ "identity terms. "
327
+ )
328
+ st.markdown(
329
+ "The more *positive* the score, the more associated the word is with the first identity term. "
330
+ "The more *negative* the score, the more associated the word is with the second identity term."
331
+ )
332
+
333
+
334
+ ### Finally, show Zipf stuff
335
+ def expander_zipf(z, zipf_fig, column_id):
336
+ _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
337
+ natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
338
+ calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
339
+
340
+ powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
341
+ zipf_summary = (
342
+ "The optimal alpha based on this dataset is: **"
343
+ + str(round(z.alpha, 2))
344
+ + "**, with a KS distance of: **"
345
+ + str(round(z.distance, 2))
346
+ )
347
+ zipf_summary += (
348
+ "**. This was fit with a minimum rank value of: **"
349
+ + str(int(z.xmin))
350
+ + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
351
+ )
352
+
353
+ alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
354
+ xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
355
+ fit_results_table = pd.DataFrame.from_dict(
356
+ {
357
+ r"Alpha:": [str("%.2f" % z.alpha)],
358
+ "KS distance:": [str("%.2f" % z.distance)],
359
+ "Min rank:": [str("%s" % int(z.xmin))],
360
+ },
361
+ columns=["Results"],
362
+ orient="index",
363
+ )
364
+ fit_results_table.index.name = column_id
365
+ with st.expander(
366
+ f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
367
+ ):
368
+ st.caption(
369
+ "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
370
+ )
371
+ st.markdown(_ZIPF_CAPTION)
372
+ st.write(
373
+ """
374
+ A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
375
+ with an ideal α value of 1."""
376
+ )
377
+ st.markdown(
378
+ "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relatively _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
379
+ )
380
+ st.markdown(
381
+ "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
382
+ )
383
+ st.markdown("-----")
384
+ st.write("### Here is your dataset's Zipf results:")
385
+ st.dataframe(fit_results_table)
386
+ st.write(zipf_summary)
387
+ # TODO: Nice UI version of the content in the comments.
388
+ # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
389
+ # if z.ks_test.pvalue < 0.01:
390
+ # st.markdown(
391
+ # "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
392
+ # else:
393
+ # st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
394
+ # st.markdown("Checking the goodness of fit of our observed distribution")
395
+ # st.markdown("to the hypothesized power law distribution")
396
+ # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
397
+ st.plotly_chart(zipf_fig, use_container_width=True)
398
+ if z.alpha > 2:
399
+ st.markdown(alpha_warning)
400
+ if z.xmin > 5:
401
+ st.markdown(xmin_warning)
402
+
403
+
404
+ ### Finally finally finally, show nPMI stuff.
405
+ def npmi_widget(column_id, available_terms, npmi_stats, min_vocab, use_cache=False):
406
+ """
407
+ Part of the main app, but it relies on user interaction, so it is pulled out as its own function.
408
+ :param use_cache:
409
+ :param column_id:
410
+ :param npmi_stats:
411
+ :param min_vocab:
412
+ :return:
413
+ """
414
+ with st.expander(f"Word Association{column_id}: nPMI", expanded=False):
415
+ if len(available_terms) > 0:
416
+ expander_npmi_description(min_vocab)
417
+ st.markdown("-----")
418
+ term1 = st.selectbox(
419
+ f"What is the first term you want to select?{column_id}",
420
+ available_terms,
421
+ )
422
+ term2 = st.selectbox(
423
+ f"What is the second term you want to select?{column_id}",
424
+ reversed(available_terms),
425
+ )
426
+ # We calculate/grab nPMI data based on a canonical (alphabetic)
427
+ # subgroup ordering.
428
+ subgroup_pair = sorted([term1, term2])
429
+ try:
430
+ joint_npmi_df = npmi_stats.load_or_prepare_joint_npmi(subgroup_pair)
431
+ npmi_show(joint_npmi_df)
432
+ except KeyError:
433
+ st.markdown(
434
+ "**WARNING!** The nPMI for these terms has not been pre-computed, please re-run caching."
435
+ )
436
+ else:
437
+ st.markdown(
438
+ "No words found co-occurring with both of the selected identity terms."
439
+ )
440
+
441
+
442
+ def npmi_show(paired_results):
443
+ if paired_results.empty:
444
+ st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
445
+ else:
446
+ s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
447
+ # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
448
+ s.index.name = "word"
449
+ npmi_cols = s.filter(like="npmi").columns
450
+ count_cols = s.filter(like="count").columns
451
+ # TODO: This is very different look than the duplicates table above. Should probably standardize.
452
+ cm = sns.palplot(sns.diverging_palette(270, 36, s=99, l=48, n=16))
453
+ out_df = (
454
+ s.style.background_gradient(subset=npmi_cols, cmap=cm)
455
+ .format(subset=npmi_cols, formatter="{:,.3f}")
456
+ .format(subset=count_cols, formatter=int)
457
+ .set_properties(
458
+ subset=count_cols, **{"width": "10em", "text-align": "center"}
459
+ )
460
+ .set_properties(**{"align": "center"})
461
+ .set_caption(
462
+ "nPMI scores and co-occurrence counts between the selected identity terms and the words they both co-occur with"
463
+ )
464
+ ) # s = pd.read_excel("output.xlsx", index_col="word")
465
+ st.write("### Here is your dataset's nPMI results:")
466
+ st.dataframe(out_df)
467
+
468
+
469
+ ### Dumping unused functions here for now
470
+ ### Second, show the distribution of text perplexities
471
+ def expander_text_perplexities(text_label_df, sorted_sents_loss, fig_loss):
472
+ with st.expander("Show text perplexities A", expanded=False):
473
+ st.markdown("### Text perplexities A")
474
+ st.plotly_chart(fig_loss, use_container_width=True)
475
+ start_id_show_loss = st.slider(
476
+ "Show highest perplexity sentences in A starting at index:",
477
+ 0,
478
+ text_label_df.shape[0] - 5,
479
+ value=0,
480
+ step=5,
481
+ )
482
+ for lss, sent in sorted_sents_loss[start_id_show_loss : start_id_show_loss + 5]:
483
+ st.text(f"{lss:.3f} {sent}")
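Most of these helpers follow the same Streamlit pattern: an st.expander wrapping some explanatory markdown, one or two input widgets, and a dataframe or chart. The sketch below shows that pattern in isolation on made-up data; it is not a call into the module, whose functions expect the dataset-statistics objects built elsewhere in the tool.

# Illustrative only; run with `streamlit run demo.py` (hypothetical filename).
import pandas as pd
import streamlit as st

df = pd.DataFrame(
    {"text": ["a longer example sentence", "a shorter one", "tiny"],
     "length": [4, 3, 1]}
)

with st.expander("Text Lengths (demo)", expanded=False):
    st.caption("Use this widget to identify outliers in a toy dataset.")
    min_len = st.slider("Show texts with at least this many words:",
                        0, int(df["length"].max()), 0)
    st.dataframe(df[df["length"] >= min_len].set_index("length"))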
data_measurements/zipf.py ADDED
@@ -0,0 +1,244 @@
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+ import powerlaw
20
+ import streamlit as st
21
+ from scipy.stats import ks_2samp
22
+ from scipy.stats import zipf as zipf_lib
23
+
24
+ from .dataset_utils import CNT, PROP
25
+
26
+ # treating inf values as NaN as well
27
+
28
+ pd.set_option("use_inf_as_na", True)
29
+
30
+ logs = logging.getLogger(__name__)
31
+ logs.setLevel(logging.INFO)
32
+ logs.propagate = False
33
+
34
+ if not logs.handlers:
35
+
36
+ # Logging info to log file
37
+ file = logging.FileHandler("./log_files/zipf.log")
38
+ fileformat = logging.Formatter("%(asctime)s:%(message)s")
39
+ file.setLevel(logging.INFO)
40
+ file.setFormatter(fileformat)
41
+
42
+ # Logging debug messages to stream
43
+ stream = logging.StreamHandler()
44
+ streamformat = logging.Formatter("[data_measurements_tool] %(message)s")
45
+ stream.setLevel(logging.WARNING)
46
+ stream.setFormatter(streamformat)
47
+
48
+ logs.addHandler(file)
49
+ logs.addHandler(stream)
50
+
51
+
52
+ class Zipf:
53
+ def __init__(self, vocab_counts_df=pd.DataFrame()):
54
+ self.vocab_counts_df = vocab_counts_df
55
+ self.alpha = None
56
+ self.xmin = None
57
+ self.xmax = None
58
+ self.fit = None
59
+ self.ranked_words = {}
60
+ self.uniq_counts = []
61
+ self.uniq_ranks = []
62
+ self.uniq_fit_counts = None
63
+ self.term_df = None
64
+ self.pvalue = None
65
+ self.ks_test = None
66
+ self.distance = None
67
+ self.fit = None
68
+ self.predicted_zipf_counts = None
69
+ if not self.vocab_counts_df.empty:
70
+ logs.info("Fitting based on input vocab counts.")
71
+ self.calc_fit(vocab_counts_df)
72
+ logs.info("Getting predicted counts.")
73
+ self.predicted_zipf_counts = self.calc_zipf_counts(vocab_counts_df)
74
+
75
+ def load(self, zipf_dict):
76
+ self.set_xmin(zipf_dict["xmin"])
77
+ self.set_xmax(zipf_dict["xmax"])
78
+ self.set_alpha(zipf_dict["alpha"])
79
+ self.set_ks_distance(zipf_dict["ks_distance"])
80
+ self.set_p(zipf_dict["p-value"])
81
+ self.set_unique_ranks(zipf_dict["uniq_ranks"])
82
+ self.set_unique_counts(zipf_dict["uniq_counts"])
83
+
84
+ def calc_fit(self, vocab_counts_df):
85
+ """
86
+ Uses the powerlaw package to fit the observed frequencies to a zipfian distribution.
87
+ We use the KS-distance to fit, as that seems more appropriate than MLE.
88
+ :param vocab_counts_df:
89
+ :return:
90
+ """
91
+ self.vocab_counts_df = vocab_counts_df
92
+ # TODO: These proportions may have already been calculated.
93
+ vocab_counts_df[PROP] = vocab_counts_df[CNT] / float(sum(vocab_counts_df[CNT]))
94
+ rank_column = vocab_counts_df[CNT].rank(
95
+ method="dense", numeric_only=True, ascending=False
96
+ )
97
+ vocab_counts_df["rank"] = rank_column.astype("int64")
98
+ observed_counts = vocab_counts_df[CNT].values
99
+ # Note another method for determining alpha might be defined by
100
+ # (Newman, 2005): alpha = 1 + n * sum(ln( xi / xmin )) ^ -1
101
+ self.fit = powerlaw.Fit(observed_counts, fit_method="KS", discrete=True)
102
+ # This should probably be a pmf (not pdf); using discrete=True above.
103
+ # original_data=False uses only the fitted data (within xmin and xmax).
104
+ # pdf_bin_edges: The portion of the data within the bin.
105
+ # observed_pdf: The probability density function (normalized histogram)
106
+ # of the data.
107
+ pdf_bin_edges, observed_pdf = self.fit.pdf(original_data=False)
108
+ # See the 'Distribution' class described here for info:
109
+ # https://pythonhosted.org/powerlaw/#powerlaw.Fit.pdf
110
+ theoretical_distro = self.fit.power_law
111
+ # The probability density function (normalized histogram) of the
112
+ # theoretical distribution.
113
+ predicted_pdf = theoretical_distro.pdf()
114
+ # !!!! CRITICAL VALUE FOR ZIPF !!!!
115
+ self.alpha = theoretical_distro.alpha
116
+ # Exclusive xmin: The optimal xmin *beyond which* the scaling regime of
117
+ # the power law fits best.
118
+ self.xmin = theoretical_distro.xmin
119
+ self.xmax = theoretical_distro.xmax
120
+ self.distance = theoretical_distro.KS()
121
+ self.ks_test = ks_2samp(observed_pdf, predicted_pdf)
122
+ self.pvalue = self.ks_test[1]
123
+ logs.info("KS test:")
124
+ logs.info(self.ks_test)
125
+
126
+ def set_xmax(self, xmax):
127
+ """
128
+ xmax is usually None, so we add some handling to set it as the
129
+ maximum rank in the dataset.
130
+ :param xmax:
131
+ :return:
132
+ """
133
+ if xmax:
134
+ self.xmax = int(xmax)
135
+ elif self.uniq_counts:
136
+ self.xmax = int(len(self.uniq_counts))
137
+ elif self.uniq_ranks:
138
+ self.xmax = int(len(self.uniq_ranks))
139
+
140
+ def get_xmax(self):
141
+ """
142
+ :return:
143
+ """
144
+ if not self.xmax:
145
+ self.set_xmax(self.xmax)
146
+ return self.xmax
147
+
148
+ def set_p(self, p):
149
+ self.p = int(p)
150
+
151
+ def get_p(self):
152
+ return int(self.p)
153
+
154
+ def set_xmin(self, xmin):
155
+ self.xmin = xmin
156
+
157
+ def get_xmin(self):
158
+ if self.xmin:
159
+ return int(self.xmin)
160
+ return self.xmin
161
+
162
+ def set_alpha(self, alpha):
163
+ self.alpha = float(alpha)
164
+
165
+ def get_alpha(self):
166
+ return float(self.alpha)
167
+
168
+ def set_ks_distance(self, distance):
169
+ self.distance = float(distance)
170
+
171
+ def get_ks_distance(self):
172
+ return self.distance
173
+
174
+ def calc_zipf_counts(self, vocab_counts_df):
175
+ """
176
+ The fit is based on an optimal xmin (minimum rank).
177
+ Let's use this to make count estimates for the zipf fit,
178
+ by multiplying the fitted pmf value by the sum of counts above xmin.
179
+ :return: array of count values following the fitted pmf.
180
+ """
181
+ # TODO: Limit from above xmin to below xmax, not just above xmin.
182
+ counts = vocab_counts_df[CNT]
183
+ self.uniq_counts = list(pd.unique(counts))
184
+ self.uniq_ranks = list(np.arange(1, len(self.uniq_counts) + 1))
185
+ logs.info(self.uniq_counts)
186
+ logs.info(self.xmin)
187
+ logs.info(self.xmax)
188
+ # Makes sure they are ints if not None
189
+ xmin = self.get_xmin()
190
+ xmax = self.get_xmax()
191
+ self.uniq_fit_counts = self.uniq_counts[xmin + 1 : xmax]
192
+ pmf_mass = float(sum(self.uniq_fit_counts))
193
+ zipf_counts = np.array(
194
+ [self.estimate_count(rank, pmf_mass) for rank in self.uniq_ranks]
195
+ )
196
+ return zipf_counts
197
+
198
+ def estimate_count(self, rank, pmf_mass):
199
+ return int(round(zipf_lib.pmf(rank, self.alpha) * pmf_mass))
200
+
201
+ def set_unique_ranks(self, ranks):
202
+ self.uniq_ranks = ranks
203
+
204
+ def get_unique_ranks(self):
205
+ return self.uniq_ranks
206
+
207
+ def get_unique_fit_counts(self):
208
+ return self.uniq_fit_counts
209
+
210
+ def set_unique_counts(self, counts):
211
+ self.uniq_counts = counts
212
+
213
+ def get_unique_counts(self):
214
+ return self.uniq_counts
215
+
216
+ def set_axes(self, unique_counts, unique_ranks):
217
+ self.uniq_counts = unique_counts
218
+ self.uniq_ranks = unique_ranks
219
+
220
+ # TODO: Incorporate this function (not currently using)
221
+ def fit_others(self, fit):
222
+ st.markdown(
223
+ "_Checking log likelihood ratio to see if the data is better explained by other well-behaved distributions..._"
224
+ )
225
+ # The first value returned from distribution_compare is the log likelihood ratio
226
+ better_distro = False
227
+ trunc = fit.distribution_compare("power_law", "truncated_power_law")
228
+ if trunc[0] < 0:
229
+ st.markdown("Seems a truncated power law is a better fit.")
230
+ better_distro = True
231
+
232
+ lognormal = fit.distribution_compare("power_law", "lognormal")
233
+ if lognormal[0] < 0:
234
+ st.markdown("Seems a lognormal distribution is a better fit.")
235
+ st.markdown("But don't panic -- that happens sometimes with language.")
236
+ better_distro = True
237
+
238
+ exponential = fit.distribution_compare("power_law", "exponential")
239
+ if exponential[0] < 0:
240
+ st.markdown("Seems an exponential distribution is a better fit. Panic.")
241
+ better_distro = True
242
+
243
+ if not better_distro:
244
+ st.markdown("\nSeems your data is best fit by a power law. Celebrate!!")
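As a rough illustration of what calc_fit and estimate_count do, the sketch below fits synthetic word counts with the powerlaw package, using the same fit_method="KS" and discrete=True settings, and then turns the fitted alpha back into a predicted count at a given rank via scipy's zipf pmf. The synthetic counts, the choice of rank 5, and the printed values are illustrative only.

import numpy as np
import powerlaw
from scipy.stats import zipf as zipf_lib

ranks = np.arange(1, 501)
counts = np.maximum((10_000 / ranks ** 1.1).astype(int), 1)  # synthetic Zipf-like counts

fit = powerlaw.Fit(counts, fit_method="KS", discrete=True)
alpha = fit.power_law.alpha
print(f"alpha={alpha:.2f}, xmin={fit.power_law.xmin}, "
      f"KS distance={fit.power_law.KS():.3f}")

# Predicted count at a given rank, scaled by the observed mass (cf. estimate_count).
pmf_mass = counts.sum()
print("predicted count at rank 5:", int(round(zipf_lib.pmf(5, alpha) * pmf_mass)))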