diff --git "a/fineweb_bias_eval.ipynb" "b/fineweb_bias_eval.ipynb" new file mode 100644--- /dev/null +++ "b/fineweb_bias_eval.ipynb" @@ -0,0 +1,2705 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "### Load packages" + ], + "metadata": { + "id": "utSDkGUL101i" + }, + "id": "utSDkGUL101i" + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "34299990-bd58-4fe9-99fe-15d4b6796106", + "metadata": { + "id": "34299990-bd58-4fe9-99fe-15d4b6796106", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e99e0fdc-27ee-4e6f-bc64-18f6127b9b3a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.14.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n", + "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.0.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n", + "Requirement already satisfied: 
fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2024.3.1)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n", + "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.11.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n", + "Requirement already satisfied: 
certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", + "Requirement already satisfied: datatrove in /usr/local/lib/python3.10/dist-packages (0.2.0)\n", + "Requirement already satisfied: dill>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.3.8)\n", + "Requirement already satisfied: fsspec>=2023.12.2 in /usr/local/lib/python3.10/dist-packages (from datatrove) (2024.3.1)\n", + "Requirement already satisfied: huggingface-hub>=0.17.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.23.1)\n", + "Requirement already satisfied: humanize in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.7.0)\n", + "Requirement already satisfied: loguru>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.7.2)\n", + "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.70.16)\n", + "Requirement already satisfied: numpy>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (1.25.2)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.66.4)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (3.14.0)\n", + "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) 
(24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (6.0.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (2.31.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (4.11.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2024.2.2)\n" + ] + } + ], + "source": [ + "!pip install datasets\n", + "!pip install datatrove\n", + "import datasets\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from datatrove.pipeline.readers import ParquetReader" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "922a0454", + "metadata": { + "id": "922a0454", + "outputId": "8500a12a-6856-46ac-bb65-6f86db4bb001", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The rich extension is already loaded. 
To reload it, use:\n", + " %reload_ext rich\n" + ] + } + ], + "source": [ + "%load_ext rich" + ] + }, + { + "cell_type": "markdown", + "id": "703c7781-0a33-41dc-8da9-2fa034483cad", + "metadata": { + "id": "703c7781-0a33-41dc-8da9-2fa034483cad" + }, + "source": [ + "## Methodology\n", + "\n", + "In order to measure bias in the dataset, we consider the following simple [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) based approach. The idea is that the specificity of a term -- in our case, how `biased` it is -- can be quantified as an inverse function of the number of documents in which it occurs.\n", + "\n", + "Given a dataset and terms for a subpopulation (gender) of interest:\n", + "1. Evaluate Inverse Document Frequencies on the full dataset\n", + "2. Compute the average TF-IDF vectors for the dataset for a given subpopulation (gender)\n", + "3. Sort the terms by variance to see words that are much more likely to appear specifically for a given subpopulation\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7c837c65-987f-45cf-b18d-fc7836894372", + "metadata": { + "id": "7c837c65-987f-45cf-b18d-fc7836894372" + }, + "source": [ + "### Load Fineweb\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbd19018", + "metadata": { + "id": "dbd19018", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2852efb2-954f-460f-d143-18baa0408973" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\u001b[32m2024-05-29 19:38:01.457\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdatatrove.pipeline.readers.base\u001b[0m:\u001b[36mread_files_shard\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mReading input file 000_00000.parquet\u001b[0m\n" + ] + } + ], + "source": [ + "local = False\n", + "# Cap how many documents are streamed: every text is kept in a Python list and\n", + "# then vectorised, so reading the whole sample/10BT subset is unlikely to fit in\n", + "# memory. Set MAX_DOCS = -1 to read everything.\n", + "MAX_DOCS = 10_000\n", + "data_reader = ParquetReader(\n", + "    \"hf://datasets/HuggingFaceFW/fineweb/sample/10BT\", limit=MAX_DOCS\n", + ")\n", + "all_docs = [document.text for document in data_reader()]" + ] + }, + { + "cell_type": "markdown", + "source": 
[ + "### Compute frequencies" + ], + "metadata": { + "id": "eBj1TtiW2C-6" + }, + "id": "eBj1TtiW2C-6" + }, + { + "cell_type": "code", + "source": [ + "# Step 1: get Inverse document frequencies for the dataset\n", + "vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')\n", + "full_tfidf = vectorizer.fit_transform(all_docs)\n", + "tfidf_feature_names = np.array(vectorizer.get_feature_names_out())" + ], + "metadata": { + "id": "e_nQogiWceYZ" + }, + "id": "e_nQogiWceYZ", + "execution_count": 50, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Bias analysis: Gender tf-idf" + ], + "metadata": { + "id": "aqIybwilj0KH" + }, + "id": "aqIybwilj0KH" + }, + { + "cell_type": "code", + "source": [ + "# Step 2: get average TF-IDF vectors **for each gender**\n", + "# NOTE: a document is selected only when the phrase appears as an exact,\n", + "# case-sensitive, whitespace-delimited token (\"Man\" or \"man,\" do not match).\n", + "GENDER_PHRASES = [\"man\", \"woman\"]\n", + "tfidf_by_gender = {}\n", + "for phrase in GENDER_PHRASES:\n", + "    gdr_docs = [doc for doc in all_docs if phrase in doc.split()]\n", + "    if gdr_docs:  # phrases that match no document are skipped\n", + "        # mean over the selected documents -> one average score per vocabulary term\n", + "        gdr_tfidf = np.asarray(vectorizer.transform(gdr_docs).mean(axis=0))[0]\n", + "        tfidf_by_gender[phrase] = gdr_tfidf" + ], + "metadata": { + "id": "d-Na79jvczt0" + }, + "id": "d-Na79jvczt0", + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Step 3: for each term, measure how much its score varies across genders\n", + "all_tfidf = np.array(list(tfidf_by_gender.values()))\n", + "# Centre each term on its MEAN across genders. Subtracting the sum instead leaves\n", + "# every row holding the negated scores of the other genders, so the ranking would\n", + "# follow overall term weight rather than the gap between genders.\n", + "tf_idf_centered = all_tfidf - all_tfidf.mean(axis=0, keepdims=True)\n", + "# Root of the summed squared deviations: orders terms exactly like the variance\n", + "tf_idf_var = np.sqrt((tf_idf_centered * tf_idf_centered).sum(axis=0))\n", + "sort_by_variance = tf_idf_var.argsort()[::-1]" + ], + "metadata": { + "id": "D0sbbLyWw2CZ" + }, + "id": "D0sbbLyWw2CZ", + "execution_count": 52, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "03393fe5-2a92-451a-bd08-6a27a6239097", + "metadata": { + "id": "03393fe5-2a92-451a-bd08-6a27a6239097" + }, + "outputs": [], + "source": [ + "# Create the data structure for the visualization,\n", + "# showing 
the highest variance words for each gender,\n", + "# and how they deviate from the mean\n", + "# Columns are built from the labels in tfidf_by_gender rather than fixed row\n", + "# numbers, so every score stays attached to the phrase it was computed for.\n", + "pre_pandas_lines = [\n", + "    {\n", + "        \"word\": tfidf_feature_names[w],\n", + "        # raw average TF-IDF score of the term for each gender\n", + "        **{phrase: scores[w] for phrase, scores in tfidf_by_gender.items()},\n", + "        # signed deviation of each gender from the cross-gender mean\n", + "        **{\n", + "            f\"{phrase}+\": scores[w] - all_tfidf[:, w].mean()\n", + "            for phrase, scores in tfidf_by_gender.items()\n", + "        },\n", + "        \"variance\": tf_idf_var[w],\n", + "        \"total\": all_tfidf[:, w].sum(),\n", + "    }\n", + "    for w in sort_by_variance[:50]\n", + "]" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Results" + ], + "metadata": { + "id": "IhJC-iT91smy" + }, + "id": "IhJC-iT91smy" + }, + { + "cell_type": "code", + "source": [ + "# Plot\n", + "# pre_pandas_lines is a list of row dicts, which the DataFrame constructor takes directly\n", + "df = pd.DataFrame(pre_pandas_lines)\n", + "df.style.background_gradient(\n", + "    axis=None,\n", + "    vmin=0,\n", + "    vmax=0.2,\n", + "    cmap=\"YlGnBu\"\n", + ").format(precision=2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "LDLjFa6HdMWe", + "outputId": "d012172a-4c03-4505-83c6-7bd6c3c77a91" + }, + "id": "LDLjFa6HdMWe", + "execution_count": 47, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [], + "text/html": [ + "
\n"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eb340\u001b[0m\u001b[1m>\u001b[0m"
+            ],
+            "text/html": [
+              "\n",
+              "\n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "
 wordmanwomanman+woman+variancetotal
0woman0.010.07-0.030.030.070.08
1man0.050.020.01-0.010.050.07
2women0.010.04-0.010.010.040.06
3said0.030.010.01-0.010.030.05
4people0.020.020.00-0.000.030.04
5tsa0.010.03-0.010.010.030.04
6life0.030.010.01-0.010.030.04
7just0.020.020.00-0.000.030.04
8police0.020.020.00-0.000.030.04
9god0.020.020.00-0.000.030.04
10like0.020.020.00-0.000.030.04
11cancer0.000.03-0.010.010.030.03
12marriage0.020.02-0.000.000.030.04
13time0.020.020.00-0.000.030.04
14mouse0.000.03-0.010.010.030.03
15rudy0.010.02-0.010.010.020.03
16gangnam0.010.02-0.010.010.020.03
17medical0.000.02-0.010.010.020.03
18world0.010.02-0.000.000.020.03
19work0.010.02-0.000.000.020.03
20make0.020.010.00-0.000.020.03
21think0.020.010.00-0.000.020.03
22palin0.010.02-0.000.000.020.03
23john0.010.02-0.000.000.020.03
24surgery0.000.02-0.010.010.020.02
25anderson0.000.02-0.010.010.020.02
26day0.010.010.00-0.000.020.03
27gregory0.010.02-0.000.000.020.03
28st0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
30says0.010.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
32use0.010.02-0.010.010.020.03
33plus0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
35year0.010.010.00-0.000.020.03
36don0.010.01-0.000.000.020.03
37died0.010.02-0.010.010.020.02
38left0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
40white0.000.02-0.010.010.020.02
41right0.020.010.00-0.000.020.03
42wife0.010.02-0.000.000.020.02
43sir0.010.02-0.000.000.020.03
44way0.020.010.00-0.000.020.03
45great0.010.010.00-0.000.020.03
46city0.010.01-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
48camera0.000.02-0.010.010.020.02
49place0.020.010.00-0.000.020.02
\n" + ] + }, + "metadata": {}, + "execution_count": 47 + } + ] + }, + { + "cell_type": "markdown", + "id": "e273abff-3d81-431f-9188-82d87d1ecda2", + "metadata": { + "id": "e273abff-3d81-431f-9188-82d87d1ecda2" + }, + "source": [ + "#### Sorting by bias\n", + "\n", + "In order to better surface biases, we can sort the table by how much one gender over-represents a term.\n", + "\n", + "In this case, we see that instances mentioning `man` are more likely to include `god` than those mentioning `woman`, which in turn are more likely to include `cancer`." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "34229f06-5bf7-4ece-b43e-7d453931abd4", + "metadata": { + "id": "34229f06-5bf7-4ece-b43e-7d453931abd4", + "outputId": "7720b46d-a37d-4007-aa8e-8d7973f4f91c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "collapsed": true + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [], + "text/html": [ + "
\n"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eac20\u001b[0m\u001b[1m>\u001b[0m"
+            ],
+            "text/html": [
+              "\n",
+              "\n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "
 wordmanwomanman+woman+variancetotal
1man0.050.020.01-0.010.050.07
3said0.030.010.01-0.010.030.05
6life0.030.010.01-0.010.030.04
9god0.020.020.00-0.000.030.04
7just0.020.020.00-0.000.030.04
10like0.020.020.00-0.000.030.04
44way0.020.010.00-0.000.020.03
21think0.020.010.00-0.000.020.03
49place0.020.010.00-0.000.020.02
41right0.020.010.00-0.000.020.03
13time0.020.020.00-0.000.030.04
35year0.010.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
20make0.020.010.00-0.000.020.03
4people0.020.020.00-0.000.030.04
8police0.020.020.00-0.000.030.04
26day0.010.010.00-0.000.020.03
30says0.010.010.00-0.000.020.03
45great0.010.010.00-0.000.020.03
46city0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
36don0.010.01-0.000.000.020.03
28st0.010.02-0.000.000.020.03
38left0.010.01-0.000.000.020.03
23john0.010.02-0.000.000.020.03
18world0.010.02-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
43sir0.010.02-0.000.000.020.03
12marriage0.020.02-0.000.000.030.04
19work0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
27gregory0.010.02-0.000.000.020.03
22palin0.010.02-0.000.000.020.03
42wife0.010.02-0.000.000.020.02
16gangnam0.010.02-0.010.010.020.03
15rudy0.010.02-0.010.010.020.03
32use0.010.02-0.010.010.020.03
37died0.010.02-0.010.010.020.02
5tsa0.010.03-0.010.010.030.04
40white0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
48camera0.000.02-0.010.010.020.02
33plus0.000.02-0.010.010.020.02
17medical0.000.02-0.010.010.020.03
24surgery0.000.02-0.010.010.020.02
25anderson0.000.02-0.010.010.020.02
14mouse0.000.03-0.010.010.030.03
2women0.010.04-0.010.010.040.06
11cancer0.000.03-0.010.010.030.03
0woman0.010.07-0.030.030.070.08
\n" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ], + "source": [ + "df.sort_values('man+', ascending=False).style.background_gradient(\n", + " axis=None,\n", + " vmin=0,\n", + " vmax=0.2,\n", + " cmap=\"YlGnBu\"\n", + ").format(precision=2)" + ] + }, + { + "cell_type": "code", + "source": [ + "df.sort_values('woman+', ascending=False).style.background_gradient(\n", + " axis=None,\n", + " vmin=0,\n", + " vmax=0.2,\n", + " cmap=\"YlGnBu\"\n", + ").format(precision=2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ufATwOCojOdv", + "outputId": "299fdb81-a754-4afe-b0fd-5be8aac8c549" + }, + "id": "ufATwOCojOdv", + "execution_count": 46, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [], + "text/html": [ + "
\n"
+            ]
+          },
+          "metadata": {}
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eab60\u001b[0m\u001b[1m>\u001b[0m"
+            ],
+            "text/html": [
+              "\n",
+              "\n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "  \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "    \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "      \n",
+              "    \n",
+              "  \n",
+              "
 wordmanwomanman+woman+variancetotal
0woman0.010.07-0.030.030.070.08
11cancer0.000.03-0.010.010.030.03
2women0.010.04-0.010.010.040.06
14mouse0.000.03-0.010.010.030.03
25anderson0.000.02-0.010.010.020.02
24surgery0.000.02-0.010.010.020.02
17medical0.000.02-0.010.010.020.03
33plus0.000.02-0.010.010.020.02
48camera0.000.02-0.010.010.020.02
34size0.000.02-0.010.010.020.02
40white0.000.02-0.010.010.020.02
5tsa0.010.03-0.010.010.030.04
37died0.010.02-0.010.010.020.02
32use0.010.02-0.010.010.020.03
15rudy0.010.02-0.010.010.020.03
16gangnam0.010.02-0.010.010.020.03
42wife0.010.02-0.000.000.020.02
22palin0.010.02-0.000.000.020.03
27gregory0.010.02-0.000.000.020.03
29hermit0.010.02-0.000.000.020.03
19work0.010.02-0.000.000.020.03
12marriage0.020.02-0.000.000.030.04
43sir0.010.02-0.000.000.020.03
47korean0.010.01-0.000.000.020.03
18world0.010.02-0.000.000.020.03
23john0.010.02-0.000.000.020.03
38left0.010.01-0.000.000.020.03
28st0.010.02-0.000.000.020.03
36don0.010.01-0.000.000.020.03
39did0.010.01-0.000.000.020.03
46city0.010.01-0.000.000.020.03
45great0.010.010.00-0.000.020.03
30says0.010.010.00-0.000.020.03
26day0.010.010.00-0.000.020.03
8police0.020.020.00-0.000.030.04
4people0.020.020.00-0.000.030.04
20make0.020.010.00-0.000.020.03
31know0.010.010.00-0.000.020.03
35year0.010.010.00-0.000.020.03
13time0.020.020.00-0.000.030.04
41right0.020.010.00-0.000.020.03
49place0.020.010.00-0.000.020.02
21think0.020.010.00-0.000.020.03
44way0.020.010.00-0.000.020.03
10like0.020.020.00-0.000.030.04
7just0.020.020.00-0.000.030.04
9god0.020.020.00-0.000.030.04
6life0.030.010.01-0.010.030.04
3said0.030.010.01-0.010.030.05
1man0.050.020.01-0.010.050.07
\n" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "colab": { + "provenance": [] + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file