{
"cells": [
{
"cell_type": "markdown",
"source": [
"### Load packages"
],
"metadata": {
"id": "utSDkGUL101i"
},
"id": "utSDkGUL101i"
},
{
"cell_type": "code",
"execution_count": 48,
"id": "34299990-bd58-4fe9-99fe-15d4b6796106",
"metadata": {
"id": "34299990-bd58-4fe9-99fe-15d4b6796106",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "e99e0fdc-27ee-4e6f-bc64-18f6127b9b3a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.19.1)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.14.0)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n",
"Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (14.0.2)\n",
"Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.0.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.4.1)\n",
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
"Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2024.3.1)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n",
"Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.1)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2024.2.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
"Requirement already satisfied: datatrove in /usr/local/lib/python3.10/dist-packages (0.2.0)\n",
"Requirement already satisfied: dill>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.3.8)\n",
"Requirement already satisfied: fsspec>=2023.12.2 in /usr/local/lib/python3.10/dist-packages (from datatrove) (2024.3.1)\n",
"Requirement already satisfied: huggingface-hub>=0.17.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.23.1)\n",
"Requirement already satisfied: humanize in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.7.0)\n",
"Requirement already satisfied: loguru>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.7.2)\n",
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datatrove) (0.70.16)\n",
"Requirement already satisfied: numpy>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from datatrove) (1.25.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from datatrove) (4.66.4)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (3.14.0)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (24.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (6.0.1)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (2.31.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.17.0->datatrove) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.17.0->datatrove) (2024.2.2)\n"
]
}
],
"source": [
"!pip install datasets\n",
"!pip install datatrove\n",
"import datasets\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from datatrove.pipeline.readers import ParquetReader"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "922a0454",
"metadata": {
"id": "922a0454",
"outputId": "8500a12a-6856-46ac-bb65-6f86db4bb001",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The rich extension is already loaded. To reload it, use:\n",
" %reload_ext rich\n"
]
}
],
"source": [
"%load_ext rich"
]
},
{
"cell_type": "markdown",
"id": "703c7781-0a33-41dc-8da9-2fa034483cad",
"metadata": {
"id": "703c7781-0a33-41dc-8da9-2fa034483cad"
},
"source": [
"## Methodology\n",
"\n",
"In order to measure bias in the dataset, we consider the following simple [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) based approach. The idea is that the specificity of a term -- in our case, how `biased` it is -- can be quantified as an inverse function of the number of documents in which it occurs.\n",
"\n",
"Given a dataset and terms for a subpopulation (gender) of interest:\n",
"1. Evaluate Inverse Document Frequencies on the full dataset\n",
"2. Compute the average TF-IDF vectors for the dataset for a given subpopulation (gender)\n",
"3. Sort the terms by variance to see words that are much more likely to appear specifically for a given subpopulation\n",
"\n",
"\n"
]
},
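{
"cell_type": "markdown",
"id": "idf-toy-note",
"metadata": {},
"source": [
"#### A toy IDF example\n",
"\n",
"To make the intuition concrete, here is a minimal sketch on a three-document toy corpus (the corpus and variable names are purely illustrative): scikit-learn's `TfidfVectorizer` assigns a lower IDF to `movie`, which occurs in every document, than to `great`, which occurs in only one."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "idf-toy-code",
"metadata": {},
"outputs": [],
"source": [
"# Toy example: IDF is an inverse function of document frequency\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"toy_docs = [\n",
"    \"the movie was great\",\n",
"    \"the movie was terrible\",\n",
"    \"the movie felt long\",\n",
"]\n",
"toy_vectorizer = TfidfVectorizer()\n",
"toy_vectorizer.fit(toy_docs)\n",
"# 'movie' appears in all 3 documents -> low IDF; 'great' in only 1 -> high IDF\n",
"for term in [\"movie\", \"great\"]:\n",
"    print(term, toy_vectorizer.idf_[toy_vectorizer.vocabulary_[term]])"
]
},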
{
"cell_type": "markdown",
"id": "7c837c65-987f-45cf-b18d-fc7836894372",
"metadata": {
"id": "7c837c65-987f-45cf-b18d-fc7836894372"
},
"source": [
"### Load Fineweb\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbd19018",
"metadata": {
"id": "dbd19018",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2852efb2-954f-460f-d143-18baa0408973"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[32m2024-05-29 19:38:01.457\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mdatatrove.pipeline.readers.base\u001b[0m:\u001b[36mread_files_shard\u001b[0m:\u001b[36m193\u001b[0m - \u001b[1mReading input file 000_00000.parquet\u001b[0m\n"
]
}
],
"source": [
"local = False\n",
"data_reader = ParquetReader(\"hf://datasets/HuggingFaceFW/fineweb/sample/10BT\")\n",
"all_docs = [document.text for document in data_reader()]"
]
},
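{
"cell_type": "markdown",
"id": "fineweb-limit-note",
"metadata": {},
"source": [
"Streaming the full 10BT sample into a list takes substantial time and memory. For quicker experiments, datatrove readers accept a `limit` argument capping the number of documents read; a minimal sketch (the 10,000-document cap is an arbitrary illustrative choice):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fineweb-limit-code",
"metadata": {},
"outputs": [],
"source": [
"# Optional: cap the number of documents for faster experimentation\n",
"sample_reader = ParquetReader(\n",
"    \"hf://datasets/HuggingFaceFW/fineweb/sample/10BT\",\n",
"    limit=10_000,  # read at most 10k documents\n",
")\n",
"sample_docs = [document.text for document in sample_reader()]"
]
},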
{
"cell_type": "markdown",
"source": [
"### Compute frequencies"
],
"metadata": {
"id": "eBj1TtiW2C-6"
},
"id": "eBj1TtiW2C-6"
},
{
"cell_type": "code",
"source": [
"# Step 1: get Inverse document frequencies for the dataset\n",
"vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')\n",
"full_tfidf = vectorizer.fit_transform(all_docs)\n",
"tfidf_feature_names = np.array(vectorizer.get_feature_names_out())"
],
"metadata": {
"id": "e_nQogiWceYZ"
},
"id": "e_nQogiWceYZ",
"execution_count": 50,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Bias analysis: Gender tf-idf"
],
"metadata": {
"id": "aqIybwilj0KH"
},
"id": "aqIybwilj0KH"
},
{
"cell_type": "code",
"source": [
"# Step 2: get average TF-IDF vectors **for each gender**\n",
"GENDER_PHRASES = [\"man\", \"woman\"]\n",
"tfidf_by_gender = {}\n",
"for phrase in GENDER_PHRASES:\n",
" gdr_docs = [doc for doc in all_docs if phrase in doc.split()]\n",
" if gdr_docs != []:\n",
" gdr_tfidf = np.asarray(vectorizer.transform(gdr_docs).mean(axis=0))[0]\n",
" tfidf_by_gender[phrase] = gdr_tfidf"
],
"metadata": {
"id": "d-Na79jvczt0"
},
"id": "d-Na79jvczt0",
"execution_count": 51,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Step 3: for each term, compute the variance across genders\n",
"all_tfidf = np.array(list(tfidf_by_gender.values()))\n",
"tf_idf_var = all_tfidf - all_tfidf.sum(axis=0, keepdims=True)\n",
"tf_idf_var = np.power((tf_idf_var * tf_idf_var).sum(axis=0), 0.5)\n",
"sort_by_variance = tf_idf_var.argsort()[::-1]"
],
"metadata": {
"id": "D0sbbLyWw2CZ"
},
"id": "D0sbbLyWw2CZ",
"execution_count": 52,
"outputs": []
},
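{
"cell_type": "markdown",
"id": "variance-score-note",
"metadata": {},
"source": [
"Concretely, writing $s_g$ for the average TF-IDF score of a term for gender $g$ and $T = \\sum_g s_g$ for the column total, the cell above computes $\\sqrt{\\sum_g (s_g - T)^2}$. For the two groups used here this reduces to $\\sqrt{s_{\\text{man}}^2 + s_{\\text{woman}}^2}$, the L2 norm of the per-gender scores, so terms that score highly for either gender are ranked first."
]
},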
{
"cell_type": "code",
"execution_count": 53,
"id": "03393fe5-2a92-451a-bd08-6a27a6239097",
"metadata": {
"id": "03393fe5-2a92-451a-bd08-6a27a6239097"
},
"outputs": [],
"source": [
"# Create the data structure for the visualization,\n",
"# showing the highest variance words for each gender,\n",
"# and how they deviate from the mean\n",
"pre_pandas_lines = [\n",
" {\n",
" \"word\": tfidf_feature_names[w],\n",
" \"man\": all_tfidf[0, w],\n",
" \"woman\": all_tfidf[1, w],\n",
" \"man+\": all_tfidf[0, w] - all_tfidf[:, w].mean(),\n",
" \"woman+\": all_tfidf[1, w] - all_tfidf[:, w].mean(),\n",
" \"variance\": tf_idf_var[w],\n",
" \"total\": all_tfidf[:, w].sum(),\n",
" }\n",
" for w in sort_by_variance[:50]\n",
"]"
]
},
{
"cell_type": "markdown",
"source": [
"### Results"
],
"metadata": {
"id": "IhJC-iT91smy"
},
"id": "IhJC-iT91smy"
},
{
"cell_type": "code",
"source": [
"# Plot\n",
"df = pd.DataFrame.from_dict(pre_pandas_lines)\n",
"df.style.background_gradient(\n",
" axis=None,\n",
" vmin=0,\n",
" vmax=0.2,\n",
" cmap=\"YlGnBu\"\n",
").format(precision=2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "LDLjFa6HdMWe",
"outputId": "d012172a-4c03-4505-83c6-7bd6c3c77a91"
},
"id": "LDLjFa6HdMWe",
"execution_count": 47,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [],
"text/html": [
"
\n"
]
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eb340\u001b[0m\u001b[1m>\u001b[0m"
],
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" word | \n",
" man | \n",
" woman | \n",
" man+ | \n",
" woman+ | \n",
" variance | \n",
" total | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" woman | \n",
" 0.01 | \n",
" 0.07 | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.07 | \n",
" 0.08 | \n",
"
\n",
" \n",
" 1 | \n",
" man | \n",
" 0.05 | \n",
" 0.02 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.05 | \n",
" 0.07 | \n",
"
\n",
" \n",
" 2 | \n",
" women | \n",
" 0.01 | \n",
" 0.04 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.04 | \n",
" 0.06 | \n",
"
\n",
" \n",
" 3 | \n",
" said | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.05 | \n",
"
\n",
" \n",
" 4 | \n",
" people | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 5 | \n",
" tsa | \n",
" 0.01 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 6 | \n",
" life | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 7 | \n",
" just | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 8 | \n",
" police | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 9 | \n",
" god | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 10 | \n",
" like | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 11 | \n",
" cancer | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 12 | \n",
" marriage | \n",
" 0.02 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 13 | \n",
" time | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 14 | \n",
" mouse | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 15 | \n",
" rudy | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 16 | \n",
" gangnam | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 17 | \n",
" medical | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 18 | \n",
" world | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 19 | \n",
" work | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 20 | \n",
" make | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 21 | \n",
" think | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 22 | \n",
" palin | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 23 | \n",
" john | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 24 | \n",
" surgery | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 25 | \n",
" anderson | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 26 | \n",
" day | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 27 | \n",
" gregory | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 28 | \n",
" st | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 29 | \n",
" hermit | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 30 | \n",
" says | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 31 | \n",
" know | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 32 | \n",
" use | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 33 | \n",
" plus | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 34 | \n",
" size | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 35 | \n",
" year | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 36 | \n",
" don | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 37 | \n",
" died | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 38 | \n",
" left | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 39 | \n",
" did | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 40 | \n",
" white | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 41 | \n",
" right | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 42 | \n",
" wife | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 43 | \n",
" sir | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 44 | \n",
" way | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 45 | \n",
" great | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 46 | \n",
" city | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 47 | \n",
" korean | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 48 | \n",
" camera | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 49 | \n",
" place | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
"
\n"
]
},
"metadata": {},
"execution_count": 47
}
]
},
{
"cell_type": "markdown",
"id": "e273abff-3d81-431f-9188-82d87d1ecda2",
"metadata": {
"id": "e273abff-3d81-431f-9188-82d87d1ecda2"
},
"source": [
"#### Sorting by bias\n",
"\n",
"In order to better surface biases, we can sort the table by how much one gender over-represents a term.\n",
"\n",
"In this case, we see that instances mentioning `man` are more likely to include `god` than those mentioning `woman`, which in turn are more likely to include `cancer`."
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "34229f06-5bf7-4ece-b43e-7d453931abd4",
"metadata": {
"id": "34229f06-5bf7-4ece-b43e-7d453931abd4",
"outputId": "7720b46d-a37d-4007-aa8e-8d7973f4f91c",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"collapsed": true
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [],
"text/html": [
"\n"
]
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eac20\u001b[0m\u001b[1m>\u001b[0m"
],
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" word | \n",
" man | \n",
" woman | \n",
" man+ | \n",
" woman+ | \n",
" variance | \n",
" total | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" man | \n",
" 0.05 | \n",
" 0.02 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.05 | \n",
" 0.07 | \n",
"
\n",
" \n",
" 3 | \n",
" said | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.05 | \n",
"
\n",
" \n",
" 6 | \n",
" life | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 9 | \n",
" god | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 7 | \n",
" just | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 10 | \n",
" like | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 44 | \n",
" way | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 21 | \n",
" think | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 49 | \n",
" place | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 41 | \n",
" right | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 13 | \n",
" time | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 35 | \n",
" year | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 31 | \n",
" know | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 20 | \n",
" make | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 4 | \n",
" people | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 8 | \n",
" police | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 26 | \n",
" day | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 30 | \n",
" says | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 45 | \n",
" great | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 46 | \n",
" city | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 39 | \n",
" did | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 36 | \n",
" don | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 28 | \n",
" st | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 38 | \n",
" left | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 23 | \n",
" john | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 18 | \n",
" world | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 47 | \n",
" korean | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 43 | \n",
" sir | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 12 | \n",
" marriage | \n",
" 0.02 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 19 | \n",
" work | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 29 | \n",
" hermit | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 27 | \n",
" gregory | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 22 | \n",
" palin | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 42 | \n",
" wife | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 16 | \n",
" gangnam | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 15 | \n",
" rudy | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 32 | \n",
" use | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 37 | \n",
" died | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 5 | \n",
" tsa | \n",
" 0.01 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 40 | \n",
" white | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 34 | \n",
" size | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 48 | \n",
" camera | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 33 | \n",
" plus | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 17 | \n",
" medical | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 24 | \n",
" surgery | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 25 | \n",
" anderson | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 14 | \n",
" mouse | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 2 | \n",
" women | \n",
" 0.01 | \n",
" 0.04 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.04 | \n",
" 0.06 | \n",
"
\n",
" \n",
" 11 | \n",
" cancer | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 0 | \n",
" woman | \n",
" 0.01 | \n",
" 0.07 | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.07 | \n",
" 0.08 | \n",
"
\n",
" \n",
"
\n"
]
},
"metadata": {},
"execution_count": 45
}
],
"source": [
"df.sort_values('man+', ascending=False).style.background_gradient(\n",
" axis=None,\n",
" vmin=0,\n",
" vmax=0.2,\n",
" cmap=\"YlGnBu\"\n",
").format(precision=2)"
]
},
{
"cell_type": "code",
"source": [
"df.sort_values('woman+', ascending=False).style.background_gradient(\n",
" axis=None,\n",
" vmin=0,\n",
" vmax=0.2,\n",
" cmap=\"YlGnBu\"\n",
").format(precision=2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "ufATwOCojOdv",
"outputId": "299fdb81-a754-4afe-b0fd-5be8aac8c549"
},
"id": "ufATwOCojOdv",
"execution_count": 46,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [],
"text/html": [
"\n"
]
},
"metadata": {}
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\u001b[1m<\u001b[0m\u001b[1;95mpandas.io.formats.style.Styler\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x7b89700eab60\u001b[0m\u001b[1m>\u001b[0m"
],
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" word | \n",
" man | \n",
" woman | \n",
" man+ | \n",
" woman+ | \n",
" variance | \n",
" total | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" woman | \n",
" 0.01 | \n",
" 0.07 | \n",
" -0.03 | \n",
" 0.03 | \n",
" 0.07 | \n",
" 0.08 | \n",
"
\n",
" \n",
" 11 | \n",
" cancer | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 2 | \n",
" women | \n",
" 0.01 | \n",
" 0.04 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.04 | \n",
" 0.06 | \n",
"
\n",
" \n",
" 14 | \n",
" mouse | \n",
" 0.00 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 25 | \n",
" anderson | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 24 | \n",
" surgery | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 17 | \n",
" medical | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 33 | \n",
" plus | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 48 | \n",
" camera | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 34 | \n",
" size | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 40 | \n",
" white | \n",
" 0.00 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 5 | \n",
" tsa | \n",
" 0.01 | \n",
" 0.03 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 37 | \n",
" died | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 32 | \n",
" use | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 15 | \n",
" rudy | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 16 | \n",
" gangnam | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.01 | \n",
" 0.01 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 42 | \n",
" wife | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 22 | \n",
" palin | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 27 | \n",
" gregory | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 29 | \n",
" hermit | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 19 | \n",
" work | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 12 | \n",
" marriage | \n",
" 0.02 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 43 | \n",
" sir | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 47 | \n",
" korean | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 18 | \n",
" world | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 23 | \n",
" john | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 38 | \n",
" left | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 28 | \n",
" st | \n",
" 0.01 | \n",
" 0.02 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 36 | \n",
" don | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 39 | \n",
" did | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 46 | \n",
" city | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.00 | \n",
" 0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 45 | \n",
" great | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 30 | \n",
" says | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 26 | \n",
" day | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 8 | \n",
" police | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 4 | \n",
" people | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 20 | \n",
" make | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 31 | \n",
" know | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 35 | \n",
" year | \n",
" 0.01 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 13 | \n",
" time | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 41 | \n",
" right | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 49 | \n",
" place | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.02 | \n",
"
\n",
" \n",
" 21 | \n",
" think | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 44 | \n",
" way | \n",
" 0.02 | \n",
" 0.01 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.02 | \n",
" 0.03 | \n",
"
\n",
" \n",
" 10 | \n",
" like | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 7 | \n",
" just | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 9 | \n",
" god | \n",
" 0.02 | \n",
" 0.02 | \n",
" 0.00 | \n",
" -0.00 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 6 | \n",
" life | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.04 | \n",
"
\n",
" \n",
" 3 | \n",
" said | \n",
" 0.03 | \n",
" 0.01 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.03 | \n",
" 0.05 | \n",
"
\n",
" \n",
" 1 | \n",
" man | \n",
" 0.05 | \n",
" 0.02 | \n",
" 0.01 | \n",
" -0.01 | \n",
" 0.05 | \n",
" 0.07 | \n",
"
\n",
" \n",
"
\n"
]
},
"metadata": {},
"execution_count": 46
}
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 5
}