{ "cells": [ { "cell_type": "markdown", "id": "3c5dfca2", "metadata": { "papermill": { "duration": 0.0108, "end_time": "2023-05-02T09:00:46.805433", "exception": false, "start_time": "2023-05-02T09:00:46.794633", "status": "completed" }, "tags": [] }, "source": [ "# Exploratory Data Analysis for Jigsaw Multilingual Toxic Comment Classification" ] }, { "cell_type": "code", "execution_count": 1, "id": "2a536ac1", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-05-02T09:00:46.826342Z", "iopub.status.busy": "2023-05-02T09:00:46.825207Z", "iopub.status.idle": "2023-05-02T09:00:48.359659Z", "shell.execute_reply": "2023-05-02T09:00:48.358529Z" }, "papermill": { "duration": 1.547699, "end_time": "2023-05-02T09:00:48.362424", "exception": false, "start_time": "2023-05-02T09:00:46.814725", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Import the Libraries\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import random\n", "import re\n", "import string\n", "import nltk\n", "from nltk import FreqDist\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from collections import Counter\n", "from textblob import TextBlob" ] }, { "cell_type": "code", "execution_count": 2, "id": "616896f4", "metadata": { "execution": { "iopub.execute_input": "2023-05-02T09:00:48.385648Z", "iopub.status.busy": "2023-05-02T09:00:48.385290Z", "iopub.status.idle": "2023-05-02T09:00:48.483219Z", "shell.execute_reply": "2023-05-02T09:00:48.481238Z" }, "papermill": { "duration": 0.112844, "end_time": "2023-05-02T09:00:48.487024", "exception": false, "start_time": "2023-05-02T09:00:48.374180", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package vader_lexicon to\n", 
"[nltk_data] /usr/share/nltk_data...\n", "[nltk_data] Package vader_lexicon is already up-to-date!\n", "[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /usr/share/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n", " warnings.warn(\"The twython library has not been installed. \"\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download('vader_lexicon')\n", "from nltk.sentiment import SentimentIntensityAnalyzer\n", "\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')" ] }, { "cell_type": "markdown", "id": "4d593182", "metadata": { "papermill": { "duration": 0.009049, "end_time": "2023-05-02T09:00:48.505618", "exception": false, "start_time": "2023-05-02T09:00:48.496569", "status": "completed" }, "tags": [] }, "source": [ "# Load the Dataset" ] }, { "cell_type": "code", "execution_count": 3, "id": "7c52e882", "metadata": { "execution": { "iopub.execute_input": "2023-05-02T09:00:48.525521Z", "iopub.status.busy": "2023-05-02T09:00:48.525178Z", "iopub.status.idle": "2023-05-02T09:02:27.840800Z", "shell.execute_reply": "2023-05-02T09:02:27.839765Z" }, "papermill": { "duration": 99.338061, "end_time": "2023-05-02T09:02:27.852939", "exception": false, "start_time": "2023-05-02T09:00:48.514878", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "comment_text | \n", "toxic | \n", "severe_toxic | \n", "obscene | \n", "threat | \n", "insult | \n", "identity_hate | \n", "input_word_ids | \n", "input_mask | \n", "all_segment_id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0000997932d777bf | \n", "Explanation\\nWhy the edits made under my usern... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "(101, 27746, 31609, 11809, 24781, 10105, 70971... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
1 | \n", "000103f0d9cfb60f | \n", "D'aww! He matches this background colour I'm s... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "(101, 141, 112, 56237, 10874, 106, 10357, 1825... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
2 | \n", "000113f07ec002fd | \n", "Hey man, I'm really not trying to edit war. It... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "(101, 35936, 10817, 117, 146, 112, 181, 30181,... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
3 | \n", "0001b41b1c6bb37e | \n", "\"\\nMore\\nI can't make any real suggestions on ... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "(101, 107, 15946, 146, 10944, 112, 188, 13086,... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
4 | \n", "0001d958c54c6e35 | \n", "You, sir, are my hero. Any chance you remember... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "(101, 11065, 117, 52523, 117, 10301, 15127, 51... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
\n", " | id | \n", "comment_text | \n", "toxic | \n", "severe_toxic | \n", "obscene | \n", "threat | \n", "insult | \n", "identity_hate | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "0000997932d777bf | \n", "Explanation\\nWhy the edits made under my usern... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
1 | \n", "000103f0d9cfb60f | \n", "D'aww! He matches this background colour I'm s... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "000113f07ec002fd | \n", "Hey man, I'm really not trying to edit war. It... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "0001b41b1c6bb37e | \n", "\"\\nMore\\nI can't make any real suggestions on ... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "0001d958c54c6e35 | \n", "You, sir, are my hero. Any chance you remember... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
\n", " | id | \n", "comment_text | \n", "toxic | \n", "severe_toxicity | \n", "obscene | \n", "identity_attack | \n", "insult | \n", "threat | \n", "asian | \n", "atheist | \n", "... | \n", "wow | \n", "sad | \n", "likes | \n", "disagree | \n", "sexual_explicit | \n", "identity_annotator_count | \n", "toxicity_annotator_count | \n", "input_word_ids | \n", "input_mask | \n", "all_segment_id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "59848 | \n", "This is so cool. It's like, 'would you want yo... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "(101, 10747, 10124, 10380, 67420, 119, 10377, ... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
1 | \n", "59849 | \n", "Thank you!! This would make my life a lot less... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "(101, 91327, 13028, 106, 106, 10747, 10894, 13... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
2 | \n", "59852 | \n", "This is such an urgent design problem; kudos t... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "(101, 10747, 10124, 11049, 10151, 10399, 22500... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
3 | \n", "59855 | \n", "Is this something I'll be able to install on m... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "(101, 12034, 10531, 26133, 146, 112, 22469, 10... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
4 | \n", "59856 | \n", "haha you guys are a bunch of losers. | \n", "0.893617 | \n", "0.021277 | \n", "0.0 | \n", "0.021277 | \n", "0.87234 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0.0 | \n", "4 | \n", "47 | \n", "(101, 10228, 10921, 13028, 75980, 12682, 10301... | \n", "(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... | \n", "(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | \n", "
5 rows × 48 columns
\n", "\n", " | id | \n", "comment_text | \n", "toxic | \n", "severe_toxicity | \n", "obscene | \n", "identity_attack | \n", "insult | \n", "threat | \n", "asian | \n", "atheist | \n", "... | \n", "article_id | \n", "rating | \n", "funny | \n", "wow | \n", "sad | \n", "likes | \n", "disagree | \n", "sexual_explicit | \n", "identity_annotator_count | \n", "toxicity_annotator_count | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "59848 | \n", "This is so cool. It's like, 'would you want yo... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "2006 | \n", "rejected | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "
1 | \n", "59849 | \n", "Thank you!! This would make my life a lot less... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "2006 | \n", "rejected | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "
2 | \n", "59852 | \n", "This is such an urgent design problem; kudos t... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "2006 | \n", "rejected | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "
3 | \n", "59855 | \n", "Is this something I'll be able to install on m... | \n", "0.000000 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.00000 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "... | \n", "2006 | \n", "rejected | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.0 | \n", "0 | \n", "4 | \n", "
4 | \n", "59856 | \n", "haha you guys are a bunch of losers. | \n", "0.893617 | \n", "0.021277 | \n", "0.0 | \n", "0.021277 | \n", "0.87234 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "2006 | \n", "rejected | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0.0 | \n", "4 | \n", "47 | \n", "
5 rows × 45 columns
\n", "