TLeonidas committed on
Commit
4e7c31d
1 Parent(s): 29b4e2d

Delete df1_dataset_training.ipynb

Browse files
Files changed (1) hide show
  1. df1_dataset_training.ipynb +0 -1777
df1_dataset_training.ipynb DELETED
@@ -1,1777 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {
6
- "id": "e3b3364f"
7
- },
8
- "source": [
9
- "# Fine-Tuning a RoBERTa Model with Twitter Data\n",
10
- "* List item\n",
11
- "* List item\n",
12
- "\n",
13
- "\n",
14
- "\n"
15
- ],
16
- "id": "e3b3364f"
17
- },
18
- {
19
- "cell_type": "code",
20
- "execution_count": 23,
21
- "metadata": {
22
- "id": "7cfcb724"
23
- },
24
- "outputs": [],
25
- "source": [
26
- "import pandas as pd\n",
27
- "import numpy as np\n",
28
- "from sklearn.feature_extraction.text import CountVectorizer\n",
29
- "from sklearn.model_selection import train_test_split\n",
30
- "from sklearn.tree import DecisionTreeClassifier"
31
- ],
32
- "id": "7cfcb724"
33
- },
34
- {
35
- "cell_type": "code",
36
- "execution_count": 24,
37
- "metadata": {
38
- "colab": {
39
- "base_uri": "https://localhost:8080/"
40
- },
41
- "id": "60ab1d26",
42
- "outputId": "ee4eb86f-3994-4391-e5c4-e2f363290977"
43
- },
44
- "outputs": [
45
- {
46
- "output_type": "stream",
47
- "name": "stderr",
48
- "text": [
49
- "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
50
- "[nltk_data] Package stopwords is already up-to-date!\n"
51
- ]
52
- }
53
- ],
54
- "source": [
55
- "import re\n",
56
- "import string\n",
57
- "import nltk\n",
58
- "nltk.download('stopwords')\n",
59
- "from nltk.stem.snowball import SnowballStemmer"
60
- ],
61
- "id": "60ab1d26"
62
- },
63
- {
64
- "cell_type": "markdown",
65
- "metadata": {
66
- "id": "538a8bf3"
67
- },
68
- "source": [
69
- "### Imports"
70
- ],
71
- "id": "538a8bf3"
72
- },
73
- {
74
- "cell_type": "code",
75
- "execution_count": 25,
76
- "metadata": {
77
- "id": "bae03f72",
78
- "scrolled": true
79
- },
80
- "outputs": [],
81
- "source": [
82
- "stemmer = nltk.SnowballStemmer(\"english\")\n",
83
- "from nltk.corpus import stopwords\n",
84
- "import string\n",
85
- "stopword = set(stopwords.words(\"english\"))"
86
- ],
87
- "id": "bae03f72"
88
- },
89
- {
90
- "cell_type": "code",
91
- "execution_count": 26,
92
- "metadata": {
93
- "colab": {
94
- "base_uri": "https://localhost:8080/",
95
- "height": 327
96
- },
97
- "id": "6de55c38",
98
- "outputId": "618d357e-2e8b-4300-cacb-c07b514913ce"
99
- },
100
- "outputs": [
101
- {
102
- "output_type": "execute_result",
103
- "data": {
104
- "text/plain": [
105
- " Unnamed: 0 count hate_speech offensive_language neither class \\\n",
106
- "0 0 3 0 0 3 2 \n",
107
- "1 1 3 0 3 0 1 \n",
108
- "2 2 3 0 3 0 1 \n",
109
- "3 3 3 0 2 1 1 \n",
110
- "4 4 6 0 6 0 1 \n",
111
- "\n",
112
- " tweet \n",
113
- "0 !!! RT @mayasolovely: As a woman you shouldn't... \n",
114
- "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... \n",
115
- "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... \n",
116
- "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... \n",
117
- "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... "
118
- ],
119
- "text/html": [
120
- "\n",
121
- " <div id=\"df-b9c9299d-eafc-4300-bf32-3ad628e8ff04\" class=\"colab-df-container\">\n",
122
- " <div>\n",
123
- "<style scoped>\n",
124
- " .dataframe tbody tr th:only-of-type {\n",
125
- " vertical-align: middle;\n",
126
- " }\n",
127
- "\n",
128
- " .dataframe tbody tr th {\n",
129
- " vertical-align: top;\n",
130
- " }\n",
131
- "\n",
132
- " .dataframe thead th {\n",
133
- " text-align: right;\n",
134
- " }\n",
135
- "</style>\n",
136
- "<table border=\"1\" class=\"dataframe\">\n",
137
- " <thead>\n",
138
- " <tr style=\"text-align: right;\">\n",
139
- " <th></th>\n",
140
- " <th>Unnamed: 0</th>\n",
141
- " <th>count</th>\n",
142
- " <th>hate_speech</th>\n",
143
- " <th>offensive_language</th>\n",
144
- " <th>neither</th>\n",
145
- " <th>class</th>\n",
146
- " <th>tweet</th>\n",
147
- " </tr>\n",
148
- " </thead>\n",
149
- " <tbody>\n",
150
- " <tr>\n",
151
- " <th>0</th>\n",
152
- " <td>0</td>\n",
153
- " <td>3</td>\n",
154
- " <td>0</td>\n",
155
- " <td>0</td>\n",
156
- " <td>3</td>\n",
157
- " <td>2</td>\n",
158
- " <td>!!! RT @mayasolovely: As a woman you shouldn't...</td>\n",
159
- " </tr>\n",
160
- " <tr>\n",
161
- " <th>1</th>\n",
162
- " <td>1</td>\n",
163
- " <td>3</td>\n",
164
- " <td>0</td>\n",
165
- " <td>3</td>\n",
166
- " <td>0</td>\n",
167
- " <td>1</td>\n",
168
- " <td>!!!!! RT @mleew17: boy dats cold...tyga dwn ba...</td>\n",
169
- " </tr>\n",
170
- " <tr>\n",
171
- " <th>2</th>\n",
172
- " <td>2</td>\n",
173
- " <td>3</td>\n",
174
- " <td>0</td>\n",
175
- " <td>3</td>\n",
176
- " <td>0</td>\n",
177
- " <td>1</td>\n",
178
- " <td>!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...</td>\n",
179
- " </tr>\n",
180
- " <tr>\n",
181
- " <th>3</th>\n",
182
- " <td>3</td>\n",
183
- " <td>3</td>\n",
184
- " <td>0</td>\n",
185
- " <td>2</td>\n",
186
- " <td>1</td>\n",
187
- " <td>1</td>\n",
188
- " <td>!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...</td>\n",
189
- " </tr>\n",
190
- " <tr>\n",
191
- " <th>4</th>\n",
192
- " <td>4</td>\n",
193
- " <td>6</td>\n",
194
- " <td>0</td>\n",
195
- " <td>6</td>\n",
196
- " <td>0</td>\n",
197
- " <td>1</td>\n",
198
- " <td>!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...</td>\n",
199
- " </tr>\n",
200
- " </tbody>\n",
201
- "</table>\n",
202
- "</div>\n",
203
- " <div class=\"colab-df-buttons\">\n",
204
- "\n",
205
- " <div class=\"colab-df-container\">\n",
206
- " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b9c9299d-eafc-4300-bf32-3ad628e8ff04')\"\n",
207
- " title=\"Convert this dataframe to an interactive table.\"\n",
208
- " style=\"display:none;\">\n",
209
- "\n",
210
- " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
211
- " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
212
- " </svg>\n",
213
- " </button>\n",
214
- "\n",
215
- " <style>\n",
216
- " .colab-df-container {\n",
217
- " display:flex;\n",
218
- " gap: 12px;\n",
219
- " }\n",
220
- "\n",
221
- " .colab-df-convert {\n",
222
- " background-color: #E8F0FE;\n",
223
- " border: none;\n",
224
- " border-radius: 50%;\n",
225
- " cursor: pointer;\n",
226
- " display: none;\n",
227
- " fill: #1967D2;\n",
228
- " height: 32px;\n",
229
- " padding: 0 0 0 0;\n",
230
- " width: 32px;\n",
231
- " }\n",
232
- "\n",
233
- " .colab-df-convert:hover {\n",
234
- " background-color: #E2EBFA;\n",
235
- " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
236
- " fill: #174EA6;\n",
237
- " }\n",
238
- "\n",
239
- " .colab-df-buttons div {\n",
240
- " margin-bottom: 4px;\n",
241
- " }\n",
242
- "\n",
243
- " [theme=dark] .colab-df-convert {\n",
244
- " background-color: #3B4455;\n",
245
- " fill: #D2E3FC;\n",
246
- " }\n",
247
- "\n",
248
- " [theme=dark] .colab-df-convert:hover {\n",
249
- " background-color: #434B5C;\n",
250
- " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
251
- " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
252
- " fill: #FFFFFF;\n",
253
- " }\n",
254
- " </style>\n",
255
- "\n",
256
- " <script>\n",
257
- " const buttonEl =\n",
258
- " document.querySelector('#df-b9c9299d-eafc-4300-bf32-3ad628e8ff04 button.colab-df-convert');\n",
259
- " buttonEl.style.display =\n",
260
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
261
- "\n",
262
- " async function convertToInteractive(key) {\n",
263
- " const element = document.querySelector('#df-b9c9299d-eafc-4300-bf32-3ad628e8ff04');\n",
264
- " const dataTable =\n",
265
- " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
266
- " [key], {});\n",
267
- " if (!dataTable) return;\n",
268
- "\n",
269
- " const docLinkHtml = 'Like what you see? Visit the ' +\n",
270
- " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
271
- " + ' to learn more about interactive tables.';\n",
272
- " element.innerHTML = '';\n",
273
- " dataTable['output_type'] = 'display_data';\n",
274
- " await google.colab.output.renderOutput(dataTable, element);\n",
275
- " const docLink = document.createElement('div');\n",
276
- " docLink.innerHTML = docLinkHtml;\n",
277
- " element.appendChild(docLink);\n",
278
- " }\n",
279
- " </script>\n",
280
- " </div>\n",
281
- "\n",
282
- "\n",
283
- "<div id=\"df-fe506237-418b-4ce7-a217-67e0aad7cf3f\">\n",
284
- " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-fe506237-418b-4ce7-a217-67e0aad7cf3f')\"\n",
285
- " title=\"Suggest charts\"\n",
286
- " style=\"display:none;\">\n",
287
- "\n",
288
- "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
289
- " width=\"24px\">\n",
290
- " <g>\n",
291
- " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
292
- " </g>\n",
293
- "</svg>\n",
294
- " </button>\n",
295
- "\n",
296
- "<style>\n",
297
- " .colab-df-quickchart {\n",
298
- " --bg-color: #E8F0FE;\n",
299
- " --fill-color: #1967D2;\n",
300
- " --hover-bg-color: #E2EBFA;\n",
301
- " --hover-fill-color: #174EA6;\n",
302
- " --disabled-fill-color: #AAA;\n",
303
- " --disabled-bg-color: #DDD;\n",
304
- " }\n",
305
- "\n",
306
- " [theme=dark] .colab-df-quickchart {\n",
307
- " --bg-color: #3B4455;\n",
308
- " --fill-color: #D2E3FC;\n",
309
- " --hover-bg-color: #434B5C;\n",
310
- " --hover-fill-color: #FFFFFF;\n",
311
- " --disabled-bg-color: #3B4455;\n",
312
- " --disabled-fill-color: #666;\n",
313
- " }\n",
314
- "\n",
315
- " .colab-df-quickchart {\n",
316
- " background-color: var(--bg-color);\n",
317
- " border: none;\n",
318
- " border-radius: 50%;\n",
319
- " cursor: pointer;\n",
320
- " display: none;\n",
321
- " fill: var(--fill-color);\n",
322
- " height: 32px;\n",
323
- " padding: 0;\n",
324
- " width: 32px;\n",
325
- " }\n",
326
- "\n",
327
- " .colab-df-quickchart:hover {\n",
328
- " background-color: var(--hover-bg-color);\n",
329
- " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
330
- " fill: var(--button-hover-fill-color);\n",
331
- " }\n",
332
- "\n",
333
- " .colab-df-quickchart-complete:disabled,\n",
334
- " .colab-df-quickchart-complete:disabled:hover {\n",
335
- " background-color: var(--disabled-bg-color);\n",
336
- " fill: var(--disabled-fill-color);\n",
337
- " box-shadow: none;\n",
338
- " }\n",
339
- "\n",
340
- " .colab-df-spinner {\n",
341
- " border: 2px solid var(--fill-color);\n",
342
- " border-color: transparent;\n",
343
- " border-bottom-color: var(--fill-color);\n",
344
- " animation:\n",
345
- " spin 1s steps(1) infinite;\n",
346
- " }\n",
347
- "\n",
348
- " @keyframes spin {\n",
349
- " 0% {\n",
350
- " border-color: transparent;\n",
351
- " border-bottom-color: var(--fill-color);\n",
352
- " border-left-color: var(--fill-color);\n",
353
- " }\n",
354
- " 20% {\n",
355
- " border-color: transparent;\n",
356
- " border-left-color: var(--fill-color);\n",
357
- " border-top-color: var(--fill-color);\n",
358
- " }\n",
359
- " 30% {\n",
360
- " border-color: transparent;\n",
361
- " border-left-color: var(--fill-color);\n",
362
- " border-top-color: var(--fill-color);\n",
363
- " border-right-color: var(--fill-color);\n",
364
- " }\n",
365
- " 40% {\n",
366
- " border-color: transparent;\n",
367
- " border-right-color: var(--fill-color);\n",
368
- " border-top-color: var(--fill-color);\n",
369
- " }\n",
370
- " 60% {\n",
371
- " border-color: transparent;\n",
372
- " border-right-color: var(--fill-color);\n",
373
- " }\n",
374
- " 80% {\n",
375
- " border-color: transparent;\n",
376
- " border-right-color: var(--fill-color);\n",
377
- " border-bottom-color: var(--fill-color);\n",
378
- " }\n",
379
- " 90% {\n",
380
- " border-color: transparent;\n",
381
- " border-bottom-color: var(--fill-color);\n",
382
- " }\n",
383
- " }\n",
384
- "</style>\n",
385
- "\n",
386
- " <script>\n",
387
- " async function quickchart(key) {\n",
388
- " const quickchartButtonEl =\n",
389
- " document.querySelector('#' + key + ' button');\n",
390
- " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
391
- " quickchartButtonEl.classList.add('colab-df-spinner');\n",
392
- " try {\n",
393
- " const charts = await google.colab.kernel.invokeFunction(\n",
394
- " 'suggestCharts', [key], {});\n",
395
- " } catch (error) {\n",
396
- " console.error('Error during call to suggestCharts:', error);\n",
397
- " }\n",
398
- " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
399
- " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
400
- " }\n",
401
- " (() => {\n",
402
- " let quickchartButtonEl =\n",
403
- " document.querySelector('#df-fe506237-418b-4ce7-a217-67e0aad7cf3f button');\n",
404
- " quickchartButtonEl.style.display =\n",
405
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
406
- " })();\n",
407
- " </script>\n",
408
- "</div>\n",
409
- "\n",
410
- " </div>\n",
411
- " </div>\n"
412
- ],
413
- "application/vnd.google.colaboratory.intrinsic+json": {
414
- "type": "dataframe",
415
- "variable_name": "df1",
416
- "summary": "{\n \"name\": \"df1\",\n \"rows\": 24783,\n \"fields\": [\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7299,\n \"min\": 0,\n \"max\": 25296,\n \"num_unique_values\": 24783,\n \"samples\": [\n 2326,\n 16283,\n 19362\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 9,\n \"num_unique_values\": 5,\n \"samples\": [\n 6,\n 7,\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hate_speech\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 7,\n \"num_unique_values\": 8,\n \"samples\": [\n 1,\n 6,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"offensive_language\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 9,\n \"num_unique_values\": 10,\n \"samples\": [\n 8,\n 3,\n 7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"neither\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 9,\n \"num_unique_values\": 10,\n \"samples\": [\n 8,\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"class\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 2,\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tweet\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24783,\n \"samples\": [\n \"934 8616\\ni got a missed call from yo bitch\",\n \"RT @KINGTUNCHI_: Fucking with a bad bitch you gone need some money lil homie!\",\n \"RT @eanahS__: @1inkkofrosess lol my credit ain't no where near good , but I know the right man for the job .. 
that ho nice though!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
417
- }
418
- },
419
- "metadata": {},
420
- "execution_count": 26
421
- }
422
- ],
423
- "source": [
424
- "df1 = pd.read_csv(\"twitter_data.csv\")\n",
425
- "df1 = df1.dropna()\n",
426
- "df1.head()"
427
- ],
428
- "id": "6de55c38"
429
- },
430
- {
431
- "cell_type": "markdown",
432
- "metadata": {
433
- "id": "4c288f90"
434
- },
435
- "source": [
436
- "#### `.tolist()` converts the DataFrame's column `Index` into a plain Python list."
437
- ],
438
- "id": "4c288f90"
439
- },
440
- {
441
- "cell_type": "code",
442
- "execution_count": 27,
443
- "metadata": {
444
- "colab": {
445
- "base_uri": "https://localhost:8080/"
446
- },
447
- "id": "dd22b72e",
448
- "outputId": "90c4cb5a-7de8-4a19-8e7e-b227eac1b701"
449
- },
450
- "outputs": [
451
- {
452
- "output_type": "stream",
453
- "name": "stdout",
454
- "text": [
455
- "['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither', 'class', 'tweet']\n"
456
- ]
457
- }
458
- ],
459
- "source": [
460
- "print(df1.columns.tolist())\n"
461
- ],
462
- "id": "dd22b72e"
463
- },
464
- {
465
- "cell_type": "markdown",
466
- "metadata": {
467
- "id": "f584d46b"
468
- },
469
- "source": [
470
- "- The pandas `Series.map()` method replaces each value in a column using a dictionary (or a function) and returns the resulting Series.\n",
471
- "- We used `.map()` to translate the class codes 0, 1, and 2 into \"Hate Speech Detected\", \"Offensive language detected\", and \"No hate and offensive speech\", respectively."
472
- ],
473
- "id": "f584d46b"
474
- },
475
- {
476
- "cell_type": "markdown",
477
- "source": [
478
- "### Preprocess the Labels"
479
- ],
480
- "metadata": {
481
- "id": "MSIgr88pMz8x"
482
- },
483
- "id": "MSIgr88pMz8x"
484
- },
485
- {
486
- "cell_type": "code",
487
- "execution_count": 28,
488
- "metadata": {
489
- "id": "117eadd5"
490
- },
491
- "outputs": [],
492
- "source": [
493
- "df1['labels'] = df1['class'].map({0:\"Hate Speech Detected\", 1:\"Offensive language detected\", 2:\"No hate and offensive speech\"})\n",
494
- "\n",
495
- "# Merging the labels\n",
496
- "def unify_labels(row):\n",
497
- " if row['labels'] in ['Hate Speech Detected', 'Offensive language detected']:\n",
498
- " return 'Offensive or Hate Speech'\n",
499
- " else:\n",
500
- " return 'Not Hate'\n",
501
- "\n",
502
- "# Apply this function to the dataset with three labels\n",
503
- "df1['labels'] = df1.apply(unify_labels, axis=1)"
504
- ],
505
- "id": "117eadd5"
506
- },
507
- {
508
- "cell_type": "code",
509
- "execution_count": 29,
510
- "metadata": {
511
- "id": "8fdf617f",
512
- "colab": {
513
- "base_uri": "https://localhost:8080/",
514
- "height": 486
515
- },
516
- "outputId": "e79d6a0a-e650-46a2-ad66-ee93d9d66f9a"
517
- },
518
- "outputs": [
519
- {
520
- "output_type": "execute_result",
521
- "data": {
522
- "text/plain": [
523
- " Unnamed: 0 count hate_speech offensive_language neither class \\\n",
524
- "0 0 3 0 0 3 2 \n",
525
- "1 1 3 0 3 0 1 \n",
526
- "2 2 3 0 3 0 1 \n",
527
- "3 3 3 0 2 1 1 \n",
528
- "4 4 6 0 6 0 1 \n",
529
- "\n",
530
- " tweet labels \n",
531
- "0 !!! RT @mayasolovely: As a woman you shouldn't... Not Hate \n",
532
- "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... Offensive or Hate Speech \n",
533
- "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... Offensive or Hate Speech \n",
534
- "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... Offensive or Hate Speech \n",
535
- "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... Offensive or Hate Speech "
536
- ],
537
- "text/html": [
538
- "\n",
539
- " <div id=\"df-5abb4a7a-3803-402a-84db-c273d86a8803\" class=\"colab-df-container\">\n",
540
- " <div>\n",
541
- "<style scoped>\n",
542
- " .dataframe tbody tr th:only-of-type {\n",
543
- " vertical-align: middle;\n",
544
- " }\n",
545
- "\n",
546
- " .dataframe tbody tr th {\n",
547
- " vertical-align: top;\n",
548
- " }\n",
549
- "\n",
550
- " .dataframe thead th {\n",
551
- " text-align: right;\n",
552
- " }\n",
553
- "</style>\n",
554
- "<table border=\"1\" class=\"dataframe\">\n",
555
- " <thead>\n",
556
- " <tr style=\"text-align: right;\">\n",
557
- " <th></th>\n",
558
- " <th>Unnamed: 0</th>\n",
559
- " <th>count</th>\n",
560
- " <th>hate_speech</th>\n",
561
- " <th>offensive_language</th>\n",
562
- " <th>neither</th>\n",
563
- " <th>class</th>\n",
564
- " <th>tweet</th>\n",
565
- " <th>labels</th>\n",
566
- " </tr>\n",
567
- " </thead>\n",
568
- " <tbody>\n",
569
- " <tr>\n",
570
- " <th>0</th>\n",
571
- " <td>0</td>\n",
572
- " <td>3</td>\n",
573
- " <td>0</td>\n",
574
- " <td>0</td>\n",
575
- " <td>3</td>\n",
576
- " <td>2</td>\n",
577
- " <td>!!! RT @mayasolovely: As a woman you shouldn't...</td>\n",
578
- " <td>Not Hate</td>\n",
579
- " </tr>\n",
580
- " <tr>\n",
581
- " <th>1</th>\n",
582
- " <td>1</td>\n",
583
- " <td>3</td>\n",
584
- " <td>0</td>\n",
585
- " <td>3</td>\n",
586
- " <td>0</td>\n",
587
- " <td>1</td>\n",
588
- " <td>!!!!! RT @mleew17: boy dats cold...tyga dwn ba...</td>\n",
589
- " <td>Offensive or Hate Speech</td>\n",
590
- " </tr>\n",
591
- " <tr>\n",
592
- " <th>2</th>\n",
593
- " <td>2</td>\n",
594
- " <td>3</td>\n",
595
- " <td>0</td>\n",
596
- " <td>3</td>\n",
597
- " <td>0</td>\n",
598
- " <td>1</td>\n",
599
- " <td>!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...</td>\n",
600
- " <td>Offensive or Hate Speech</td>\n",
601
- " </tr>\n",
602
- " <tr>\n",
603
- " <th>3</th>\n",
604
- " <td>3</td>\n",
605
- " <td>3</td>\n",
606
- " <td>0</td>\n",
607
- " <td>2</td>\n",
608
- " <td>1</td>\n",
609
- " <td>1</td>\n",
610
- " <td>!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...</td>\n",
611
- " <td>Offensive or Hate Speech</td>\n",
612
- " </tr>\n",
613
- " <tr>\n",
614
- " <th>4</th>\n",
615
- " <td>4</td>\n",
616
- " <td>6</td>\n",
617
- " <td>0</td>\n",
618
- " <td>6</td>\n",
619
- " <td>0</td>\n",
620
- " <td>1</td>\n",
621
- " <td>!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...</td>\n",
622
- " <td>Offensive or Hate Speech</td>\n",
623
- " </tr>\n",
624
- " </tbody>\n",
625
- "</table>\n",
626
- "</div>\n",
627
- " <div class=\"colab-df-buttons\">\n",
628
- "\n",
629
- " <div class=\"colab-df-container\">\n",
630
- " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5abb4a7a-3803-402a-84db-c273d86a8803')\"\n",
631
- " title=\"Convert this dataframe to an interactive table.\"\n",
632
- " style=\"display:none;\">\n",
633
- "\n",
634
- " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
635
- " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
636
- " </svg>\n",
637
- " </button>\n",
638
- "\n",
639
- " <style>\n",
640
- " .colab-df-container {\n",
641
- " display:flex;\n",
642
- " gap: 12px;\n",
643
- " }\n",
644
- "\n",
645
- " .colab-df-convert {\n",
646
- " background-color: #E8F0FE;\n",
647
- " border: none;\n",
648
- " border-radius: 50%;\n",
649
- " cursor: pointer;\n",
650
- " display: none;\n",
651
- " fill: #1967D2;\n",
652
- " height: 32px;\n",
653
- " padding: 0 0 0 0;\n",
654
- " width: 32px;\n",
655
- " }\n",
656
- "\n",
657
- " .colab-df-convert:hover {\n",
658
- " background-color: #E2EBFA;\n",
659
- " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
660
- " fill: #174EA6;\n",
661
- " }\n",
662
- "\n",
663
- " .colab-df-buttons div {\n",
664
- " margin-bottom: 4px;\n",
665
- " }\n",
666
- "\n",
667
- " [theme=dark] .colab-df-convert {\n",
668
- " background-color: #3B4455;\n",
669
- " fill: #D2E3FC;\n",
670
- " }\n",
671
- "\n",
672
- " [theme=dark] .colab-df-convert:hover {\n",
673
- " background-color: #434B5C;\n",
674
- " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
675
- " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
676
- " fill: #FFFFFF;\n",
677
- " }\n",
678
- " </style>\n",
679
- "\n",
680
- " <script>\n",
681
- " const buttonEl =\n",
682
- " document.querySelector('#df-5abb4a7a-3803-402a-84db-c273d86a8803 button.colab-df-convert');\n",
683
- " buttonEl.style.display =\n",
684
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
685
- "\n",
686
- " async function convertToInteractive(key) {\n",
687
- " const element = document.querySelector('#df-5abb4a7a-3803-402a-84db-c273d86a8803');\n",
688
- " const dataTable =\n",
689
- " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
690
- " [key], {});\n",
691
- " if (!dataTable) return;\n",
692
- "\n",
693
- " const docLinkHtml = 'Like what you see? Visit the ' +\n",
694
- " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
695
- " + ' to learn more about interactive tables.';\n",
696
- " element.innerHTML = '';\n",
697
- " dataTable['output_type'] = 'display_data';\n",
698
- " await google.colab.output.renderOutput(dataTable, element);\n",
699
- " const docLink = document.createElement('div');\n",
700
- " docLink.innerHTML = docLinkHtml;\n",
701
- " element.appendChild(docLink);\n",
702
- " }\n",
703
- " </script>\n",
704
- " </div>\n",
705
- "\n",
706
- "\n",
707
- "<div id=\"df-545be6b0-6c38-475a-88be-3aca9996155e\">\n",
708
- " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-545be6b0-6c38-475a-88be-3aca9996155e')\"\n",
709
- " title=\"Suggest charts\"\n",
710
- " style=\"display:none;\">\n",
711
- "\n",
712
- "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
713
- " width=\"24px\">\n",
714
- " <g>\n",
715
- " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
716
- " </g>\n",
717
- "</svg>\n",
718
- " </button>\n",
719
- "\n",
720
- "<style>\n",
721
- " .colab-df-quickchart {\n",
722
- " --bg-color: #E8F0FE;\n",
723
- " --fill-color: #1967D2;\n",
724
- " --hover-bg-color: #E2EBFA;\n",
725
- " --hover-fill-color: #174EA6;\n",
726
- " --disabled-fill-color: #AAA;\n",
727
- " --disabled-bg-color: #DDD;\n",
728
- " }\n",
729
- "\n",
730
- " [theme=dark] .colab-df-quickchart {\n",
731
- " --bg-color: #3B4455;\n",
732
- " --fill-color: #D2E3FC;\n",
733
- " --hover-bg-color: #434B5C;\n",
734
- " --hover-fill-color: #FFFFFF;\n",
735
- " --disabled-bg-color: #3B4455;\n",
736
- " --disabled-fill-color: #666;\n",
737
- " }\n",
738
- "\n",
739
- " .colab-df-quickchart {\n",
740
- " background-color: var(--bg-color);\n",
741
- " border: none;\n",
742
- " border-radius: 50%;\n",
743
- " cursor: pointer;\n",
744
- " display: none;\n",
745
- " fill: var(--fill-color);\n",
746
- " height: 32px;\n",
747
- " padding: 0;\n",
748
- " width: 32px;\n",
749
- " }\n",
750
- "\n",
751
- " .colab-df-quickchart:hover {\n",
752
- " background-color: var(--hover-bg-color);\n",
753
- " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
754
- " fill: var(--button-hover-fill-color);\n",
755
- " }\n",
756
- "\n",
757
- " .colab-df-quickchart-complete:disabled,\n",
758
- " .colab-df-quickchart-complete:disabled:hover {\n",
759
- " background-color: var(--disabled-bg-color);\n",
760
- " fill: var(--disabled-fill-color);\n",
761
- " box-shadow: none;\n",
762
- " }\n",
763
- "\n",
764
- " .colab-df-spinner {\n",
765
- " border: 2px solid var(--fill-color);\n",
766
- " border-color: transparent;\n",
767
- " border-bottom-color: var(--fill-color);\n",
768
- " animation:\n",
769
- " spin 1s steps(1) infinite;\n",
770
- " }\n",
771
- "\n",
772
- " @keyframes spin {\n",
773
- " 0% {\n",
774
- " border-color: transparent;\n",
775
- " border-bottom-color: var(--fill-color);\n",
776
- " border-left-color: var(--fill-color);\n",
777
- " }\n",
778
- " 20% {\n",
779
- " border-color: transparent;\n",
780
- " border-left-color: var(--fill-color);\n",
781
- " border-top-color: var(--fill-color);\n",
782
- " }\n",
783
- " 30% {\n",
784
- " border-color: transparent;\n",
785
- " border-left-color: var(--fill-color);\n",
786
- " border-top-color: var(--fill-color);\n",
787
- " border-right-color: var(--fill-color);\n",
788
- " }\n",
789
- " 40% {\n",
790
- " border-color: transparent;\n",
791
- " border-right-color: var(--fill-color);\n",
792
- " border-top-color: var(--fill-color);\n",
793
- " }\n",
794
- " 60% {\n",
795
- " border-color: transparent;\n",
796
- " border-right-color: var(--fill-color);\n",
797
- " }\n",
798
- " 80% {\n",
799
- " border-color: transparent;\n",
800
- " border-right-color: var(--fill-color);\n",
801
- " border-bottom-color: var(--fill-color);\n",
802
- " }\n",
803
- " 90% {\n",
804
- " border-color: transparent;\n",
805
- " border-bottom-color: var(--fill-color);\n",
806
- " }\n",
807
- " }\n",
808
- "</style>\n",
809
- "\n",
810
- " <script>\n",
811
- " async function quickchart(key) {\n",
812
- " const quickchartButtonEl =\n",
813
- " document.querySelector('#' + key + ' button');\n",
814
- " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
815
- " quickchartButtonEl.classList.add('colab-df-spinner');\n",
816
- " try {\n",
817
- " const charts = await google.colab.kernel.invokeFunction(\n",
818
- " 'suggestCharts', [key], {});\n",
819
- " } catch (error) {\n",
820
- " console.error('Error during call to suggestCharts:', error);\n",
821
- " }\n",
822
- " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
823
- " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
824
- " }\n",
825
- " (() => {\n",
826
- " let quickchartButtonEl =\n",
827
- " document.querySelector('#df-545be6b0-6c38-475a-88be-3aca9996155e button');\n",
828
- " quickchartButtonEl.style.display =\n",
829
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
830
- " })();\n",
831
- " </script>\n",
832
- "</div>\n",
833
- "\n",
834
- " </div>\n",
835
- " </div>\n"
836
- ],
837
- "application/vnd.google.colaboratory.intrinsic+json": {
838
- "type": "dataframe",
839
- "variable_name": "df1",
840
- "summary": "{\n \"name\": \"df1\",\n \"rows\": 24783,\n \"fields\": [\n {\n \"column\": \"Unnamed: 0\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 7299,\n \"min\": 0,\n \"max\": 25296,\n \"num_unique_values\": 24783,\n \"samples\": [\n 2326,\n 16283,\n 19362\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 9,\n \"num_unique_values\": 5,\n \"samples\": [\n 6,\n 7,\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hate_speech\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 7,\n \"num_unique_values\": 8,\n \"samples\": [\n 1,\n 6,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"offensive_language\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 9,\n \"num_unique_values\": 10,\n \"samples\": [\n 8,\n 3,\n 7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"neither\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 9,\n \"num_unique_values\": 10,\n \"samples\": [\n 8,\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"class\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 2,\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tweet\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24783,\n \"samples\": [\n \"934 8616\\ni got a missed call from yo bitch\",\n \"RT @KINGTUNCHI_: Fucking with a bad bitch you gone need some money lil homie!\",\n \"RT @eanahS__: @1inkkofrosess lol my credit ain't no where near good , but I know the right man for the job .. 
that ho nice though!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Offensive or Hate Speech\",\n \"Not Hate\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
841
- }
842
- },
843
- "metadata": {},
844
- "execution_count": 29
845
- }
846
- ],
847
- "source": [
848
- "df1['labels'].info\n",
849
- "df1.head()"
850
- ],
851
- "id": "8fdf617f"
852
- },
853
- {
854
- "cell_type": "markdown",
855
- "source": [
856
- "### Import the second dataset"
857
- ],
858
- "metadata": {
859
- "id": "9DgbrPGdSk5O"
860
- },
861
- "id": "9DgbrPGdSk5O"
862
- },
863
- {
864
- "cell_type": "markdown",
865
- "metadata": {
866
- "id": "a420ba1c"
867
- },
868
- "source": [
869
- "### Reduce to two columns: tweets and labels"
870
- ],
871
- "id": "a420ba1c"
872
- },
873
- {
874
- "cell_type": "code",
875
- "execution_count": 30,
876
- "metadata": {
877
- "id": "5db5746b",
878
- "colab": {
879
- "base_uri": "https://localhost:8080/",
880
- "height": 206
881
- },
882
- "outputId": "1f2e2ba7-0288-4920-f852-3a4e4be5b3e4"
883
- },
884
- "outputs": [
885
- {
886
- "output_type": "execute_result",
887
- "data": {
888
- "text/plain": [
889
- " tweet labels\n",
890
- "0 !!! RT @mayasolovely: As a woman you shouldn't... Not Hate\n",
891
- "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... Offensive or Hate Speech\n",
892
- "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... Offensive or Hate Speech\n",
893
- "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... Offensive or Hate Speech\n",
894
- "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... Offensive or Hate Speech"
895
- ],
896
- "text/html": [
897
- "\n",
898
- " <div id=\"df-b2146b45-aaf4-41af-b85e-6e6f8253d784\" class=\"colab-df-container\">\n",
899
- " <div>\n",
900
- "<style scoped>\n",
901
- " .dataframe tbody tr th:only-of-type {\n",
902
- " vertical-align: middle;\n",
903
- " }\n",
904
- "\n",
905
- " .dataframe tbody tr th {\n",
906
- " vertical-align: top;\n",
907
- " }\n",
908
- "\n",
909
- " .dataframe thead th {\n",
910
- " text-align: right;\n",
911
- " }\n",
912
- "</style>\n",
913
- "<table border=\"1\" class=\"dataframe\">\n",
914
- " <thead>\n",
915
- " <tr style=\"text-align: right;\">\n",
916
- " <th></th>\n",
917
- " <th>tweet</th>\n",
918
- " <th>labels</th>\n",
919
- " </tr>\n",
920
- " </thead>\n",
921
- " <tbody>\n",
922
- " <tr>\n",
923
- " <th>0</th>\n",
924
- " <td>!!! RT @mayasolovely: As a woman you shouldn't...</td>\n",
925
- " <td>Not Hate</td>\n",
926
- " </tr>\n",
927
- " <tr>\n",
928
- " <th>1</th>\n",
929
- " <td>!!!!! RT @mleew17: boy dats cold...tyga dwn ba...</td>\n",
930
- " <td>Offensive or Hate Speech</td>\n",
931
- " </tr>\n",
932
- " <tr>\n",
933
- " <th>2</th>\n",
934
- " <td>!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...</td>\n",
935
- " <td>Offensive or Hate Speech</td>\n",
936
- " </tr>\n",
937
- " <tr>\n",
938
- " <th>3</th>\n",
939
- " <td>!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...</td>\n",
940
- " <td>Offensive or Hate Speech</td>\n",
941
- " </tr>\n",
942
- " <tr>\n",
943
- " <th>4</th>\n",
944
- " <td>!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...</td>\n",
945
- " <td>Offensive or Hate Speech</td>\n",
946
- " </tr>\n",
947
- " </tbody>\n",
948
- "</table>\n",
949
- "</div>\n",
950
- " <div class=\"colab-df-buttons\">\n",
951
- "\n",
952
- " <div class=\"colab-df-container\">\n",
953
- " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b2146b45-aaf4-41af-b85e-6e6f8253d784')\"\n",
954
- " title=\"Convert this dataframe to an interactive table.\"\n",
955
- " style=\"display:none;\">\n",
956
- "\n",
957
- " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
958
- " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
959
- " </svg>\n",
960
- " </button>\n",
961
- "\n",
962
- " <style>\n",
963
- " .colab-df-container {\n",
964
- " display:flex;\n",
965
- " gap: 12px;\n",
966
- " }\n",
967
- "\n",
968
- " .colab-df-convert {\n",
969
- " background-color: #E8F0FE;\n",
970
- " border: none;\n",
971
- " border-radius: 50%;\n",
972
- " cursor: pointer;\n",
973
- " display: none;\n",
974
- " fill: #1967D2;\n",
975
- " height: 32px;\n",
976
- " padding: 0 0 0 0;\n",
977
- " width: 32px;\n",
978
- " }\n",
979
- "\n",
980
- " .colab-df-convert:hover {\n",
981
- " background-color: #E2EBFA;\n",
982
- " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
983
- " fill: #174EA6;\n",
984
- " }\n",
985
- "\n",
986
- " .colab-df-buttons div {\n",
987
- " margin-bottom: 4px;\n",
988
- " }\n",
989
- "\n",
990
- " [theme=dark] .colab-df-convert {\n",
991
- " background-color: #3B4455;\n",
992
- " fill: #D2E3FC;\n",
993
- " }\n",
994
- "\n",
995
- " [theme=dark] .colab-df-convert:hover {\n",
996
- " background-color: #434B5C;\n",
997
- " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
998
- " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
999
- " fill: #FFFFFF;\n",
1000
- " }\n",
1001
- " </style>\n",
1002
- "\n",
1003
- " <script>\n",
1004
- " const buttonEl =\n",
1005
- " document.querySelector('#df-b2146b45-aaf4-41af-b85e-6e6f8253d784 button.colab-df-convert');\n",
1006
- " buttonEl.style.display =\n",
1007
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1008
- "\n",
1009
- " async function convertToInteractive(key) {\n",
1010
- " const element = document.querySelector('#df-b2146b45-aaf4-41af-b85e-6e6f8253d784');\n",
1011
- " const dataTable =\n",
1012
- " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
1013
- " [key], {});\n",
1014
- " if (!dataTable) return;\n",
1015
- "\n",
1016
- " const docLinkHtml = 'Like what you see? Visit the ' +\n",
1017
- " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
1018
- " + ' to learn more about interactive tables.';\n",
1019
- " element.innerHTML = '';\n",
1020
- " dataTable['output_type'] = 'display_data';\n",
1021
- " await google.colab.output.renderOutput(dataTable, element);\n",
1022
- " const docLink = document.createElement('div');\n",
1023
- " docLink.innerHTML = docLinkHtml;\n",
1024
- " element.appendChild(docLink);\n",
1025
- " }\n",
1026
- " </script>\n",
1027
- " </div>\n",
1028
- "\n",
1029
- "\n",
1030
- "<div id=\"df-734b0701-3f7b-48a0-b2d5-1cbacd22ba01\">\n",
1031
- " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-734b0701-3f7b-48a0-b2d5-1cbacd22ba01')\"\n",
1032
- " title=\"Suggest charts\"\n",
1033
- " style=\"display:none;\">\n",
1034
- "\n",
1035
- "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
1036
- " width=\"24px\">\n",
1037
- " <g>\n",
1038
- " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
1039
- " </g>\n",
1040
- "</svg>\n",
1041
- " </button>\n",
1042
- "\n",
1043
- "<style>\n",
1044
- " .colab-df-quickchart {\n",
1045
- " --bg-color: #E8F0FE;\n",
1046
- " --fill-color: #1967D2;\n",
1047
- " --hover-bg-color: #E2EBFA;\n",
1048
- " --hover-fill-color: #174EA6;\n",
1049
- " --disabled-fill-color: #AAA;\n",
1050
- " --disabled-bg-color: #DDD;\n",
1051
- " }\n",
1052
- "\n",
1053
- " [theme=dark] .colab-df-quickchart {\n",
1054
- " --bg-color: #3B4455;\n",
1055
- " --fill-color: #D2E3FC;\n",
1056
- " --hover-bg-color: #434B5C;\n",
1057
- " --hover-fill-color: #FFFFFF;\n",
1058
- " --disabled-bg-color: #3B4455;\n",
1059
- " --disabled-fill-color: #666;\n",
1060
- " }\n",
1061
- "\n",
1062
- " .colab-df-quickchart {\n",
1063
- " background-color: var(--bg-color);\n",
1064
- " border: none;\n",
1065
- " border-radius: 50%;\n",
1066
- " cursor: pointer;\n",
1067
- " display: none;\n",
1068
- " fill: var(--fill-color);\n",
1069
- " height: 32px;\n",
1070
- " padding: 0;\n",
1071
- " width: 32px;\n",
1072
- " }\n",
1073
- "\n",
1074
- " .colab-df-quickchart:hover {\n",
1075
- " background-color: var(--hover-bg-color);\n",
1076
- " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1077
- " fill: var(--button-hover-fill-color);\n",
1078
- " }\n",
1079
- "\n",
1080
- " .colab-df-quickchart-complete:disabled,\n",
1081
- " .colab-df-quickchart-complete:disabled:hover {\n",
1082
- " background-color: var(--disabled-bg-color);\n",
1083
- " fill: var(--disabled-fill-color);\n",
1084
- " box-shadow: none;\n",
1085
- " }\n",
1086
- "\n",
1087
- " .colab-df-spinner {\n",
1088
- " border: 2px solid var(--fill-color);\n",
1089
- " border-color: transparent;\n",
1090
- " border-bottom-color: var(--fill-color);\n",
1091
- " animation:\n",
1092
- " spin 1s steps(1) infinite;\n",
1093
- " }\n",
1094
- "\n",
1095
- " @keyframes spin {\n",
1096
- " 0% {\n",
1097
- " border-color: transparent;\n",
1098
- " border-bottom-color: var(--fill-color);\n",
1099
- " border-left-color: var(--fill-color);\n",
1100
- " }\n",
1101
- " 20% {\n",
1102
- " border-color: transparent;\n",
1103
- " border-left-color: var(--fill-color);\n",
1104
- " border-top-color: var(--fill-color);\n",
1105
- " }\n",
1106
- " 30% {\n",
1107
- " border-color: transparent;\n",
1108
- " border-left-color: var(--fill-color);\n",
1109
- " border-top-color: var(--fill-color);\n",
1110
- " border-right-color: var(--fill-color);\n",
1111
- " }\n",
1112
- " 40% {\n",
1113
- " border-color: transparent;\n",
1114
- " border-right-color: var(--fill-color);\n",
1115
- " border-top-color: var(--fill-color);\n",
1116
- " }\n",
1117
- " 60% {\n",
1118
- " border-color: transparent;\n",
1119
- " border-right-color: var(--fill-color);\n",
1120
- " }\n",
1121
- " 80% {\n",
1122
- " border-color: transparent;\n",
1123
- " border-right-color: var(--fill-color);\n",
1124
- " border-bottom-color: var(--fill-color);\n",
1125
- " }\n",
1126
- " 90% {\n",
1127
- " border-color: transparent;\n",
1128
- " border-bottom-color: var(--fill-color);\n",
1129
- " }\n",
1130
- " }\n",
1131
- "</style>\n",
1132
- "\n",
1133
- " <script>\n",
1134
- " async function quickchart(key) {\n",
1135
- " const quickchartButtonEl =\n",
1136
- " document.querySelector('#' + key + ' button');\n",
1137
- " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
1138
- " quickchartButtonEl.classList.add('colab-df-spinner');\n",
1139
- " try {\n",
1140
- " const charts = await google.colab.kernel.invokeFunction(\n",
1141
- " 'suggestCharts', [key], {});\n",
1142
- " } catch (error) {\n",
1143
- " console.error('Error during call to suggestCharts:', error);\n",
1144
- " }\n",
1145
- " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
1146
- " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
1147
- " }\n",
1148
- " (() => {\n",
1149
- " let quickchartButtonEl =\n",
1150
- " document.querySelector('#df-734b0701-3f7b-48a0-b2d5-1cbacd22ba01 button');\n",
1151
- " quickchartButtonEl.style.display =\n",
1152
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1153
- " })();\n",
1154
- " </script>\n",
1155
- "</div>\n",
1156
- "\n",
1157
- " </div>\n",
1158
- " </div>\n"
1159
- ],
1160
- "application/vnd.google.colaboratory.intrinsic+json": {
1161
- "type": "dataframe",
1162
- "variable_name": "df1",
1163
- "summary": "{\n \"name\": \"df1\",\n \"rows\": 24783,\n \"fields\": [\n {\n \"column\": \"tweet\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24783,\n \"samples\": [\n \"934 8616\\ni got a missed call from yo bitch\",\n \"RT @KINGTUNCHI_: Fucking with a bad bitch you gone need some money lil homie!\",\n \"RT @eanahS__: @1inkkofrosess lol my credit ain't no where near good , but I know the right man for the job .. that ho nice though!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Offensive or Hate Speech\",\n \"Not Hate\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1164
- }
1165
- },
1166
- "metadata": {},
1167
- "execution_count": 30
1168
- }
1169
- ],
1170
- "source": [
1171
- "df1 = df1[['tweet', 'labels']]\n",
1172
- "df1 = df1[['tweet', 'labels']].fillna(0)\n",
1173
- "df1.head()"
1174
- ],
1175
- "id": "5db5746b"
1176
- },
1177
- {
1178
- "cell_type": "code",
1179
- "source": [],
1180
- "metadata": {
1181
- "id": "O7ibjvL6LdKO"
1182
- },
1183
- "id": "O7ibjvL6LdKO",
1184
- "execution_count": 30,
1185
- "outputs": []
1186
- },
1187
- {
1188
- "cell_type": "code",
1189
- "execution_count": 31,
1190
- "metadata": {
1191
- "id": "604ec08e"
1192
- },
1193
- "outputs": [],
1194
- "source": [
1195
- "def clean(text):\n",
1196
- " text = str(text).lower()\n",
1197
- " text = re.sub('\\[.*?\\]', '', text)\n",
1198
- " text = re.sub('https?://\\S+|www\\.\\S+', '', text)\n",
1199
- " text = re.sub('<.*?>+', '', text)\n",
1200
- " text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n",
1201
- " text = re.sub('\\n', '', text)\n",
1202
- " text = re.sub('\\w*\\d\\w*', \"\", text)\n",
1203
- " text = [word for word in text.split() if word not in stopword]\n",
1204
- " text = \" \".join(text)\n",
1205
- " return text\n",
1206
- "# Apply the cleaning function to the 'tweet' column of df1\n",
1207
- "df1['tweet'] = df1['tweet'].apply(clean)\n"
1208
- ],
1209
- "id": "604ec08e"
1210
- },
1211
- {
1212
- "cell_type": "code",
1213
- "source": [
1214
- "df1.head()"
1215
- ],
1216
- "metadata": {
1217
- "colab": {
1218
- "base_uri": "https://localhost:8080/",
1219
- "height": 206
1220
- },
1221
- "id": "XuArptokP5u4",
1222
- "outputId": "7f650ae3-4686-4585-8c70-e6bcdc0afc0e"
1223
- },
1224
- "id": "XuArptokP5u4",
1225
- "execution_count": 32,
1226
- "outputs": [
1227
- {
1228
- "output_type": "execute_result",
1229
- "data": {
1230
- "text/plain": [
1231
- " tweet labels\n",
1232
- "0 rt mayasolovely woman shouldnt complain cleani... Not Hate\n",
1233
- "1 rt boy dats coldtyga dwn bad cuffin dat hoe place Offensive or Hate Speech\n",
1234
- "2 rt urkindofbrand dawg rt ever fuck bitch start... Offensive or Hate Speech\n",
1235
- "3 rt cganderson vivabased look like tranny Offensive or Hate Speech\n",
1236
- "4 rt shenikaroberts shit hear might true might f... Offensive or Hate Speech"
1237
- ],
1238
- "text/html": [
1239
- "\n",
1240
- " <div id=\"df-c658d18f-79b4-4db6-8f49-2b9c3394f981\" class=\"colab-df-container\">\n",
1241
- " <div>\n",
1242
- "<style scoped>\n",
1243
- " .dataframe tbody tr th:only-of-type {\n",
1244
- " vertical-align: middle;\n",
1245
- " }\n",
1246
- "\n",
1247
- " .dataframe tbody tr th {\n",
1248
- " vertical-align: top;\n",
1249
- " }\n",
1250
- "\n",
1251
- " .dataframe thead th {\n",
1252
- " text-align: right;\n",
1253
- " }\n",
1254
- "</style>\n",
1255
- "<table border=\"1\" class=\"dataframe\">\n",
1256
- " <thead>\n",
1257
- " <tr style=\"text-align: right;\">\n",
1258
- " <th></th>\n",
1259
- " <th>tweet</th>\n",
1260
- " <th>labels</th>\n",
1261
- " </tr>\n",
1262
- " </thead>\n",
1263
- " <tbody>\n",
1264
- " <tr>\n",
1265
- " <th>0</th>\n",
1266
- " <td>rt mayasolovely woman shouldnt complain cleani...</td>\n",
1267
- " <td>Not Hate</td>\n",
1268
- " </tr>\n",
1269
- " <tr>\n",
1270
- " <th>1</th>\n",
1271
- " <td>rt boy dats coldtyga dwn bad cuffin dat hoe place</td>\n",
1272
- " <td>Offensive or Hate Speech</td>\n",
1273
- " </tr>\n",
1274
- " <tr>\n",
1275
- " <th>2</th>\n",
1276
- " <td>rt urkindofbrand dawg rt ever fuck bitch start...</td>\n",
1277
- " <td>Offensive or Hate Speech</td>\n",
1278
- " </tr>\n",
1279
- " <tr>\n",
1280
- " <th>3</th>\n",
1281
- " <td>rt cganderson vivabased look like tranny</td>\n",
1282
- " <td>Offensive or Hate Speech</td>\n",
1283
- " </tr>\n",
1284
- " <tr>\n",
1285
- " <th>4</th>\n",
1286
- " <td>rt shenikaroberts shit hear might true might f...</td>\n",
1287
- " <td>Offensive or Hate Speech</td>\n",
1288
- " </tr>\n",
1289
- " </tbody>\n",
1290
- "</table>\n",
1291
- "</div>\n",
1292
- " <div class=\"colab-df-buttons\">\n",
1293
- "\n",
1294
- " <div class=\"colab-df-container\">\n",
1295
- " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c658d18f-79b4-4db6-8f49-2b9c3394f981')\"\n",
1296
- " title=\"Convert this dataframe to an interactive table.\"\n",
1297
- " style=\"display:none;\">\n",
1298
- "\n",
1299
- " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
1300
- " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
1301
- " </svg>\n",
1302
- " </button>\n",
1303
- "\n",
1304
- " <style>\n",
1305
- " .colab-df-container {\n",
1306
- " display:flex;\n",
1307
- " gap: 12px;\n",
1308
- " }\n",
1309
- "\n",
1310
- " .colab-df-convert {\n",
1311
- " background-color: #E8F0FE;\n",
1312
- " border: none;\n",
1313
- " border-radius: 50%;\n",
1314
- " cursor: pointer;\n",
1315
- " display: none;\n",
1316
- " fill: #1967D2;\n",
1317
- " height: 32px;\n",
1318
- " padding: 0 0 0 0;\n",
1319
- " width: 32px;\n",
1320
- " }\n",
1321
- "\n",
1322
- " .colab-df-convert:hover {\n",
1323
- " background-color: #E2EBFA;\n",
1324
- " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1325
- " fill: #174EA6;\n",
1326
- " }\n",
1327
- "\n",
1328
- " .colab-df-buttons div {\n",
1329
- " margin-bottom: 4px;\n",
1330
- " }\n",
1331
- "\n",
1332
- " [theme=dark] .colab-df-convert {\n",
1333
- " background-color: #3B4455;\n",
1334
- " fill: #D2E3FC;\n",
1335
- " }\n",
1336
- "\n",
1337
- " [theme=dark] .colab-df-convert:hover {\n",
1338
- " background-color: #434B5C;\n",
1339
- " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
1340
- " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
1341
- " fill: #FFFFFF;\n",
1342
- " }\n",
1343
- " </style>\n",
1344
- "\n",
1345
- " <script>\n",
1346
- " const buttonEl =\n",
1347
- " document.querySelector('#df-c658d18f-79b4-4db6-8f49-2b9c3394f981 button.colab-df-convert');\n",
1348
- " buttonEl.style.display =\n",
1349
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1350
- "\n",
1351
- " async function convertToInteractive(key) {\n",
1352
- " const element = document.querySelector('#df-c658d18f-79b4-4db6-8f49-2b9c3394f981');\n",
1353
- " const dataTable =\n",
1354
- " await google.colab.kernel.invokeFunction('convertToInteractive',\n",
1355
- " [key], {});\n",
1356
- " if (!dataTable) return;\n",
1357
- "\n",
1358
- " const docLinkHtml = 'Like what you see? Visit the ' +\n",
1359
- " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
1360
- " + ' to learn more about interactive tables.';\n",
1361
- " element.innerHTML = '';\n",
1362
- " dataTable['output_type'] = 'display_data';\n",
1363
- " await google.colab.output.renderOutput(dataTable, element);\n",
1364
- " const docLink = document.createElement('div');\n",
1365
- " docLink.innerHTML = docLinkHtml;\n",
1366
- " element.appendChild(docLink);\n",
1367
- " }\n",
1368
- " </script>\n",
1369
- " </div>\n",
1370
- "\n",
1371
- "\n",
1372
- "<div id=\"df-b03d12b4-7e50-4167-95b0-3df9e3d0a9e8\">\n",
1373
- " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-b03d12b4-7e50-4167-95b0-3df9e3d0a9e8')\"\n",
1374
- " title=\"Suggest charts\"\n",
1375
- " style=\"display:none;\">\n",
1376
- "\n",
1377
- "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
1378
- " width=\"24px\">\n",
1379
- " <g>\n",
1380
- " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
1381
- " </g>\n",
1382
- "</svg>\n",
1383
- " </button>\n",
1384
- "\n",
1385
- "<style>\n",
1386
- " .colab-df-quickchart {\n",
1387
- " --bg-color: #E8F0FE;\n",
1388
- " --fill-color: #1967D2;\n",
1389
- " --hover-bg-color: #E2EBFA;\n",
1390
- " --hover-fill-color: #174EA6;\n",
1391
- " --disabled-fill-color: #AAA;\n",
1392
- " --disabled-bg-color: #DDD;\n",
1393
- " }\n",
1394
- "\n",
1395
- " [theme=dark] .colab-df-quickchart {\n",
1396
- " --bg-color: #3B4455;\n",
1397
- " --fill-color: #D2E3FC;\n",
1398
- " --hover-bg-color: #434B5C;\n",
1399
- " --hover-fill-color: #FFFFFF;\n",
1400
- " --disabled-bg-color: #3B4455;\n",
1401
- " --disabled-fill-color: #666;\n",
1402
- " }\n",
1403
- "\n",
1404
- " .colab-df-quickchart {\n",
1405
- " background-color: var(--bg-color);\n",
1406
- " border: none;\n",
1407
- " border-radius: 50%;\n",
1408
- " cursor: pointer;\n",
1409
- " display: none;\n",
1410
- " fill: var(--fill-color);\n",
1411
- " height: 32px;\n",
1412
- " padding: 0;\n",
1413
- " width: 32px;\n",
1414
- " }\n",
1415
- "\n",
1416
- " .colab-df-quickchart:hover {\n",
1417
- " background-color: var(--hover-bg-color);\n",
1418
- " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
1419
- " fill: var(--button-hover-fill-color);\n",
1420
- " }\n",
1421
- "\n",
1422
- " .colab-df-quickchart-complete:disabled,\n",
1423
- " .colab-df-quickchart-complete:disabled:hover {\n",
1424
- " background-color: var(--disabled-bg-color);\n",
1425
- " fill: var(--disabled-fill-color);\n",
1426
- " box-shadow: none;\n",
1427
- " }\n",
1428
- "\n",
1429
- " .colab-df-spinner {\n",
1430
- " border: 2px solid var(--fill-color);\n",
1431
- " border-color: transparent;\n",
1432
- " border-bottom-color: var(--fill-color);\n",
1433
- " animation:\n",
1434
- " spin 1s steps(1) infinite;\n",
1435
- " }\n",
1436
- "\n",
1437
- " @keyframes spin {\n",
1438
- " 0% {\n",
1439
- " border-color: transparent;\n",
1440
- " border-bottom-color: var(--fill-color);\n",
1441
- " border-left-color: var(--fill-color);\n",
1442
- " }\n",
1443
- " 20% {\n",
1444
- " border-color: transparent;\n",
1445
- " border-left-color: var(--fill-color);\n",
1446
- " border-top-color: var(--fill-color);\n",
1447
- " }\n",
1448
- " 30% {\n",
1449
- " border-color: transparent;\n",
1450
- " border-left-color: var(--fill-color);\n",
1451
- " border-top-color: var(--fill-color);\n",
1452
- " border-right-color: var(--fill-color);\n",
1453
- " }\n",
1454
- " 40% {\n",
1455
- " border-color: transparent;\n",
1456
- " border-right-color: var(--fill-color);\n",
1457
- " border-top-color: var(--fill-color);\n",
1458
- " }\n",
1459
- " 60% {\n",
1460
- " border-color: transparent;\n",
1461
- " border-right-color: var(--fill-color);\n",
1462
- " }\n",
1463
- " 80% {\n",
1464
- " border-color: transparent;\n",
1465
- " border-right-color: var(--fill-color);\n",
1466
- " border-bottom-color: var(--fill-color);\n",
1467
- " }\n",
1468
- " 90% {\n",
1469
- " border-color: transparent;\n",
1470
- " border-bottom-color: var(--fill-color);\n",
1471
- " }\n",
1472
- " }\n",
1473
- "</style>\n",
1474
- "\n",
1475
- " <script>\n",
1476
- " async function quickchart(key) {\n",
1477
- " const quickchartButtonEl =\n",
1478
- " document.querySelector('#' + key + ' button');\n",
1479
- " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
1480
- " quickchartButtonEl.classList.add('colab-df-spinner');\n",
1481
- " try {\n",
1482
- " const charts = await google.colab.kernel.invokeFunction(\n",
1483
- " 'suggestCharts', [key], {});\n",
1484
- " } catch (error) {\n",
1485
- " console.error('Error during call to suggestCharts:', error);\n",
1486
- " }\n",
1487
- " quickchartButtonEl.classList.remove('colab-df-spinner');\n",
1488
- " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
1489
- " }\n",
1490
- " (() => {\n",
1491
- " let quickchartButtonEl =\n",
1492
- " document.querySelector('#df-b03d12b4-7e50-4167-95b0-3df9e3d0a9e8 button');\n",
1493
- " quickchartButtonEl.style.display =\n",
1494
- " google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
1495
- " })();\n",
1496
- " </script>\n",
1497
- "</div>\n",
1498
- "\n",
1499
- " </div>\n",
1500
- " </div>\n"
1501
- ],
1502
- "application/vnd.google.colaboratory.intrinsic+json": {
1503
- "type": "dataframe",
1504
- "variable_name": "df1",
1505
- "summary": "{\n \"name\": \"df1\",\n \"rows\": 24783,\n \"fields\": [\n {\n \"column\": \"tweet\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24506,\n \"samples\": [\n \"didnt even get see baby today smh moms fault selfish bitch\",\n \"hoes got money mall ballin bitch buy something\",\n \"rt johnnyfootbali yeah kaepernick might biceps like greek god dude looks like conceived proboscis monkey\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"labels\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Offensive or Hate Speech\",\n \"Not Hate\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1506
- }
1507
- },
1508
- "metadata": {},
1509
- "execution_count": 32
1510
- }
1511
- ]
1512
- },
1513
- {
1514
- "cell_type": "code",
1515
- "source": [
1516
- "# Use a pipeline as a high-level helper\n",
1517
- "from transformers import pipeline\n",
1518
- "\n",
1519
- "pipe = pipeline(\"text-classification\", model=\"facebook/roberta-hate-speech-dynabench-r4-target\")"
1520
- ],
1521
- "metadata": {
1522
- "id": "1ey-IvgELgAJ"
1523
- },
1524
- "id": "1ey-IvgELgAJ",
1525
- "execution_count": 33,
1526
- "outputs": []
1527
- },
1528
- {
1529
- "cell_type": "code",
1530
- "source": [
1531
- "# Install necessary libraries\n",
1532
- "!pip install transformers\n",
1533
- "\n",
1534
- "import pandas as pd\n",
1535
- "from sklearn.model_selection import train_test_split\n",
1536
- "from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification\n",
1537
- "from datasets import Dataset\n",
1538
- "from transformers import Trainer, TrainingArguments\n",
1539
- "\n",
1540
- "\n",
1541
- "# Load your CSV file into a pandas DataFrame\n",
1542
- "data = pd.read_csv(\"twitter_data.csv\")\n",
1543
- "\n",
1544
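- "# Add a binary 'labels' column to 'data': 1 where 'class' equals the string 'hate_speech', else 0. NOTE(review): if 'class' holds numeric codes (0-2), this comparison never matches and every label becomes 0 - confirm against the CSV\n",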
1545
- "data['labels'] = [1 if tweet == 'hate_speech' else 0 for tweet in data['class']]\n",
1546
- "\n",
1547
- "# Split data into train and validation sets\n",
1548
- "train_texts, val_texts, train_labels, val_labels = train_test_split(data[\"tweet\"], data[\"labels\"], test_size=0.2, random_state=42)\n",
1549
- "\n",
1550
- "# Load pre-trained tokenizer and model\n",
1551
- "tokenizer = AutoTokenizer.from_pretrained(\"facebook/roberta-hate-speech-dynabench-r4-target\")\n",
1552
- "model = AutoModelForSequenceClassification.from_pretrained(\"facebook/roberta-hate-speech-dynabench-r4-target\")\n",
1553
- "\n",
1554
- "# Tokenize the input texts\n",
1555
- "train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)\n",
1556
- "val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)\n",
1557
- "\n",
1558
- "# Convert the labels to plain Python lists\n",
1559
- "train_labels = list(train_labels)\n",
1560
- "val_labels = list(val_labels)\n",
1561
- "\n",
1562
- "# Create datasets\n",
1563
- "train_dataset = Dataset.from_dict({\"input_ids\": train_encodings[\"input_ids\"],\n",
1564
- " \"attention_mask\": train_encodings[\"attention_mask\"],\n",
1565
- " \"labels\": train_labels})\n",
1566
- "\n",
1567
- "val_dataset = Dataset.from_dict({\"input_ids\": val_encodings[\"input_ids\"],\n",
1568
- " \"attention_mask\": val_encodings[\"attention_mask\"],\n",
1569
- " \"labels\": val_labels})\n",
1570
- "\n",
1571
- "# Put the model in training mode (this call alone does not run any training steps)\n",
1572
- "model.train(True)\n",
1573
- "\n",
1574
- "# TODO: actually fine-tune the model (Trainer and TrainingArguments are imported but unused) and then test it"
1575
- ],
1576
- "metadata": {
1577
- "colab": {
1578
- "base_uri": "https://localhost:8080/"
1579
- },
1580
- "id": "mq5qeFd1OemP",
1581
- "outputId": "43815cba-e57a-42da-8f1a-9058032ca5da"
1582
- },
1583
- "id": "mq5qeFd1OemP",
1584
- "execution_count": 40,
1585
- "outputs": [
1586
- {
1587
- "output_type": "stream",
1588
- "name": "stdout",
1589
- "text": [
1590
- "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.38.2)\n",
1591
- "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
1592
- "Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.3)\n",
1593
- "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n",
1594
- "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.0)\n",
1595
- "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
1596
- "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.12.25)\n",
1597
- "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
1598
- "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.2)\n",
1599
- "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.2)\n",
1600
- "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.2)\n",
1601
- "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (2023.6.0)\n",
1602
- "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (4.10.0)\n",
1603
- "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
1604
- "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
1605
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
1606
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.2.2)\n"
1607
- ]
1608
- },
1609
- {
1610
- "output_type": "execute_result",
1611
- "data": {
1612
- "text/plain": [
1613
- "RobertaForSequenceClassification(\n",
1614
- " (roberta): RobertaModel(\n",
1615
- " (embeddings): RobertaEmbeddings(\n",
1616
- " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
1617
- " (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
1618
- " (token_type_embeddings): Embedding(1, 768)\n",
1619
- " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
1620
- " (dropout): Dropout(p=0.1, inplace=False)\n",
1621
- " )\n",
1622
- " (encoder): RobertaEncoder(\n",
1623
- " (layer): ModuleList(\n",
1624
- " (0-11): 12 x RobertaLayer(\n",
1625
- " (attention): RobertaAttention(\n",
1626
- " (self): RobertaSelfAttention(\n",
1627
- " (query): Linear(in_features=768, out_features=768, bias=True)\n",
1628
- " (key): Linear(in_features=768, out_features=768, bias=True)\n",
1629
- " (value): Linear(in_features=768, out_features=768, bias=True)\n",
1630
- " (dropout): Dropout(p=0.1, inplace=False)\n",
1631
- " )\n",
1632
- " (output): RobertaSelfOutput(\n",
1633
- " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
1634
- " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
1635
- " (dropout): Dropout(p=0.1, inplace=False)\n",
1636
- " )\n",
1637
- " )\n",
1638
- " (intermediate): RobertaIntermediate(\n",
1639
- " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
1640
- " (intermediate_act_fn): GELUActivation()\n",
1641
- " )\n",
1642
- " (output): RobertaOutput(\n",
1643
- " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
1644
- " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
1645
- " (dropout): Dropout(p=0.1, inplace=False)\n",
1646
- " )\n",
1647
- " )\n",
1648
- " )\n",
1649
- " )\n",
1650
- " )\n",
1651
- " (classifier): RobertaClassificationHead(\n",
1652
- " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
1653
- " (dropout): Dropout(p=0.1, inplace=False)\n",
1654
- " (out_proj): Linear(in_features=768, out_features=2, bias=True)\n",
1655
- " )\n",
1656
- ")"
1657
- ]
1658
- },
1659
- "metadata": {},
1660
- "execution_count": 40
1661
- }
1662
- ]
1663
- },
1664
- {
1665
- "cell_type": "code",
1666
- "source": [
1667
- "text_classifier = pipeline(\"text-classification\", model=model, tokenizer=tokenizer)\n",
1668
- "def test_model(classifier=None):\n",
- "    \"\"\"Interactively classify statements typed at the prompt.\n",
- "\n",
- "    Reads statements in a loop, passes each non-empty one to the classifier\n",
- "    and prints the raw result. The loop ends when the user types 'exit'\n",
- "    (case-insensitive, surrounding whitespace ignored), when stdin reaches\n",
- "    EOF, or when the prompt is interrupted.\n",
- "\n",
- "    Args:\n",
- "        classifier: Callable taking one string and returning the prediction\n",
- "            to print. Defaults to the notebook-level `text_classifier`.\n",
- "    \"\"\"\n",
- "    if classifier is None:\n",
- "        classifier = text_classifier\n",
- "    while True:\n",
- "        try:\n",
- "            statement = input(\"Enter a statement to test (or type 'exit' to quit): \").strip()\n",
- "        except (EOFError, KeyboardInterrupt):\n",
- "            # Closed stdin or an interrupted prompt ends the session instead of\n",
- "            # surfacing a traceback in the cell output.\n",
- "            break\n",
- "        if statement.lower() == 'exit':\n",
- "            break\n",
- "        if not statement:\n",
- "            # Blank input carries nothing to classify; prompt again.\n",
- "            continue\n",
- "        predictions = classifier(statement)\n",
- "        print(predictions)"
1675
- ],
1676
- "metadata": {
1677
- "id": "l6MAu_NnUdxG"
1678
- },
1679
- "id": "l6MAu_NnUdxG",
1680
- "execution_count": 43,
1681
- "outputs": []
1682
- },
1683
- {
1684
- "cell_type": "code",
1685
- "source": [
1686
- "test_model()"
1687
- ],
1688
- "metadata": {
1689
- "colab": {
1690
- "base_uri": "https://localhost:8080/"
1691
- },
1692
- "id": "YRnBVSW2UjyW",
1693
- "outputId": "f723c664-95eb-406e-9178-46bee6a7f5af"
1694
- },
1695
- "id": "YRnBVSW2UjyW",
1696
- "execution_count": null,
1697
- "outputs": [
1698
- {
1699
- "output_type": "stream",
1700
- "name": "stdout",
1701
- "text": [
1702
- "Enter a statement to test (or type 'exit' to quit): kill\n",
1703
- "[{'label': 'nothate', 'score': 0.9961766004562378}]\n"
1704
- ]
1705
- }
1706
- ]
1707
- },
1708
- {
1709
- "cell_type": "code",
1710
- "source": [],
1711
- "metadata": {
1712
- "id": "jCFfdS6CUNN7"
1713
- },
1714
- "id": "jCFfdS6CUNN7",
1715
- "execution_count": null,
1716
- "outputs": []
1717
- },
1718
- {
1719
- "cell_type": "markdown",
1720
- "source": [],
1721
- "metadata": {
1722
- "id": "XOIePsq1N5Fo"
1723
- },
1724
- "id": "XOIePsq1N5Fo"
1725
- },
1726
- {
1727
- "cell_type": "code",
1728
- "source": [
1729
- "# Report accuracy and a classification report for `clf` on the test split.\n",
- "# NOTE(review): `clf`, `X_test` and `y_test` are not defined in this cell; they must\n",
- "# already exist from earlier cells - confirm this holds on a fresh Restart & Run All.\n",
- "# NOTE(review): this import sits far below the notebook's import cells.\n",
- "from sklearn.metrics import accuracy_score, classification_report\n",
1730
- "\n",
1731
- "# Predictions for the test inputs; reused by both metrics below.\n",
- "y_pred = clf.predict(X_test)\n",
1732
- "print(f\"Accuracy: {accuracy_score(y_test, y_pred)}\")\n",
1733
- "print(classification_report(y_test, y_pred))"
1734
- ],
1735
- "metadata": {
1736
- "id": "sdFRCXtGY3yI"
1737
- },
1738
- "id": "sdFRCXtGY3yI",
1739
- "execution_count": null,
1740
- "outputs": []
1741
- },
1742
- {
1743
- "cell_type": "code",
1744
- "execution_count": null,
1745
- "metadata": {
1746
- "id": "fb36a279"
1747
- },
1748
- "outputs": [],
1749
- "source": [],
1750
- "id": "fb36a279"
1751
- }
1752
- ],
1753
- "metadata": {
1754
- "colab": {
1755
- "provenance": []
1756
- },
1757
- "kernelspec": {
1758
- "display_name": "Python 3 (ipykernel)",
1759
- "language": "python",
1760
- "name": "python3"
1761
- },
1762
- "language_info": {
1763
- "codemirror_mode": {
1764
- "name": "ipython",
1765
- "version": 3
1766
- },
1767
- "file_extension": ".py",
1768
- "mimetype": "text/x-python",
1769
- "name": "python",
1770
- "nbconvert_exporter": "python",
1771
- "pygments_lexer": "ipython3",
1772
- "version": "3.11.5"
1773
- }
1774
- },
1775
- "nbformat": 4,
1776
- "nbformat_minor": 5
1777
- }