Kartik17 committed on
Commit
1b606c3
1 Parent(s): 368a008

Upload PROJECT TOXIC COMMENT ANALYZER.ipynb

Browse files
Files changed (1) hide show
  1. PROJECT TOXIC COMMENT ANALYZER.ipynb +1486 -0
PROJECT TOXIC COMMENT ANALYZER.ipynb ADDED
@@ -0,0 +1,1486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a9a3a647",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
14
+ "\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "import os\n",
20
+ "import pandas as pd\n",
21
+ "import tensorflow as tf\n",
22
+ "import numpy as np"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 167,
28
+ "id": "52960768",
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "data": {
33
+ "text/html": [
34
+ "<div>\n",
35
+ "<style scoped>\n",
36
+ " .dataframe tbody tr th:only-of-type {\n",
37
+ " vertical-align: middle;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe tbody tr th {\n",
41
+ " vertical-align: top;\n",
42
+ " }\n",
43
+ "\n",
44
+ " .dataframe thead th {\n",
45
+ " text-align: right;\n",
46
+ " }\n",
47
+ "</style>\n",
48
+ "<table border=\"1\" class=\"dataframe\">\n",
49
+ " <thead>\n",
50
+ " <tr style=\"text-align: right;\">\n",
51
+ " <th></th>\n",
52
+ " <th>id</th>\n",
53
+ " <th>comment_text</th>\n",
54
+ " <th>toxic</th>\n",
55
+ " <th>severe_toxic</th>\n",
56
+ " <th>obscene</th>\n",
57
+ " <th>threat</th>\n",
58
+ " <th>insult</th>\n",
59
+ " <th>identity_hate</th>\n",
60
+ " </tr>\n",
61
+ " </thead>\n",
62
+ " <tbody>\n",
63
+ " <tr>\n",
64
+ " <th>0</th>\n",
65
+ " <td>0000997932d777bf</td>\n",
66
+ " <td>Explanation\\nWhy the edits made under my usern...</td>\n",
67
+ " <td>0</td>\n",
68
+ " <td>0</td>\n",
69
+ " <td>0</td>\n",
70
+ " <td>0</td>\n",
71
+ " <td>0</td>\n",
72
+ " <td>0</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>1</th>\n",
76
+ " <td>000103f0d9cfb60f</td>\n",
77
+ " <td>D'aww! He matches this background colour I'm s...</td>\n",
78
+ " <td>0</td>\n",
79
+ " <td>0</td>\n",
80
+ " <td>0</td>\n",
81
+ " <td>0</td>\n",
82
+ " <td>0</td>\n",
83
+ " <td>0</td>\n",
84
+ " </tr>\n",
85
+ " <tr>\n",
86
+ " <th>2</th>\n",
87
+ " <td>000113f07ec002fd</td>\n",
88
+ " <td>Hey man, I'm really not trying to edit war. It...</td>\n",
89
+ " <td>0</td>\n",
90
+ " <td>0</td>\n",
91
+ " <td>0</td>\n",
92
+ " <td>0</td>\n",
93
+ " <td>0</td>\n",
94
+ " <td>0</td>\n",
95
+ " </tr>\n",
96
+ " <tr>\n",
97
+ " <th>3</th>\n",
98
+ " <td>0001b41b1c6bb37e</td>\n",
99
+ " <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n",
100
+ " <td>0</td>\n",
101
+ " <td>0</td>\n",
102
+ " <td>0</td>\n",
103
+ " <td>0</td>\n",
104
+ " <td>0</td>\n",
105
+ " <td>0</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>4</th>\n",
109
+ " <td>0001d958c54c6e35</td>\n",
110
+ " <td>You, sir, are my hero. Any chance you remember...</td>\n",
111
+ " <td>0</td>\n",
112
+ " <td>0</td>\n",
113
+ " <td>0</td>\n",
114
+ " <td>0</td>\n",
115
+ " <td>0</td>\n",
116
+ " <td>0</td>\n",
117
+ " </tr>\n",
118
+ " </tbody>\n",
119
+ "</table>\n",
120
+ "</div>"
121
+ ],
122
+ "text/plain": [
123
+ " id comment_text toxic \\\n",
124
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n",
125
+ "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n",
126
+ "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n",
127
+ "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n",
128
+ "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n",
129
+ "\n",
130
+ " severe_toxic obscene threat insult identity_hate \n",
131
+ "0 0 0 0 0 0 \n",
132
+ "1 0 0 0 0 0 \n",
133
+ "2 0 0 0 0 0 \n",
134
+ "3 0 0 0 0 0 \n",
135
+ "4 0 0 0 0 0 "
136
+ ]
137
+ },
138
+ "execution_count": 167,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ },
142
+ {
143
+ "name": "stdout",
144
+ "output_type": "stream",
145
+ "text": [
146
+ "1/1 [==============================] - 0s 327ms/step\n"
147
+ ]
148
+ }
149
+ ],
150
+ "source": [
151
+ "data=pd.read_csv('train.csv')\n",
152
+ "data.head(5)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 4,
158
+ "id": "4bb87073",
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "data": {
163
+ "text/plain": [
164
+ "\"Sorry if the word 'nonsense' was offensive to you. Anyway, I'm not intending to write anything in the article(wow they would jump on me for vandalism), I'm merely requesting that it be more encyclopedic so one can use it for school as a reference. I have been to the selective breeding page but it's almost a stub. It points to 'animal breeding' which is a short messy article that gives you no info. There must be someone around with expertise in eugenics? 93.161.107.169\""
165
+ ]
166
+ },
167
+ "execution_count": 4,
168
+ "metadata": {},
169
+ "output_type": "execute_result"
170
+ }
171
+ ],
172
+ "source": [
173
+ "data['comment_text'][8]"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 5,
179
+ "id": "c6e7509b",
180
+ "metadata": {},
181
+ "outputs": [
182
+ {
183
+ "data": {
184
+ "text/plain": [
185
+ "Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',\n",
186
+ " 'insult', 'identity_hate'],\n",
187
+ " dtype='object')"
188
+ ]
189
+ },
190
+ "execution_count": 5,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ " data.columns"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 6,
202
+ "id": "2802af7a",
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "data": {
207
+ "text/plain": [
208
+ "(159571, 8)"
209
+ ]
210
+ },
211
+ "execution_count": 6,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "data.shape"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 7,
223
+ "id": "97449fcb",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "data": {
228
+ "text/plain": [
229
+ "toxic 0\n",
230
+ "severe_toxic 0\n",
231
+ "obscene 0\n",
232
+ "threat 0\n",
233
+ "insult 0\n",
234
+ "identity_hate 0\n",
235
+ "Name: 9, dtype: int64"
236
+ ]
237
+ },
238
+ "execution_count": 7,
239
+ "metadata": {},
240
+ "output_type": "execute_result"
241
+ }
242
+ ],
243
+ "source": [
244
+ "data[data.columns[2:]].iloc[9]"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": null,
250
+ "id": "8844c1b7",
251
+ "metadata": {},
252
+ "outputs": [],
253
+ "source": []
254
+ },
255
+ {
256
+ "cell_type": "markdown",
257
+ "id": "bbd67b78",
258
+ "metadata": {},
259
+ "source": [
260
+ "## Preprocessing"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 8,
266
+ "id": "6d23f922",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "from tensorflow.keras.layers import TextVectorization"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 9,
276
+ "id": "a3d9e014",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "x=data['comment_text']\n",
281
+ "y=data[data.columns[2:]].values"
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": 10,
287
+ "id": "eb1eefc0",
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "data": {
292
+ "text/plain": [
293
+ "0 Explanation\\nWhy the edits made under my usern...\n",
294
+ "1 D'aww! He matches this background colour I'm s...\n",
295
+ "2 Hey man, I'm really not trying to edit war. It...\n",
296
+ "3 \"\\nMore\\nI can't make any real suggestions on ...\n",
297
+ "4 You, sir, are my hero. Any chance you remember...\n",
298
+ " ... \n",
299
+ "159566 \":::::And for the second time of asking, when ...\n",
300
+ "159567 You should be ashamed of yourself \\n\\nThat is ...\n",
301
+ "159568 Spitzer \\n\\nUmm, theres no actual article for ...\n",
302
+ "159569 And it looks like it was actually you who put ...\n",
303
+ "159570 \"\\nAnd ... I really don't think you understand...\n",
304
+ "Name: comment_text, Length: 159571, dtype: object"
305
+ ]
306
+ },
307
+ "execution_count": 10,
308
+ "metadata": {},
309
+ "output_type": "execute_result"
310
+ }
311
+ ],
312
+ "source": [
313
+ "x"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": 11,
319
+ "id": "414f8a4c",
320
+ "metadata": {},
321
+ "outputs": [
322
+ {
323
+ "data": {
324
+ "text/plain": [
325
+ "array([[0, 0, 0, 0, 0, 0],\n",
326
+ " [0, 0, 0, 0, 0, 0],\n",
327
+ " [0, 0, 0, 0, 0, 0],\n",
328
+ " ...,\n",
329
+ " [0, 0, 0, 0, 0, 0],\n",
330
+ " [0, 0, 0, 0, 0, 0],\n",
331
+ " [0, 0, 0, 0, 0, 0]], dtype=int64)"
332
+ ]
333
+ },
334
+ "execution_count": 11,
335
+ "metadata": {},
336
+ "output_type": "execute_result"
337
+ }
338
+ ],
339
+ "source": [
340
+ "y"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 12,
346
+ "id": "70ec2244",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "max_features=200000"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 13,
356
+ "id": "b6a83b69",
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n",
364
+ "\n"
365
+ ]
366
+ }
367
+ ],
368
+ "source": [
369
+ "vectorizer=TextVectorization(max_tokens=max_features,\n",
370
+ " output_sequence_length=1800,\n",
371
+ " output_mode='int')"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 14,
377
+ "id": "ba246221",
378
+ "metadata": {},
379
+ "outputs": [
380
+ {
381
+ "data": {
382
+ "text/plain": [
383
+ "['', '[UNK]']"
384
+ ]
385
+ },
386
+ "execution_count": 14,
387
+ "metadata": {},
388
+ "output_type": "execute_result"
389
+ }
390
+ ],
391
+ "source": [
392
+ "vectorizer.get_vocabulary()"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "code",
397
+ "execution_count": 15,
398
+ "id": "9648914d",
399
+ "metadata": {},
400
+ "outputs": [
401
+ {
402
+ "name": "stdout",
403
+ "output_type": "stream",
404
+ "text": [
405
+ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\utils\\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead.\n",
406
+ "\n"
407
+ ]
408
+ }
409
+ ],
410
+ "source": [
411
+ "vectorizer.adapt(x.values)"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 16,
417
+ "id": "75b035a9",
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/plain": [
423
+ "<tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 19, 7, 3666, 2891, 338], dtype=int64)>"
424
+ ]
425
+ },
426
+ "execution_count": 16,
427
+ "metadata": {},
428
+ "output_type": "execute_result"
429
+ }
430
+ ],
431
+ "source": [
432
+ "vectorizer(\"have you watched breaking bad\")[:5]"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 17,
438
+ "id": "8854984d",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "vectorized_text=vectorizer(x.values)"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 18,
448
+ "id": "9fb407a3",
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=\n",
455
+ "array([[ 645, 76, 2, ..., 0, 0, 0],\n",
456
+ " [ 1, 54, 2489, ..., 0, 0, 0],\n",
457
+ " [ 425, 441, 70, ..., 0, 0, 0],\n",
458
+ " ...,\n",
459
+ " [32445, 7392, 383, ..., 0, 0, 0],\n",
460
+ " [ 5, 12, 534, ..., 0, 0, 0],\n",
461
+ " [ 5, 8, 130, ..., 0, 0, 0]], dtype=int64)>"
462
+ ]
463
+ },
464
+ "execution_count": 18,
465
+ "metadata": {},
466
+ "output_type": "execute_result"
467
+ }
468
+ ],
469
+ "source": [
470
+ "vectorized_text"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 19,
476
+ "id": "0aa74efc",
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "dataset=tf.data.Dataset.from_tensor_slices((vectorized_text, y))\n",
481
+ "dataset=dataset.cache()\n",
482
+ "dataset=dataset.shuffle(160000)\n",
483
+ "dataset=dataset.batch(16)\n",
484
+ "dataset=dataset.prefetch(8)"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": 20,
490
+ "id": "ff040bf8",
491
+ "metadata": {},
492
+ "outputs": [
493
+ {
494
+ "data": {
495
+ "text/plain": [
496
+ "9973.1875"
497
+ ]
498
+ },
499
+ "execution_count": 20,
500
+ "metadata": {},
501
+ "output_type": "execute_result"
502
+ }
503
+ ],
504
+ "source": [
505
+ "159571/16"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 21,
511
+ "id": "fd8b18f5",
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "batch_x, batch_y = dataset.as_numpy_iterator().next()"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 22,
521
+ "id": "d81bb1af",
522
+ "metadata": {},
523
+ "outputs": [
524
+ {
525
+ "data": {
526
+ "text/plain": [
527
+ "(16, 1800)"
528
+ ]
529
+ },
530
+ "execution_count": 22,
531
+ "metadata": {},
532
+ "output_type": "execute_result"
533
+ }
534
+ ],
535
+ "source": [
536
+ "batch_x.shape"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": 23,
542
+ "id": "2cfeca51",
543
+ "metadata": {},
544
+ "outputs": [
545
+ {
546
+ "data": {
547
+ "text/plain": [
548
+ "(16, 6)"
549
+ ]
550
+ },
551
+ "execution_count": 23,
552
+ "metadata": {},
553
+ "output_type": "execute_result"
554
+ }
555
+ ],
556
+ "source": [
557
+ "batch_y.shape"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 24,
563
+ "id": "9d8a90ce",
564
+ "metadata": {},
565
+ "outputs": [
566
+ {
567
+ "data": {
568
+ "text/plain": [
569
+ "9974"
570
+ ]
571
+ },
572
+ "execution_count": 24,
573
+ "metadata": {},
574
+ "output_type": "execute_result"
575
+ }
576
+ ],
577
+ "source": [
578
+ "len(dataset)"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": 25,
584
+ "id": "5a111205",
585
+ "metadata": {},
586
+ "outputs": [
587
+ {
588
+ "data": {
589
+ "text/plain": [
590
+ "6981"
591
+ ]
592
+ },
593
+ "execution_count": 25,
594
+ "metadata": {},
595
+ "output_type": "execute_result"
596
+ }
597
+ ],
598
+ "source": [
599
+ "int(len(dataset)*.7)"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": 26,
605
+ "id": "34094209",
606
+ "metadata": {},
607
+ "outputs": [],
608
+ "source": [
609
+ "train=dataset.take(int(len(dataset)*.7))\n",
610
+ "val=dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))\n",
611
+ "test=dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "execution_count": 27,
617
+ "id": "2e5369af",
618
+ "metadata": {},
619
+ "outputs": [
620
+ {
621
+ "data": {
622
+ "text/plain": [
623
+ "(6981, 1994, 997)"
624
+ ]
625
+ },
626
+ "execution_count": 27,
627
+ "metadata": {},
628
+ "output_type": "execute_result"
629
+ }
630
+ ],
631
+ "source": [
632
+ "len(train),len(val),len(test)"
633
+ ]
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": 28,
638
+ "id": "3bb32ca4",
639
+ "metadata": {},
640
+ "outputs": [],
641
+ "source": [
642
+ "train_generator=train.as_numpy_iterator()"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 29,
648
+ "id": "32f4500b",
649
+ "metadata": {},
650
+ "outputs": [
651
+ {
652
+ "data": {
653
+ "text/plain": [
654
+ "(array([[ 73, 9, 12, ..., 0, 0, 0],\n",
655
+ " [182862, 88, 7, ..., 0, 0, 0],\n",
656
+ " [ 4384, 274, 139, ..., 0, 0, 0],\n",
657
+ " ...,\n",
658
+ " [ 14, 9, 21, ..., 0, 0, 0],\n",
659
+ " [ 1188, 399, 123, ..., 0, 0, 0],\n",
660
+ " [ 46927, 175, 425, ..., 0, 0, 0]], dtype=int64),\n",
661
+ " array([[0, 0, 0, 0, 0, 0],\n",
662
+ " [0, 0, 0, 0, 0, 0],\n",
663
+ " [1, 0, 1, 0, 1, 0],\n",
664
+ " [0, 0, 0, 0, 0, 0],\n",
665
+ " [0, 0, 0, 0, 0, 0],\n",
666
+ " [0, 0, 0, 0, 0, 0],\n",
667
+ " [0, 0, 0, 0, 0, 0],\n",
668
+ " [0, 0, 0, 0, 0, 0],\n",
669
+ " [0, 0, 0, 0, 0, 0],\n",
670
+ " [0, 0, 0, 0, 0, 0],\n",
671
+ " [0, 0, 0, 0, 0, 0],\n",
672
+ " [0, 0, 0, 0, 0, 0],\n",
673
+ " [0, 0, 0, 0, 0, 0],\n",
674
+ " [0, 0, 0, 0, 0, 0],\n",
675
+ " [0, 0, 0, 0, 0, 0],\n",
676
+ " [0, 0, 0, 0, 0, 0]], dtype=int64))"
677
+ ]
678
+ },
679
+ "execution_count": 29,
680
+ "metadata": {},
681
+ "output_type": "execute_result"
682
+ }
683
+ ],
684
+ "source": [
685
+ "train_generator.next()"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 30,
691
+ "id": "cbc9a9b2",
692
+ "metadata": {},
693
+ "outputs": [],
694
+ "source": [
695
+ "from tensorflow.keras.models import Sequential\n",
696
+ "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 31,
702
+ "id": "6dd6bf3d",
703
+ "metadata": {},
704
+ "outputs": [],
705
+ "source": [
706
+ "model=Sequential()"
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "code",
711
+ "execution_count": 32,
712
+ "id": "e33e5c86",
713
+ "metadata": {},
714
+ "outputs": [],
715
+ "source": [
716
+ "model.add(Embedding(max_features+1, 32))\n",
717
+ "model.add(Bidirectional(LSTM(32, activation='tanh')))\n",
718
+ "model.add(Dense(128, activation='relu'))\n",
719
+ "model.add(Dense(256, activation='relu'))\n",
720
+ "model.add(Dense(128, activation='relu'))\n",
721
+ "model.add(Dense(6, activation='sigmoid'))"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 33,
727
+ "id": "6821b620",
728
+ "metadata": {},
729
+ "outputs": [
730
+ {
731
+ "name": "stdout",
732
+ "output_type": "stream",
733
+ "text": [
734
+ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\optimizers\\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n",
735
+ "\n"
736
+ ]
737
+ }
738
+ ],
739
+ "source": [
740
+ "model.compile(loss='BinaryCrossentropy', optimizer='adam', metrics=['accuracy'])"
741
+ ]
742
+ },
743
+ {
744
+ "cell_type": "code",
745
+ "execution_count": 34,
746
+ "id": "f06f01e5",
747
+ "metadata": {},
748
+ "outputs": [
749
+ {
750
+ "name": "stdout",
751
+ "output_type": "stream",
752
+ "text": [
753
+ "Model: \"sequential\"\n",
754
+ "_________________________________________________________________\n",
755
+ " Layer (type) Output Shape Param # \n",
756
+ "=================================================================\n",
757
+ " embedding (Embedding) (None, None, 32) 6400032 \n",
758
+ " \n",
759
+ " bidirectional (Bidirection (None, 64) 16640 \n",
760
+ " al) \n",
761
+ " \n",
762
+ " dense (Dense) (None, 128) 8320 \n",
763
+ " \n",
764
+ " dense_1 (Dense) (None, 256) 33024 \n",
765
+ " \n",
766
+ " dense_2 (Dense) (None, 128) 32896 \n",
767
+ " \n",
768
+ " dense_3 (Dense) (None, 6) 774 \n",
769
+ " \n",
770
+ "=================================================================\n",
771
+ "Total params: 6491686 (24.76 MB)\n",
772
+ "Trainable params: 6491686 (24.76 MB)\n",
773
+ "Non-trainable params: 0 (0.00 Byte)\n",
774
+ "_________________________________________________________________\n"
775
+ ]
776
+ }
777
+ ],
778
+ "source": [
779
+ "model.summary()"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 36,
785
+ "id": "376ceed5",
786
+ "metadata": {},
787
+ "outputs": [
788
+ {
789
+ "name": "stdout",
790
+ "output_type": "stream",
791
+ "text": [
792
+ "Epoch 1/10\n",
793
+ "WARNING:tensorflow:From C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead.\n",
794
+ "\n",
795
+ "6981/6981 [==============================] - 5071s 726ms/step - loss: 0.0635 - accuracy: 0.9855 - val_loss: 0.0452 - val_accuracy: 0.9946\n",
796
+ "Epoch 2/10\n",
797
+ "6981/6981 [==============================] - 4516s 647ms/step - loss: 0.0454 - accuracy: 0.9942 - val_loss: 0.0399 - val_accuracy: 0.9938\n",
798
+ "Epoch 3/10\n",
799
+ "6981/6981 [==============================] - 4100s 587ms/step - loss: 0.0407 - accuracy: 0.9889 - val_loss: 0.0373 - val_accuracy: 0.9941\n",
800
+ "Epoch 4/10\n",
801
+ "6981/6981 [==============================] - 4111s 589ms/step - loss: 0.0371 - accuracy: 0.9920 - val_loss: 0.0327 - val_accuracy: 0.9948\n",
802
+ "Epoch 5/10\n",
803
+ "6981/6981 [==============================] - 4691s 672ms/step - loss: 0.0334 - accuracy: 0.9941 - val_loss: 0.0302 - val_accuracy: 0.9940\n",
804
+ "Epoch 6/10\n",
805
+ "6981/6981 [==============================] - 5055s 724ms/step - loss: 0.0311 - accuracy: 0.9841 - val_loss: 0.0275 - val_accuracy: 0.9944\n",
806
+ "Epoch 7/10\n",
807
+ "6981/6981 [==============================] - 4508s 646ms/step - loss: 0.0277 - accuracy: 0.9937 - val_loss: 0.0245 - val_accuracy: 0.9930\n",
808
+ "Epoch 8/10\n",
809
+ "6981/6981 [==============================] - 4479s 642ms/step - loss: 0.0254 - accuracy: 0.9907 - val_loss: 0.0228 - val_accuracy: 0.9940\n",
810
+ "Epoch 9/10\n",
811
+ "6981/6981 [==============================] - 4501s 645ms/step - loss: 0.0228 - accuracy: 0.9892 - val_loss: 0.0193 - val_accuracy: 0.9950\n",
812
+ "Epoch 10/10\n",
813
+ "6981/6981 [==============================] - 4523s 648ms/step - loss: 0.0209 - accuracy: 0.9200 - val_loss: 0.0192 - val_accuracy: 0.9943\n"
814
+ ]
815
+ }
816
+ ],
817
+ "source": [
818
+ "history=model.fit(train, epochs=10, validation_data=val)"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": 37,
824
+ "id": "cb6501e6",
825
+ "metadata": {},
826
+ "outputs": [
827
+ {
828
+ "name": "stdout",
829
+ "output_type": "stream",
830
+ "text": [
831
+ "997/997 [==============================] - 158s 146ms/step - loss: 0.0188 - accuracy: 0.9940\n"
832
+ ]
833
+ },
834
+ {
835
+ "data": {
836
+ "text/plain": [
837
+ "[0.018809018656611443, 0.9939819574356079]"
838
+ ]
839
+ },
840
+ "execution_count": 37,
841
+ "metadata": {},
842
+ "output_type": "execute_result"
843
+ }
844
+ ],
845
+ "source": [
846
+ "model.evaluate(test)"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": 40,
852
+ "id": "92408998",
853
+ "metadata": {},
854
+ "outputs": [],
855
+ "source": [
856
+ "x_batch, y_batch = test.as_numpy_iterator().next()"
857
+ ]
858
+ },
859
+ {
860
+ "cell_type": "code",
861
+ "execution_count": 41,
862
+ "id": "1c555107",
863
+ "metadata": {},
864
+ "outputs": [
865
+ {
866
+ "name": "stdout",
867
+ "output_type": "stream",
868
+ "text": [
869
+ "1/1 [==============================] - 2s 2s/step\n"
870
+ ]
871
+ },
872
+ {
873
+ "data": {
874
+ "text/plain": [
875
+ "array([[0, 0, 0, 0, 0, 0],\n",
876
+ " [0, 0, 0, 0, 0, 0],\n",
877
+ " [1, 0, 0, 0, 0, 0],\n",
878
+ " [0, 0, 0, 0, 0, 0],\n",
879
+ " [0, 0, 0, 0, 0, 0],\n",
880
+ " [0, 0, 0, 0, 0, 0],\n",
881
+ " [0, 0, 0, 0, 0, 0],\n",
882
+ " [0, 0, 0, 0, 0, 0],\n",
883
+ " [0, 0, 0, 0, 0, 0],\n",
884
+ " [0, 0, 0, 0, 0, 0],\n",
885
+ " [0, 0, 0, 0, 0, 0],\n",
886
+ " [0, 0, 0, 0, 0, 0],\n",
887
+ " [0, 0, 0, 0, 0, 0],\n",
888
+ " [1, 0, 1, 0, 1, 0],\n",
889
+ " [0, 0, 0, 0, 0, 0],\n",
890
+ " [0, 0, 0, 0, 0, 0]])"
891
+ ]
892
+ },
893
+ "execution_count": 41,
894
+ "metadata": {},
895
+ "output_type": "execute_result"
896
+ }
897
+ ],
898
+ "source": [
899
+ "(model.predict(x_batch) > 0.5).astype(int)"
900
+ ]
901
+ },
902
+ {
903
+ "cell_type": "code",
904
+ "execution_count": 42,
905
+ "id": "26a06914",
906
+ "metadata": {},
907
+ "outputs": [
908
+ {
909
+ "data": {
910
+ "text/plain": [
911
+ "array([[0, 0, 0, 0, 0, 0],\n",
912
+ " [0, 0, 0, 0, 0, 0],\n",
913
+ " [1, 0, 0, 0, 0, 0],\n",
914
+ " [0, 0, 0, 0, 0, 0],\n",
915
+ " [0, 0, 0, 0, 0, 0],\n",
916
+ " [0, 0, 0, 0, 0, 0],\n",
917
+ " [0, 0, 0, 0, 0, 0],\n",
918
+ " [0, 0, 0, 0, 0, 0],\n",
919
+ " [0, 0, 0, 0, 0, 0],\n",
920
+ " [0, 0, 0, 0, 0, 0],\n",
921
+ " [0, 0, 0, 0, 0, 0],\n",
922
+ " [0, 0, 0, 0, 0, 0],\n",
923
+ " [0, 0, 0, 0, 0, 0],\n",
924
+ " [1, 0, 1, 0, 1, 0],\n",
925
+ " [0, 0, 0, 0, 0, 0],\n",
926
+ " [0, 0, 0, 0, 0, 0]], dtype=int64)"
927
+ ]
928
+ },
929
+ "execution_count": 42,
930
+ "metadata": {},
931
+ "output_type": "execute_result"
932
+ }
933
+ ],
934
+ "source": [
935
+ "y_batch"
936
+ ]
937
+ },
938
+ {
939
+ "cell_type": "code",
940
+ "execution_count": 49,
941
+ "id": "0ef7c06b",
942
+ "metadata": {},
943
+ "outputs": [],
944
+ "source": [
945
+ "input_text=vectorizer('I am coming to kill you pal')"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "code",
950
+ "execution_count": 50,
951
+ "id": "5bb057fa",
952
+ "metadata": {},
953
+ "outputs": [
954
+ {
955
+ "data": {
956
+ "text/plain": [
957
+ "<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 8, 74, 939, 3, 950, 7, 5762], dtype=int64)>"
958
+ ]
959
+ },
960
+ "execution_count": 50,
961
+ "metadata": {},
962
+ "output_type": "execute_result"
963
+ }
964
+ ],
965
+ "source": [
966
+ "input_text[:7]"
967
+ ]
968
+ },
969
+ {
970
+ "cell_type": "code",
971
+ "execution_count": 51,
972
+ "id": "7ab223e7",
973
+ "metadata": {},
974
+ "outputs": [],
975
+ "source": [
976
+ "batch=test.as_numpy_iterator().next()"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": 52,
982
+ "id": "3986d97b",
983
+ "metadata": {},
984
+ "outputs": [
985
+ {
986
+ "name": "stdout",
987
+ "output_type": "stream",
988
+ "text": [
989
+ "1/1 [==============================] - 0s 78ms/step\n"
990
+ ]
991
+ }
992
+ ],
993
+ "source": [
994
+ "res=model.predict(np.expand_dims(input_text,0))"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "code",
999
+ "execution_count": 53,
1000
+ "id": "5df2d7da",
1001
+ "metadata": {},
1002
+ "outputs": [
1003
+ {
1004
+ "data": {
1005
+ "text/plain": [
1006
+ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n",
1007
+ " 'identity_hate'],\n",
1008
+ " dtype='object')"
1009
+ ]
1010
+ },
1011
+ "execution_count": 53,
1012
+ "metadata": {},
1013
+ "output_type": "execute_result"
1014
+ }
1015
+ ],
1016
+ "source": [
1017
+ "data.columns[2:]"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": 54,
1023
+ "id": "ee22bb73",
1024
+ "metadata": {},
1025
+ "outputs": [
1026
+ {
1027
+ "data": {
1028
+ "text/plain": [
1029
+ "array([[0.54140395, 0.00114176, 0.01782109, 0.10045966, 0.0319472 ,\n",
1030
+ " 0.02094165]], dtype=float32)"
1031
+ ]
1032
+ },
1033
+ "execution_count": 54,
1034
+ "metadata": {},
1035
+ "output_type": "execute_result"
1036
+ }
1037
+ ],
1038
+ "source": [
1039
+ "res"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "markdown",
1044
+ "id": "fa7378c8",
1045
+ "metadata": {},
1046
+ "source": [
1047
+ "## Save the Model"
1048
+ ]
1049
+ },
1050
+ {
1051
+ "cell_type": "code",
1052
+ "execution_count": 59,
1053
+ "id": "c2b08a8c",
1054
+ "metadata": {},
1055
+ "outputs": [],
1056
+ "source": [
1057
+ "model.save('finalproject.keras')"
1058
+ ]
1059
+ },
1060
+ {
1061
+ "cell_type": "code",
1062
+ "execution_count": 60,
1063
+ "id": "71e114bc",
1064
+ "metadata": {},
1065
+ "outputs": [
1066
+ {
1067
+ "name": "stderr",
1068
+ "output_type": "stream",
1069
+ "text": [
1070
+ "C:\\Users\\karti\\anaconda3\\Lib\\site-packages\\keras\\src\\engine\\training.py:3103: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n",
1071
+ " saving_api.save_model(\n"
1072
+ ]
1073
+ }
1074
+ ],
1075
+ "source": [
1076
+ "model.save('finalprojecttoxic.h5')"
1077
+ ]
1078
+ },
1079
+ {
1080
+ "cell_type": "markdown",
1081
+ "id": "6abdcdb8",
1082
+ "metadata": {},
1083
+ "source": [
1084
+ "## Translating Text to English"
1085
+ ]
1086
+ },
1087
+ {
1088
+ "cell_type": "code",
1089
+ "execution_count": 97,
1090
+ "id": "442cd16b",
1091
+ "metadata": {},
1092
+ "outputs": [],
1093
+ "source": [
1094
+ "from transformers import pipeline"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "cell_type": "code",
1099
+ "execution_count": 125,
1100
+ "id": "95b31788",
1101
+ "metadata": {},
1102
+ "outputs": [],
1103
+ "source": [
1104
+ "translator_german=pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-de-en\", tokenizer=\"Helsinki-NLP/opus-mt-de-en\")"
1105
+ ]
1106
+ },
1107
+ {
1108
+ "cell_type": "code",
1109
+ "execution_count": 120,
1110
+ "id": "7e882490",
1111
+ "metadata": {},
1112
+ "outputs": [],
1113
+ "source": [
1114
+ "german=\"Hallo, wie heißt du?\""
1115
+ ]
1116
+ },
1117
+ {
1118
+ "cell_type": "code",
1119
+ "execution_count": 126,
1120
+ "id": "dcfefba8",
1121
+ "metadata": {},
1122
+ "outputs": [
1123
+ {
1124
+ "data": {
1125
+ "text/plain": [
1126
+ "\"Hello, what's your name?\""
1127
+ ]
1128
+ },
1129
+ "execution_count": 126,
1130
+ "metadata": {},
1131
+ "output_type": "execute_result"
1132
+ }
1133
+ ],
1134
+ "source": [
1135
+ "en_to_german=translator_german(german)\n",
1136
+ "en_to_german[0]['translation_text']"
1137
+ ]
1138
+ },
1139
+ {
1140
+ "cell_type": "code",
1141
+ "execution_count": 107,
1142
+ "id": "ea54de34",
1143
+ "metadata": {},
1144
+ "outputs": [],
1145
+ "source": [
1146
+ "translator_spanish = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-es-en\", tokenizer=\"Helsinki-NLP/opus-mt-es-en\")"
1147
+ ]
1148
+ },
1149
+ {
1150
+ "cell_type": "code",
1151
+ "execution_count": 117,
1152
+ "id": "07f1c640",
1153
+ "metadata": {},
1154
+ "outputs": [],
1155
+ "source": [
1156
+ "spanish_text = \"hola como estas\""
1157
+ ]
1158
+ },
1159
+ {
1160
+ "cell_type": "code",
1161
+ "execution_count": 124,
1162
+ "id": "76b5f447",
1163
+ "metadata": {},
1164
+ "outputs": [
1165
+ {
1166
+ "data": {
1167
+ "text/plain": [
1168
+ "'Hello, how are you?'"
1169
+ ]
1170
+ },
1171
+ "execution_count": 124,
1172
+ "metadata": {},
1173
+ "output_type": "execute_result"
1174
+ }
1175
+ ],
1176
+ "source": [
1177
+ "en_to_spanish = translator_spanish(spanish_text)\n",
1178
+ "en_to_spanish[0]['translation_text']"
1179
+ ]
1180
+ },
1181
+ {
1182
+ "cell_type": "markdown",
1183
+ "id": "e08fc4e7",
1184
+ "metadata": {},
1185
+ "source": [
1186
+ "## Testing and Gradio Interface"
1187
+ ]
1188
+ },
1189
+ {
1190
+ "cell_type": "code",
1191
+ "execution_count": 61,
1192
+ "id": "7d5cdcb8",
1193
+ "metadata": {},
1194
+ "outputs": [],
1195
+ "source": [
1196
+ "import gradio as gr"
1197
+ ]
1198
+ },
1199
+ {
1200
+ "cell_type": "code",
1201
+ "execution_count": 62,
1202
+ "id": "560ec8e5",
1203
+ "metadata": {},
1204
+ "outputs": [],
1205
+ "source": [
1206
+ "model=tf.keras.models.load_model('finalprojecttoxic.h5')"
1207
+ ]
1208
+ },
1209
+ {
1210
+ "cell_type": "code",
1211
+ "execution_count": 73,
1212
+ "id": "aaf4a3cd",
1213
+ "metadata": {},
1214
+ "outputs": [],
1215
+ "source": [
1216
+ "input_str=vectorizer('Hey i freaking hate you!. I\\'m going to hurt you!')"
1217
+ ]
1218
+ },
1219
+ {
1220
+ "cell_type": "code",
1221
+ "execution_count": 74,
1222
+ "id": "54761270",
1223
+ "metadata": {},
1224
+ "outputs": [
1225
+ {
1226
+ "name": "stdout",
1227
+ "output_type": "stream",
1228
+ "text": [
1229
+ "1/1 [==============================] - 0s 88ms/step\n"
1230
+ ]
1231
+ }
1232
+ ],
1233
+ "source": [
1234
+ "res=model.predict(np.expand_dims(input_str,0))"
1235
+ ]
1236
+ },
1237
+ {
1238
+ "cell_type": "code",
1239
+ "execution_count": 75,
1240
+ "id": "ba15136b",
1241
+ "metadata": {},
1242
+ "outputs": [
1243
+ {
1244
+ "data": {
1245
+ "text/plain": [
1246
+ "array([[0.9133858 , 0.00198671, 0.0333592 , 0.00411558, 0.71037763,\n",
1247
+ " 0.00563182]], dtype=float32)"
1248
+ ]
1249
+ },
1250
+ "execution_count": 75,
1251
+ "metadata": {},
1252
+ "output_type": "execute_result"
1253
+ }
1254
+ ],
1255
+ "source": [
1256
+ "res"
1257
+ ]
1258
+ },
1259
+ {
1260
+ "cell_type": "code",
1261
+ "execution_count": 72,
1262
+ "id": "c189f6c9",
1263
+ "metadata": {},
1264
+ "outputs": [
1265
+ {
1266
+ "data": {
1267
+ "text/plain": [
1268
+ "Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n",
1269
+ " 'identity_hate'],\n",
1270
+ " dtype='object')"
1271
+ ]
1272
+ },
1273
+ "execution_count": 72,
1274
+ "metadata": {},
1275
+ "output_type": "execute_result"
1276
+ }
1277
+ ],
1278
+ "source": [
1279
+ "data.columns[2:]"
1280
+ ]
1281
+ },
1282
+ {
1283
+ "cell_type": "code",
1284
+ "execution_count": 122,
1285
+ "id": "8c1fbac0",
1286
+ "metadata": {},
1287
+ "outputs": [],
1288
+ "source": [
1289
+ "translator_hindi = pipeline(\"translation\", model=\"Helsinki-NLP/opus-mt-hi-en\", tokenizer=\"Helsinki-NLP/opus-mt-hi-en\")"
1290
+ ]
1291
+ },
1292
+ {
1293
+ "cell_type": "code",
1294
+ "execution_count": 104,
1295
+ "id": "c8db9d6d",
1296
+ "metadata": {},
1297
+ "outputs": [],
1298
+ "source": [
1299
+ "hindi_text = \"नमस्ते, आप कैसे हैं?\""
1300
+ ]
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "execution_count": 123,
1305
+ "id": "9c95d205",
1306
+ "metadata": {},
1307
+ "outputs": [
1308
+ {
1309
+ "data": {
1310
+ "text/plain": [
1311
+ "'Hello, how are you?'"
1312
+ ]
1313
+ },
1314
+ "execution_count": 123,
1315
+ "metadata": {},
1316
+ "output_type": "execute_result"
1317
+ }
1318
+ ],
1319
+ "source": [
1320
+ "en_to_hin = translator_hindi(hindi_text)\n",
1321
+ "en_to_hin[0]['translation_text']"
1322
+ ]
1323
+ },
1324
+ {
1325
+ "cell_type": "code",
1326
+ "execution_count": 131,
1327
+ "id": "3d25803f",
1328
+ "metadata": {},
1329
+ "outputs": [],
1330
+ "source": [
1331
+ "def translate_hindi(from_text):\n",
1332
+ " result2 = translator_hindi(from_text)\n",
1333
+ " \n",
1334
+ " return result2[0]['translation_text']"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "cell_type": "code",
1339
+ "execution_count": 133,
1340
+ "id": "52108859",
1341
+ "metadata": {},
1342
+ "outputs": [
1343
+ {
1344
+ "data": {
1345
+ "text/plain": [
1346
+ "'Hello, how are you?'"
1347
+ ]
1348
+ },
1349
+ "execution_count": 133,
1350
+ "metadata": {},
1351
+ "output_type": "execute_result"
1352
+ }
1353
+ ],
1354
+ "source": [
1355
+ "translate_hindi('नमस्ते, आप कैसे हैं?')"
1356
+ ]
1357
+ },
1358
+ {
1359
+ "cell_type": "code",
1360
+ "execution_count": 94,
1361
+ "id": "837c3093",
1362
+ "metadata": {},
1363
+ "outputs": [],
1364
+ "source": [
1365
+ "def score_comment(comment):\n",
1366
+ " vectorized_comment = vectorizer([comment])\n",
1367
+ " results=model.predict(vectorized_comment)\n",
1368
+ " \n",
1369
+ " text=''\n",
1370
+ " for idx, col in enumerate(data.columns[2:]):\n",
1371
+ " text+= '{}: {}\\n'.format(col, results[0][idx]>0.5)\n",
1372
+ " \n",
1373
+ " return text"
1374
+ ]
1375
+ },
1376
+ {
1377
+ "cell_type": "code",
1378
+ "execution_count": 163,
1379
+ "id": "21ea015f",
1380
+ "metadata": {},
1381
+ "outputs": [],
1382
+ "source": [
1383
+ "def combined_models(input):\n",
1384
+ " output1=translate_hindi(input)\n",
1385
+ " output2=score_comment(output1)\n",
1386
+ " \n",
1387
+ " return output1, output2"
1388
+ ]
1389
+ },
1390
+ {
1391
+ "cell_type": "code",
1392
+ "execution_count": 166,
1393
+ "id": "ca5d14a9",
1394
+ "metadata": {},
1395
+ "outputs": [
1396
+ {
1397
+ "name": "stdout",
1398
+ "output_type": "stream",
1399
+ "text": [
1400
+ "1/1 [==============================] - 0s 109ms/step\n"
1401
+ ]
1402
+ }
1403
+ ],
1404
+ "source": [
1405
+ "interface = gr.Interface(fn=combined_models, inputs=\"text\", outputs=[\"text\",\"text\"],title=\"Toxic Comment Analyzer\")"
1406
+ ]
1407
+ },
1408
+ {
1409
+ "cell_type": "code",
1410
+ "execution_count": 168,
1411
+ "id": "cb485bb9",
1412
+ "metadata": {},
1413
+ "outputs": [
1414
+ {
1415
+ "name": "stdout",
1416
+ "output_type": "stream",
1417
+ "text": [
1418
+ "Running on local URL: http://127.0.0.1:7871\n",
1419
+ "Running on public URL: https://27f88e54e3177749fa.gradio.live\n",
1420
+ "\n",
1421
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "data": {
1426
+ "text/html": [
1427
+ "<div><iframe src=\"https://27f88e54e3177749fa.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
1428
+ ],
1429
+ "text/plain": [
1430
+ "<IPython.core.display.HTML object>"
1431
+ ]
1432
+ },
1433
+ "metadata": {},
1434
+ "output_type": "display_data"
1435
+ },
1436
+ {
1437
+ "data": {
1438
+ "text/plain": []
1439
+ },
1440
+ "execution_count": 168,
1441
+ "metadata": {},
1442
+ "output_type": "execute_result"
1443
+ },
1444
+ {
1445
+ "name": "stdout",
1446
+ "output_type": "stream",
1447
+ "text": [
1448
+ "1/1 [==============================] - 0s 426ms/step\n"
1449
+ ]
1450
+ }
1451
+ ],
1452
+ "source": [
1453
+ "interface.launch(share=True)"
1454
+ ]
1455
+ },
1456
+ {
1457
+ "cell_type": "code",
1458
+ "execution_count": null,
1459
+ "id": "e30aa7aa",
1460
+ "metadata": {},
1461
+ "outputs": [],
1462
+ "source": []
1463
+ }
1464
+ ],
1465
+ "metadata": {
1466
+ "kernelspec": {
1467
+ "display_name": "Python 3 (ipykernel)",
1468
+ "language": "python",
1469
+ "name": "python3"
1470
+ },
1471
+ "language_info": {
1472
+ "codemirror_mode": {
1473
+ "name": "ipython",
1474
+ "version": 3
1475
+ },
1476
+ "file_extension": ".py",
1477
+ "mimetype": "text/x-python",
1478
+ "name": "python",
1479
+ "nbconvert_exporter": "python",
1480
+ "pygments_lexer": "ipython3",
1481
+ "version": "3.11.3"
1482
+ }
1483
+ },
1484
+ "nbformat": 4,
1485
+ "nbformat_minor": 5
1486
+ }