Shreyashftw committed on
Commit
4ee6ffc
·
verified ·
1 Parent(s): 73f11f9

Upload folder using huggingface_hub

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ jigsaw-toxic-comment-classification-challenge/test.csv/test.csv filter=lfs diff=lfs merge=lfs -text
37
+ jigsaw-toxic-comment-classification-challenge/train.csv/train.csv filter=lfs diff=lfs merge=lfs -text
38
+ toxicity.keras filter=lfs diff=lfs merge=lfs -text
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Update Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ update:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/setup-python@v2
14
+ with:
15
+ python-version: "3.8"
16
+
17
+ - name: Install dependencies
18
+ run: |
19
+ python -m pip install --upgrade pip
20
+ pip install gradio
21
+
22
+ - name: Update Space
23
+ run: |
24
+ gradio deploy main
25
+ env:
26
+ GRADSPACE_APP_FILE: Toxicity.py
27
+ GRADSPACE_TITLE: Hate_Speech_Detection
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Hate Speech Detection
3
- emoji: 🐢
4
- colorFrom: purple
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.15.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Hate_Speech_Detection
3
+ app_file: Toxicity.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.15.0
 
 
6
  ---
 
 
Toxicity.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Toxicity.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # # 0. Install Dependencies and Bring in Data
5
# Install the libraries this script imports, using the running interpreter's pip.
# The notebook form of this line ("%pip install ...") is IPython magic and is a
# SyntaxError when the exported file is executed as a plain Python script.
import subprocess
import sys

# check=False: like the notebook magic, a failed install does not abort here;
# a missing package will surface at the import statements that follow.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install',
     'tensorflow', 'pandas', 'matplotlib', 'scikit-learn'],
    check=False,
)
6
+ # In[2]:
7
+
8
+
9
+ import os
10
+ import pandas as pd
11
+ import tensorflow as tf
12
+ import numpy as np
13
+
14
+
15
+ # In[3]:
16
+
17
+
18
+ df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv', 'train.csv'))
19
+
20
+
21
+ # In[4]:
22
+
23
+
24
+ df.head()
25
+
26
+
27
+ # # 1. Preprocess
28
+
29
+ # In[5]:
30
+
31
+
32
# Print the installed packages (diagnostic output only).
# get_ipython() exists only inside IPython/Jupyter and raises NameError in a
# plain Python process, so pip is invoked through the current interpreter.
import subprocess
import sys

subprocess.run([sys.executable, '-m', 'pip', 'list'], check=False)
33
+
34
+
35
+ # In[6]:
36
+
37
+
38
+ from tensorflow.keras.layers import TextVectorization
39
+
40
+
41
+ # In[7]:
42
+
43
+
44
+ X = df['comment_text']
45
+ y = df[df.columns[2:]].values
46
+
47
+
48
+ # In[8]:
49
+
50
+
51
+ MAX_FEATURES = 200000 # number of words in the vocab
52
+
53
+
54
+ # In[9]:
55
+
56
+
57
+ vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
58
+ output_sequence_length=1800,
59
+ output_mode='int')
60
+
61
+
62
+ # In[10]:
63
+
64
+
65
+ vectorizer.adapt(X.values)
66
+
67
+
68
+ # In[11]:
69
+
70
+
71
+ vectorized_text = vectorizer(X.values)
72
+
73
+
74
+ # In[12]:
75
+
76
+
77
# MCSHBAP - map, cache, shuffle, batch, prefetch (sources: from_tensor_slices, list_files)
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Freeze the shuffled order. By default shuffle() produces a new order on every
# iteration, so the take()/skip() partitions built below would be drawn from a
# different ordering each time they are iterated and could share examples
# (validation/test data leaking into training).
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # helps bottlenecks


# In[13]:


# 70% / 20% / 10% split, counted in batches (len(dataset) is the batch count).
num_batches = len(dataset)
train_batches = int(num_batches * .7)
val_batches = int(num_batches * .2)
test_batches = int(num_batches * .1)

train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)
test = dataset.skip(int(num_batches * .9)).take(test_batches)
91
+
92
+
93
+ # # 2. Create Sequential Model
94
+
95
+ # In[14]:
96
+
97
+
98
+ from tensorflow.keras.models import Sequential
99
+ from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
100
+
101
+
102
+ # In[15]:
103
+
104
+
105
+ model = Sequential()
106
+ # Create the embedding layer
107
+ model.add(Embedding(MAX_FEATURES+1, 32))
108
+ # Bidirectional LSTM Layer
109
+ model.add(Bidirectional(LSTM(32, activation='tanh')))
110
+ # Feature extractor Fully connected layers
111
+ model.add(Dense(128, activation='relu'))
112
+ model.add(Dense(256, activation='relu'))
113
+ model.add(Dense(128, activation='relu'))
114
+ # Final layer
115
+ model.add(Dense(6, activation='sigmoid'))
116
+
117
+
118
+ # In[16]:
119
+
120
+
121
+ model.compile(loss='BinaryCrossentropy', optimizer='Adam')
122
+
123
+
124
+ # In[17]:
125
+
126
+
127
+ model.summary()
128
+
129
+
130
+ # In[18]:
131
+
132
+
133
+ history = model.fit(train, epochs=1, validation_data=val)
134
+
135
+
136
+ # In[19]:
137
+
138
+
139
+ from matplotlib import pyplot as plt
140
+
141
+
142
+ # In[20]:
143
+
144
+
145
+ plt.figure(figsize=(8,5))
146
+ pd.DataFrame(history.history).plot()
147
+ plt.show()
148
+
149
+
150
+ # # 3. Make Predictions
151
+
152
+ # In[21]:
153
+
154
+
155
+ # Make Predictions
156
+ input_text = vectorizer(['You freaking suck! I am going to hit you.']) # Make sure to pass a list
157
+ res = model.predict(input_text)
158
+
159
+
160
+ # In[63]:
161
+
162
+
163
+ # Convert the predictions to binary values based on the threshold (0.5)
164
+ binary_predictions = (res > 0.5).astype(int)
165
+ print(binary_predictions)
166
+
167
+
168
+ # In[64]:
169
+
170
+
171
+ batch_X, batch_y = test.as_numpy_iterator().next()
172
+
173
+
174
+ # In[65]:
175
+
176
+
177
+ (model.predict(batch_X) > 0.5).astype(int)
178
+
179
+
180
+ # In[66]:
181
+
182
+
183
+ res.shape
184
+
185
+
186
+ # # 4. Evaluate Model
187
+
188
+ # In[67]:
189
+
190
+
191
+ from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
192
+
193
+
194
+ # In[68]:
195
+
196
+
197
pre = Precision()
re = Recall()
# The model ends in Dense(6, activation='sigmoid'): six independent yes/no
# labels. BinaryAccuracy thresholds every element at 0.5 and compares it with
# its label. CategoricalAccuracy instead compares argmax positions, which is
# not a meaningful score for flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()


# In[69]:


for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten so every (sample, label) pair is scored as one binary decision
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)


# In[81]:


print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')
224
+
225
+
226
+ # # 5. Test and Gradio
227
+
228
+ # In[72]:
229
+
230
+
231
# Install the UI dependencies through the running interpreter's pip.
# get_ipython() exists only inside IPython/Jupyter and raises NameError in a
# plain Python process.
import subprocess
import sys

# check=False: a failed install does not abort here; a missing gradio will
# surface at the "import gradio" statement that follows.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'gradio', 'jinja2'],
    check=False,
)
232
+
233
+
234
+ # In[73]:
235
+
236
+
237
+ import tensorflow as tf
238
+ import gradio as gr
239
+
240
+
241
+ # In[85]:
242
+
243
+
244
+ model.save('toxicity.keras')
245
+
246
+
247
+ # In[86]:
248
+
249
+
250
+ model = tf.keras.models.load_model('toxicity.keras')
251
+
252
+
253
+ # In[87]:
254
+
255
+
256
+ input_str = vectorizer('hey i freaken hate you!')
257
+
258
+
259
+ # In[88]:
260
+
261
+
262
+ res = model.predict(np.expand_dims(input_str,0))
263
+
264
+
265
+ # In[89]:
266
+
267
+
268
+ res
269
+
270
+
271
+ # In[90]:
272
+
273
+
274
def score_comment(comment):
    """Return one "<label>: <True/False>" line per label for `comment`.

    The comment is wrapped in a list, vectorized with the module-level
    `vectorizer`, and passed to the module-level `model`. Labels are the
    columns of `df` from index 2 onward; a label is reported as True when
    its predicted value is greater than 0.5.
    """
    scores = model.predict(vectorizer([comment]))[0]
    labels = df.columns[2:]
    return ''.join(
        '{}: {}\n'.format(label, scores[pos] > 0.5)
        for pos, label in enumerate(labels)
    )
283
+
284
+
285
+ # In[93]:
286
+
287
+
288
+ interface = gr.Interface(fn=score_comment,
289
+ inputs="textbox", # Use "textbox" instead of gr.inputs.Textbox
290
+ outputs="text",
291
+ )
292
+
293
+
294
+ # In[94]:
295
+
296
+
297
+ interface.launch(share=True)
298
+
299
+
300
+ # In[ ]:
301
+
302
+
303
+
304
+
jigsaw-toxic-comment-classification-challenge/.DS_Store ADDED
Binary file (8.2 kB). View file
 
jigsaw-toxic-comment-classification-challenge/sample_submission.csv/sample_submission.csv ADDED
The diff for this file is too large to render. See raw diff
 
jigsaw-toxic-comment-classification-challenge/test.csv/test.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2513ce4abb98c4d1d216e3ca0d4377d57589a0989aa8c06a840509a16c786e8
3
+ size 60354593
jigsaw-toxic-comment-classification-challenge/test_labels.csv/test_labels.csv ADDED
The diff for this file is too large to render. See raw diff
 
jigsaw-toxic-comment-classification-challenge/train.csv/train.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
3
+ size 68802655
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ tensorflow
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ matplotlib
toxicity.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb801227b73177f73c564d8fc07a4d1a5931ec82245ae2cf6c8394f069d62500
3
+ size 26006432