Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .DS_Store +0 -0
- .gitattributes +3 -0
- .github/workflows/update_space.yml +27 -0
- README.md +2 -8
- Toxicity.ipynb +0 -0
- Toxicity.py +304 -0
- jigsaw-toxic-comment-classification-challenge/.DS_Store +0 -0
- jigsaw-toxic-comment-classification-challenge/sample_submission.csv/sample_submission.csv +0 -0
- jigsaw-toxic-comment-classification-challenge/test.csv/test.csv +3 -0
- jigsaw-toxic-comment-classification-challenge/test_labels.csv/test_labels.csv +0 -0
- jigsaw-toxic-comment-classification-challenge/train.csv/train.csv +3 -0
- requirements.txt +5 -0
- toxicity.keras +3 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
jigsaw-toxic-comment-classification-challenge/test.csv/test.csv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
jigsaw-toxic-comment-classification-challenge/train.csv/train.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
+
toxicity.keras filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Update Space
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches:
|
6 |
+
- main
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
update:
|
10 |
+
runs-on: ubuntu-latest
|
11 |
+
|
12 |
+
steps:
|
13 |
+
- uses: actions/setup-python@v2
|
14 |
+
with:
|
15 |
+
python-version: "3.8"
|
16 |
+
|
17 |
+
- name: Install dependencies
|
18 |
+
run: |
|
19 |
+
python -m pip install --upgrade pip
|
20 |
+
pip install gradio
|
21 |
+
|
22 |
+
- name: Update Space
|
23 |
+
run: |
|
24 |
+
gradio deploy main
|
25 |
+
env:
|
26 |
+
GRADSPACE_APP_FILE: Toxicity.py
|
27 |
+
GRADSPACE_TITLE: Hate_Speech_Detection
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: purple
|
5 |
-
colorTo: green
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.15.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Hate_Speech_Detection
|
3 |
+
app_file: Toxicity.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.15.0
|
|
|
|
|
6 |
---
|
|
|
|
Toxicity.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Toxicity.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# # 0. Install Dependencies and Bring in Data
|
5 |
+
# %pip install tensorflow pandas matplotlib scikit-learn  # notebook-only IPython magic: a SyntaxError in a plain .py script, so install these via requirements.txt instead
|
6 |
+
# In[2]:
|
7 |
+
|
8 |
+
|
9 |
+
import os
|
10 |
+
import pandas as pd
|
11 |
+
import tensorflow as tf
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
|
15 |
+
# In[3]:
|
16 |
+
|
17 |
+
|
18 |
+
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge','train.csv', 'train.csv'))
|
19 |
+
|
20 |
+
|
21 |
+
# In[4]:
|
22 |
+
|
23 |
+
|
24 |
+
df.head()
|
25 |
+
|
26 |
+
|
27 |
+
# # 1. Preprocess
|
28 |
+
|
29 |
+
# In[5]:
|
30 |
+
|
31 |
+
|
32 |
+
# get_ipython().system('pip list')  # notebook-only diagnostic: get_ipython() is undefined outside IPython and raises NameError when run as a script
|
33 |
+
|
34 |
+
|
35 |
+
# In[6]:
|
36 |
+
|
37 |
+
|
38 |
+
from tensorflow.keras.layers import TextVectorization
|
39 |
+
|
40 |
+
|
41 |
+
# In[7]:
|
42 |
+
|
43 |
+
|
44 |
+
X = df['comment_text']
|
45 |
+
y = df[df.columns[2:]].values
|
46 |
+
|
47 |
+
|
48 |
+
# In[8]:
|
49 |
+
|
50 |
+
|
51 |
+
MAX_FEATURES = 200000 # number of words in the vocab
|
52 |
+
|
53 |
+
|
54 |
+
# In[9]:
|
55 |
+
|
56 |
+
|
57 |
+
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
|
58 |
+
output_sequence_length=1800,
|
59 |
+
output_mode='int')
|
60 |
+
|
61 |
+
|
62 |
+
# In[10]:
|
63 |
+
|
64 |
+
|
65 |
+
vectorizer.adapt(X.values)
|
66 |
+
|
67 |
+
|
68 |
+
# In[11]:
|
69 |
+
|
70 |
+
|
71 |
+
vectorized_text = vectorizer(X.values)
|
72 |
+
|
73 |
+
|
74 |
+
# In[12]:
|
75 |
+
|
76 |
+
|
77 |
+
#MCSHBAP - map, cache, shuffle, batch, prefetch from_tensor_slices, list_files
|
78 |
+
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
|
79 |
+
dataset = dataset.cache()
|
80 |
+
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)  # fix the order once: the default reshuffles on every pass, which lets samples move between the take/skip train/val/test splits
|
81 |
+
dataset = dataset.batch(16)
|
82 |
+
dataset = dataset.prefetch(8) # helps bottlenecks
|
83 |
+
|
84 |
+
|
85 |
+
# In[13]:
|
86 |
+
|
87 |
+
|
88 |
+
train = dataset.take(int(len(dataset)*.7))
|
89 |
+
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
|
90 |
+
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))
|
91 |
+
|
92 |
+
|
93 |
+
# # 2. Create Sequential Model
|
94 |
+
|
95 |
+
# In[14]:
|
96 |
+
|
97 |
+
|
98 |
+
from tensorflow.keras.models import Sequential
|
99 |
+
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
|
100 |
+
|
101 |
+
|
102 |
+
# In[15]:
|
103 |
+
|
104 |
+
|
105 |
+
model = Sequential()
|
106 |
+
# Create the embedding layer
|
107 |
+
model.add(Embedding(MAX_FEATURES+1, 32))
|
108 |
+
# Bidirectional LSTM Layer
|
109 |
+
model.add(Bidirectional(LSTM(32, activation='tanh')))
|
110 |
+
# Feature extractor Fully connected layers
|
111 |
+
model.add(Dense(128, activation='relu'))
|
112 |
+
model.add(Dense(256, activation='relu'))
|
113 |
+
model.add(Dense(128, activation='relu'))
|
114 |
+
# Final layer
|
115 |
+
model.add(Dense(6, activation='sigmoid'))
|
116 |
+
|
117 |
+
|
118 |
+
# In[16]:
|
119 |
+
|
120 |
+
|
121 |
+
model.compile(loss='BinaryCrossentropy', optimizer='Adam')
|
122 |
+
|
123 |
+
|
124 |
+
# In[17]:
|
125 |
+
|
126 |
+
|
127 |
+
model.summary()
|
128 |
+
|
129 |
+
|
130 |
+
# In[18]:
|
131 |
+
|
132 |
+
|
133 |
+
history = model.fit(train, epochs=1, validation_data=val)
|
134 |
+
|
135 |
+
|
136 |
+
# In[19]:
|
137 |
+
|
138 |
+
|
139 |
+
from matplotlib import pyplot as plt
|
140 |
+
|
141 |
+
|
142 |
+
# In[20]:
|
143 |
+
|
144 |
+
|
145 |
+
plt.figure(figsize=(8,5))
|
146 |
+
pd.DataFrame(history.history).plot()
|
147 |
+
plt.show()
|
148 |
+
|
149 |
+
|
150 |
+
# # 3. Make Predictions
|
151 |
+
|
152 |
+
# In[21]:
|
153 |
+
|
154 |
+
|
155 |
+
# Make Predictions
|
156 |
+
input_text = vectorizer(['You freaking suck! I am going to hit you.']) # Make sure to pass a list
|
157 |
+
res = model.predict(input_text)
|
158 |
+
|
159 |
+
|
160 |
+
# In[63]:
|
161 |
+
|
162 |
+
|
163 |
+
# Convert the predictions to binary values based on the threshold (0.5)
|
164 |
+
binary_predictions = (res > 0.5).astype(int)
|
165 |
+
print(binary_predictions)
|
166 |
+
|
167 |
+
|
168 |
+
# In[64]:
|
169 |
+
|
170 |
+
|
171 |
+
batch_X, batch_y = test.as_numpy_iterator().next()
|
172 |
+
|
173 |
+
|
174 |
+
# In[65]:
|
175 |
+
|
176 |
+
|
177 |
+
(model.predict(batch_X) > 0.5).astype(int)
|
178 |
+
|
179 |
+
|
180 |
+
# In[66]:
|
181 |
+
|
182 |
+
|
183 |
+
res.shape
|
184 |
+
|
185 |
+
|
186 |
+
# # 4. Evaluate Model
|
187 |
+
|
188 |
+
# In[67]:
|
189 |
+
|
190 |
+
|
191 |
+
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
|
192 |
+
|
193 |
+
|
194 |
+
# In[68]:
|
195 |
+
|
196 |
+
|
197 |
+
pre = Precision()
|
198 |
+
re = Recall()
|
199 |
+
acc = tf.keras.metrics.BinaryAccuracy()  # labels are independent 0/1 flags with sigmoid scores, so compare element-wise at the 0.5 threshold instead of by argmax
|
200 |
+
|
201 |
+
|
202 |
+
# In[69]:
|
203 |
+
|
204 |
+
|
205 |
+
for batch in test.as_numpy_iterator():
|
206 |
+
# Unpack the batch
|
207 |
+
X_true, y_true = batch
|
208 |
+
# Make a prediction
|
209 |
+
yhat = model.predict(X_true)
|
210 |
+
|
211 |
+
# Flatten the predictions
|
212 |
+
y_true = y_true.flatten()
|
213 |
+
yhat = yhat.flatten()
|
214 |
+
|
215 |
+
pre.update_state(y_true, yhat)
|
216 |
+
re.update_state(y_true, yhat)
|
217 |
+
acc.update_state(y_true, yhat)
|
218 |
+
|
219 |
+
|
220 |
+
# In[81]:
|
221 |
+
|
222 |
+
|
223 |
+
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')
|
224 |
+
|
225 |
+
|
226 |
+
# # 5. Test and Gradio
|
227 |
+
|
228 |
+
# In[72]:
|
229 |
+
|
230 |
+
|
231 |
+
# get_ipython().system('pip install gradio jinja2')  # notebook-only: get_ipython() raises NameError in a script; provide gradio/jinja2 through the environment (requirements.txt / Space SDK) instead
|
232 |
+
|
233 |
+
|
234 |
+
# In[73]:
|
235 |
+
|
236 |
+
|
237 |
+
import tensorflow as tf
|
238 |
+
import gradio as gr
|
239 |
+
|
240 |
+
|
241 |
+
# In[85]:
|
242 |
+
|
243 |
+
|
244 |
+
model.save('toxicity.keras')
|
245 |
+
|
246 |
+
|
247 |
+
# In[86]:
|
248 |
+
|
249 |
+
|
250 |
+
model = tf.keras.models.load_model('toxicity.keras')
|
251 |
+
|
252 |
+
|
253 |
+
# In[87]:
|
254 |
+
|
255 |
+
|
256 |
+
input_str = vectorizer('hey i freaken hate you!')
|
257 |
+
|
258 |
+
|
259 |
+
# In[88]:
|
260 |
+
|
261 |
+
|
262 |
+
res = model.predict(np.expand_dims(input_str,0))
|
263 |
+
|
264 |
+
|
265 |
+
# In[89]:
|
266 |
+
|
267 |
+
|
268 |
+
res
|
269 |
+
|
270 |
+
|
271 |
+
# In[90]:
|
272 |
+
|
273 |
+
|
274 |
+
def score_comment(comment):
    """Score one comment and report, per label, whether it is flagged.

    The comment is wrapped in a list and passed through the module-level
    ``vectorizer``, then scored by the module-level ``model``. Label names
    are taken from the columns of ``df`` starting at the third column.

    Returns a string with one ``<label>: <score > 0.5>`` line per label.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]
    report_lines = [
        '{}: {}\n'.format(label, scores[position] > 0.5)
        for position, label in enumerate(label_names)
    ]
    return ''.join(report_lines)
|
283 |
+
|
284 |
+
|
285 |
+
# In[93]:
|
286 |
+
|
287 |
+
|
288 |
+
interface = gr.Interface(fn=score_comment,
|
289 |
+
inputs="textbox", # Use "textbox" instead of gr.inputs.Textbox
|
290 |
+
outputs="text",
|
291 |
+
)
|
292 |
+
|
293 |
+
|
294 |
+
# In[94]:
|
295 |
+
|
296 |
+
|
297 |
+
interface.launch(share=True)
|
298 |
+
|
299 |
+
|
300 |
+
# In[ ]:
|
301 |
+
|
302 |
+
|
303 |
+
|
304 |
+
|
jigsaw-toxic-comment-classification-challenge/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
jigsaw-toxic-comment-classification-challenge/sample_submission.csv/sample_submission.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
jigsaw-toxic-comment-classification-challenge/test.csv/test.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2513ce4abb98c4d1d216e3ca0d4377d57589a0989aa8c06a840509a16c786e8
|
3 |
+
size 60354593
|
jigsaw-toxic-comment-classification-challenge/test_labels.csv/test_labels.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
jigsaw-toxic-comment-classification-challenge/train.csv/train.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd4084611bd27c939ba98e5e63bc3e5a2c1a4e99477dcba46c829e4c986c429d
|
3 |
+
size 68802655
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tensorflow
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
scikit-learn
|
5 |
+
matplotlib
|
toxicity.keras
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb801227b73177f73c564d8fc07a4d1a5931ec82245ae2cf6c8394f069d62500
|
3 |
+
size 26006432
|