Commit
•
0fac6f6
1
Parent(s):
4d00156
files updated
Browse files- .github/workflows/hf_push.yml +19 -0
- README.md +11 -0
- app.py +46 -0
- docs/eda.md +5 -1
- docs/experiments.md +3 -0
- mkdocs.yml +1 -1
- src/__init__.py +3 -0
- src/batch_predict.py +45 -0
- src/constants.py +10 -0
- src/single_predict.py +33 -0
.github/workflows/hf_push.yml
CHANGED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
on:
  push:
    branches: [master]

  # Allow the sync to be started manually from the Actions tab.
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      # Full history and LFS objects are fetched so the push to the Space
      # carries every commit and large file.
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # This workflow is triggered from `master`, so there is no local
        # `main` branch in the checkout. Push the checked-out commit (HEAD)
        # to the Space's `main` branch explicitly instead of using the bare
        # refspec `main`, which would fail with "src refspec main does not
        # match any".
        run: git push https://shivansh-ka:$HF_TOKEN@huggingface.co/spaces/shivansh-ka/Toxic-Comment-Classifier HEAD:main
|
README.md
CHANGED
@@ -1 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Multilingual-Toxic-Comment-Classifier
|
|
|
1 |
+
---
|
2 |
+
title: Toxic Comment Classifier
|
3 |
+
emoji: π
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: streamlit
|
7 |
+
app_file: app.py
|
8 |
+
pinned: false
|
9 |
+
license: apache-2.0
|
10 |
+
---
|
11 |
+
|
12 |
# Multilingual-Toxic-Comment-Classifier
|
app.py
CHANGED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from src import *
|
4 |
+
|
5 |
+
# Build both predictors at module level. Each constructor loads the Keras
# model and the tokenizer from disk (see src/single_predict.py and
# src/batch_predict.py).
# NOTE(review): Streamlit re-executes this script on every interaction, so
# these objects are presumably rebuilt on each rerun - consider caching them.
single = SinglePrediction()
batch = BatchPrediction()
|
7 |
+
|
8 |
+
def single_predict(text):
    """Score one comment and render the outcome in the Streamlit page.

    The previous version showed the success banner before the model ran and
    then threw the prediction away, so the user never saw a result and a
    failed prediction still looked successful.

    Args:
        text: The raw comment entered by the user.

    Returns:
        None. The outcome is reported through Streamlit widgets only.
    """
    preds = single.predict(text)

    # SinglePrediction.predict returns None when it could not produce a score.
    if preds is None:
        st.error('Prediction failed :thumbsdown:')
        return

    st.success(f'{text} :thumbsup:')
    # The model emits one score per comment; the batch pipeline treats the
    # same kind of score as a probability thresholded at 0.5.
    st.write(f'Toxicity probability: {float(preds):.3f}')
|
12 |
+
|
13 |
+
def batch_predict(data):
    """Validate an uploaded CSV, score it, and return the result as CSV bytes.

    Args:
        data: The uploaded file object (or anything ``pandas.read_csv``
            accepts) holding the comments to score.

    Returns:
        The scored table encoded as UTF-8 CSV bytes, or ``None`` when the
        upload fails validation or the prediction step fails. Callers must
        check for ``None`` before offering a download.
    """
    if not batch.data_validation(data):
        st.error('Data Validation Failed :thumbsdown:')
        return None

    st.success('Data Validation Successfull :thumbsup:')

    # Validation has already read the upload; rewind it so the prediction
    # step parses the file from the beginning instead of from EOF.
    if hasattr(data, 'seek'):
        data.seek(0)

    preds = batch.predict(data)
    # BatchPrediction.predict returns None when it swallowed an error;
    # calling .to_csv on that would raise AttributeError.
    if preds is None:
        st.error('Prediction failed :thumbsdown:')
        return None

    return preds.to_csv(index=False).encode('utf-8')
|
20 |
+
|
21 |
+
# ---- Page layout -----------------------------------------------------------
st.title('Toxic Comment Classifier')
menu = ["Single Value Prediciton", "Batch Prediction"]
choice = st.sidebar.radio("Menu", menu)

if choice == menu[0]:
    st.subheader("Prediction")
    form = st.form("my_form")
    comment = form.text_input("Enter comment")
    # `on_click=single_predict(comment)` called the function on every render
    # and handed its return value (None) to Streamlit as the callback.
    # form_submit_button returns True only on the rerun caused by the click,
    # so run the prediction from that branch instead.
    if form.form_submit_button("Predict"):
        if comment.strip():
            single_predict(comment)
        else:
            st.warning("Please enter a comment first.")
else:
    st.subheader("Batch Prediction")
    # Only CSV is parsed downstream (pandas.read_csv), so do not advertise
    # parquet, and label the widget for what it actually accepts.
    csv_file = st.file_uploader("Upload CSV", type=['csv'])

    if csv_file is not None:
        csv = batch_predict(csv_file)
        # batch_predict returns None when validation fails; download_button
        # cannot be given data=None.
        if csv is not None:
            st.download_button(
                label="Predict and Download",
                data=csv,
                file_name='prediction.csv',
                mime='text/csv',
            )
|
44 |
+
|
45 |
+
|
46 |
+
|
docs/eda.md
CHANGED
@@ -1 +1,5 @@
|
|
1 |
-
# EDA
|
|
|
|
|
|
|
|
|
|
1 |
+
# EDA
|
2 |
+
|
3 |
+
```
|
4 |
+
|
5 |
+
```
|
docs/experiments.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# EXPERIMENTS PERFORMED
|
2 |
+
|
3 |
+
## Results
|
mkdocs.yml
CHANGED
@@ -4,7 +4,7 @@ site_name: Toxic Comment Classifier
|
|
4 |
nav:
|
5 |
- Home: index.md
|
6 |
- EDA: eda.md
|
7 |
-
|
8 |
|
9 |
site_author: Shivansh Kaushal
|
10 |
site_description: >-
|
|
|
4 |
nav:
|
5 |
- Home: index.md
|
6 |
- EDA: eda.md
|
7 |
+
- Experiments: experiments.md
|
8 |
|
9 |
site_author: Shivansh Kaushal
|
10 |
site_description: >-
|
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from src.batch_predict import *
|
3 |
+
from src.single_predict import *
|
src/batch_predict.py
CHANGED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import tensorflow as tf
|
4 |
+
import transformers
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
import os
|
7 |
+
from src.constants import *
|
8 |
+
import re
|
9 |
+
|
10 |
+
|
11 |
+
class BatchPrediction:
    """Score a CSV of comments with the fine-tuned toxicity model.

    Attributes:
        model: The Keras model loaded from ``model_path``.
        tokenizer: The Hugging Face tokenizer loaded from ``tokenizer_path``.
        max_len: Sequence length every comment is padded/truncated to.

    Note:
        The tokenizing helper is named ``tokenize``. It used to be a method
        called ``tokenizer``, which the ``self.tokenizer`` attribute set in
        ``__init__`` shadowed on every instance, so the padding/truncation
        settings were never applied.
    """

    # Columns an uploaded file may contain, and the one it must contain.
    ALLOWED_COLUMNS = ("id", "comment_text")
    TEXT_COLUMN = "comment_text"

    def __init__(self, model_path=None, tokenizer_path=None, max_len=None):
        """Load the model and tokenizer.

        Args:
            model_path: Saved-model location; defaults to ``MODEL_PATH``.
            tokenizer_path: Tokenizer location; defaults to ``TOKENIZER_PATH``.
            max_len: Token length to pad/truncate to; defaults to ``MAX_LEN``.
        """
        self.model = tf.keras.models.load_model(
            MODEL_PATH if model_path is None else model_path
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            TOKENIZER_PATH if tokenizer_path is None else tokenizer_path
        )
        self.max_len = MAX_LEN if max_len is None else max_len

    @staticmethod
    def _read_csv(data, **kwargs):
        """Read ``data`` with pandas, rewinding file-like objects around the read.

        An uploaded file is a stream: once it has been parsed the cursor sits
        at EOF and a second ``read_csv`` would see an empty file.
        """
        if hasattr(data, "seek"):
            data.seek(0)
        try:
            return pd.read_csv(data, **kwargs)
        finally:
            if hasattr(data, "seek"):
                data.seek(0)

    def tokenize(self, text):
        """Tokenize one string or a list of strings into model inputs.

        Returns:
            A plain dict of TensorFlow tensors, padded/truncated to
            ``self.max_len``, without token type ids.
        """
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors="tf",
            return_token_type_ids=False,
        )
        return dict(tokens)

    @staticmethod
    def data_validation(data):
        """Check that ``data`` is a CSV with the expected columns.

        The file must contain ``comment_text`` and may additionally contain
        ``id``; any other column fails validation. A static method so both
        ``instance.data_validation(f)`` and
        ``BatchPrediction.data_validation(f)`` work.

        Args:
            data: A path or file-like object accepted by ``pandas.read_csv``.

        Returns:
            True when the header is acceptable, False otherwise (including
            empty or unparseable files).
        """
        try:
            # Only the header is needed to validate the schema.
            header = BatchPrediction._read_csv(data, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError):
            return False

        columns = set(header.columns)
        return (
            BatchPrediction.TEXT_COLUMN in columns
            and columns <= set(BatchPrediction.ALLOWED_COLUMNS)
        )

    def predict(self, data):
        """Score every comment in a CSV.

        Rows containing missing values are dropped, newlines inside comments
        are replaced by spaces and surrounding whitespace is stripped before
        tokenizing.

        Args:
            data: A path or file-like object accepted by ``pandas.read_csv``.

        Returns:
            The cleaned DataFrame with two added columns, ``probabilities``
            (model score) and ``toxic`` (1 when the score is above 0.5, else
            0), or ``None`` if anything fails; the error is logged.
        """
        import logging

        try:
            df = self._read_csv(data).dropna()
            # Keep the DataFrame: the old code rebound `df` to the cleaned
            # Series and then failed on `df.comment_text`.
            df[self.TEXT_COLUMN] = (
                df[self.TEXT_COLUMN]
                .astype(str)
                .str.replace("\n", " ", regex=False)
                .str.strip()
            )

            if df.empty:
                # Nothing to score; return the (empty) expected schema.
                df["probabilities"] = pd.Series(dtype="float64")
                df["toxic"] = pd.Series(dtype="int64")
                return df

            inputs = self.tokenize(df[self.TEXT_COLUMN].tolist())
            # Flatten the model output to 1-D so it fits a single column.
            # assumes the model emits one score per comment (e.g. shape
            # (n, 1)) - TODO confirm against the saved model.
            scores = np.asarray(self.model.predict(inputs)).reshape(-1)
            df["probabilities"] = scores
            df["toxic"] = (scores > 0.5).astype(int)
            return df
        except Exception:
            # Best-effort contract kept from the original: never raise into
            # the UI, but record the full traceback instead of a bare print.
            logging.getLogger(__name__).exception("Batch prediction failed")
            return None
|
src/constants.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Anchor the project root to this file's location (<root>/src/constants.py)
# instead of the process's working directory, so the model and tokenizer
# paths resolve no matter where the app is launched from.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Saved model: <root>/serving_model/roberta-fine-tuned-2
MODEL_DIR_NAME = "serving_model"
MODEL_NAME = "roberta-fine-tuned-2"
MODEL_PATH = os.path.join(ROOT_DIR, MODEL_DIR_NAME, MODEL_NAME)

# Tokenizer files: <root>/serving_model/tokenizer
TOKENIZER_FILE_NAME = "tokenizer"
TOKENIZER_PATH = os.path.join(ROOT_DIR, MODEL_DIR_NAME, TOKENIZER_FILE_NAME)

# Token length every comment is padded/truncated to by the predictors.
MAX_LEN = 192
# NOTE(review): not referenced by the prediction modules in this change;
# presumably a dataset shuffle buffer size used elsewhere - confirm.
BUFFER_SIZE = 2048
|
src/single_predict.py
CHANGED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import tensorflow as tf
|
4 |
+
import transformers
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
import os
|
7 |
+
from src.constants import *
|
8 |
+
import re
|
9 |
+
|
10 |
+
|
11 |
+
class SinglePrediction:
    """Score a single comment with the fine-tuned toxicity model.

    Attributes:
        model: The Keras model loaded from ``model_path``.
        tokenizer: The Hugging Face tokenizer loaded from ``tokenizer_path``.
        max_len: Sequence length the comment is padded/truncated to.

    Note:
        The tokenizing helper is named ``tokenize``. It used to be a method
        called ``tokenizer``, which the ``self.tokenizer`` attribute set in
        ``__init__`` shadowed on every instance, so the padding/truncation
        settings were never applied.
    """

    def __init__(self, model_path=None, tokenizer_path=None, max_len=None):
        """Load the model and tokenizer.

        Args:
            model_path: Saved-model location; defaults to ``MODEL_PATH``.
            tokenizer_path: Tokenizer location; defaults to ``TOKENIZER_PATH``.
            max_len: Token length to pad/truncate to; defaults to ``MAX_LEN``.
        """
        self.model = tf.keras.models.load_model(
            MODEL_PATH if model_path is None else model_path
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            TOKENIZER_PATH if tokenizer_path is None else tokenizer_path
        )
        self.max_len = MAX_LEN if max_len is None else max_len

    def tokenize(self, text):
        """Tokenize ``text`` into model inputs.

        Returns:
            A plain dict of TensorFlow tensors, padded/truncated to
            ``self.max_len``, without token type ids.
        """
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors="tf",
            return_token_type_ids=False,
        )
        return dict(tokens)

    def predict(self, text: str):
        """Return the model's score for one comment.

        Newlines in ``text`` are replaced by spaces and surrounding
        whitespace is stripped before tokenizing.

        Args:
            text: The raw comment.

        Returns:
            The first element of the first row of the model output, or
            ``None`` if anything fails; the error is logged.
        """
        import logging

        try:
            cleaned = text.replace("\n", " ").strip()
            inputs = self.tokenize(cleaned)
            return self.model.predict(inputs)[0][0]
        except Exception:
            # Best-effort contract kept from the original: never raise into
            # the UI, but record the full traceback instead of a bare print.
            logging.getLogger(__name__).exception("Single prediction failed")
            return None
|