shivansh-ka committed on
Commit
0fac6f6
•
1 Parent(s): 4d00156

files updated

Browse files
.github/workflows/hf_push.yml CHANGED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Mirrors this GitHub repository to the Hugging Face Space on every push to
# `master`, or when started manually.
name: Sync to Hugging Face hub
on:
  push:
    branches: [master]

  # Allows the workflow to be started manually from the Actions tab.
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # Fetch the complete history and the LFS objects so the push
          # carries the whole repository rather than a shallow clone.
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # The workflow runs on `master`, so the checkout has no local `main`
        # branch and `git push <url> main` fails with "src refspec main does
        # not match any". Push the checked-out commit to the Space's `main`
        # branch explicitly instead.
        run: git push https://shivansh-ka:$HF_TOKEN@huggingface.co/spaces/shivansh-ka/Toxic-Comment-Classifier HEAD:main
README.md CHANGED
@@ -1 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
1
  # Multilingual-Toxic-Comment-Classifier
 
1
+ ---
2
+ title: Toxic Comment Classifier
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: streamlit
7
+ app_file: app.py
8
+ pinned: false
9
+ license: apache-2.0
10
+ ---
11
+
12
  # Multilingual-Toxic-Comment-Classifier
app.py CHANGED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from src import *
4
+
5
# NOTE(review): Streamlit re-executes this script on every interaction, so
# both predictors are re-created on each rerun -- consider caching them.
single = SinglePrediction()
batch = BatchPrediction()


def single_predict(text):
    """Score one comment and render the outcome on the page.

    Args:
        text: The comment entered by the user.

    Returns:
        Whatever ``single.predict`` returned; ``None`` means the predictor
        reported a failure.
    """
    st.success(f'{text} :thumbsup:')
    preds = single.predict(text)
    if preds is None:
        st.error('Prediction failed :thumbsdown:')
    else:
        st.write(f'Toxicity probability: {float(preds):.3f}')
    return preds


def batch_predict(data):
    """Validate an uploaded file and score it.

    Args:
        data: The file-like object returned by ``st.file_uploader``.

    Returns:
        The predictions encoded as UTF-8 CSV bytes, or ``None`` when the
        file failed validation or the prediction itself failed.
    """
    if not batch.data_validation(data):
        st.error('Data Validation Failed :thumbsdown:')
        return None
    st.success('Data Validation Successfull :thumbsup:')

    # Validation has already consumed the upload stream; rewind it so the
    # predictor does not read an empty file.
    if hasattr(data, 'seek'):
        data.seek(0)

    preds = batch.predict(data)
    if preds is None:
        # ``predict`` signals failure by returning None.
        st.error('Prediction failed :thumbsdown:')
        return None
    return preds.to_csv(index=False).encode('utf-8')


st.title('Toxic Comment Classifier')
menu = ["Single Value Prediciton", "Batch Prediction"]
choice = st.sidebar.radio("Menu", menu)

if choice == "Single Value Prediciton":
    st.subheader("Prediction")
    form = st.form("my_form")
    comment = form.text_input("Enter comment")
    # ``on_click`` expects a callable. Passing ``single_predict(comment)``
    # ran the prediction while the page was being built, on every rerun.
    # Act on the submit button's return value instead.
    submitted = form.form_submit_button("Predict")
    if submitted:
        if comment.strip():
            single_predict(comment)
        else:
            st.warning('Please enter a comment first.')
else:
    st.subheader("Batch Prediction")
    # Only CSV is offered because the prediction backend reads uploads with
    # ``pd.read_csv``.
    csv_file = st.file_uploader("Upload CSV", type=['csv'])

    if csv_file is not None:
        csv = batch_predict(csv_file)
        # ``batch_predict`` returns None on failure; a download button
        # cannot be built without data.
        if csv is not None:
            st.download_button(
                label="Predict and Download",
                data=csv,
                file_name='prediction.csv',
                mime='text/csv',
            )
44
+
45
+
46
+
docs/eda.md CHANGED
@@ -1 +1,5 @@
1
- # EDA
 
 
 
 
 
1
+ # EDA
2
+
3
+ ```
4
+
5
+ ```
docs/experiments.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # EXPERIMENTS PERFORMED
2
+
3
+ ## Results
mkdocs.yml CHANGED
@@ -4,7 +4,7 @@ site_name: Toxic Comment Classifier
4
  nav:
5
  - Home: index.md
6
  - EDA: eda.md
7
- #- Featurization: featurization.md
8
 
9
  site_author: Shivansh Kaushal
10
  site_description: >-
 
4
  nav:
5
  - Home: index.md
6
  - EDA: eda.md
7
+ - Experiments: experiments.md
8
 
9
  site_author: Shivansh Kaushal
10
  site_description: >-
src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+
2
+ from src.batch_predict import *
3
+ from src.single_predict import *
src/batch_predict.py CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import transformers
5
+ from transformers import AutoTokenizer
6
+ import os
7
+ from src.constants import *
8
+ import re
9
+
10
+
11
class BatchPrediction:
    """Score a CSV file of comments with the fine-tuned toxicity model.

    The input must contain a ``comment_text`` column and may contain an
    ``id`` column; no other columns are accepted.
    """

    def __init__(self):
        """Load the model from ``MODEL_PATH`` and the tokenizer from ``TOKENIZER_PATH``."""
        self.model = tf.keras.models.load_model(MODEL_PATH)
        # Kept under a private name: an instance attribute called
        # ``tokenizer`` would shadow the ``tokenizer`` method below, so its
        # padding/truncation settings would never be applied.
        self._hf_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

    def tokenizer(self, text):
        """Encode ``text`` (a string or a list of strings) for the model.

        Returns:
            A plain ``dict`` of TensorFlow tensors, padded/truncated to
            ``MAX_LEN`` tokens and without token type ids.
        """
        tokens = self._hf_tokenizer(
            text,
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors="tf",
            return_token_type_ids=False,
        )
        return dict(tokens)

    @staticmethod
    def _read_csv(data):
        """Read ``data`` (a path or file-like object) into a DataFrame."""
        # The same upload may be read more than once (validation, then
        # prediction), so rewind file-like objects before parsing.
        if hasattr(data, "seek"):
            data.seek(0)
        return pd.read_csv(data)

    @staticmethod
    def data_validation(data):
        """Check that ``data`` is a CSV with the expected columns.

        Declared static so that both ``instance.data_validation(data)`` and
        ``BatchPrediction.data_validation(data)`` work.

        Returns:
            ``True`` when ``comment_text`` is present and every column is
            either ``id`` or ``comment_text``; ``False`` otherwise, including
            when the file cannot be parsed as CSV.
        """
        try:
            columns = list(BatchPrediction._read_csv(data).columns)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError):
            return False
        if 'comment_text' not in columns:
            return False
        return all(column in ('id', 'comment_text') for column in columns)

    def predict(self, data):
        """Score every comment in ``data``.

        Rows containing missing values are dropped, and newlines inside a
        comment are replaced by spaces before tokenization.

        Returns:
            The cleaned DataFrame with two extra columns: ``probabilities``
            (the first output of the model for each row) and ``toxic``
            (1 when the probability exceeds 0.5, else 0). Returns ``None``
            when anything goes wrong; the error is printed.
        """
        try:
            df = self._read_csv(data).dropna()
            if df.empty:
                # Nothing to score; hand back an empty, well-formed result.
                df['probabilities'] = np.array([], dtype=float)
                df['toxic'] = np.array([], dtype=int)
                return df

            # Clean the column in place; the whole frame must survive
            # because the result columns are attached to it below.
            df['comment_text'] = (
                df['comment_text']
                .astype(str)
                .str.replace('\n', ' ', regex=False)
                .str.strip()
            )
            encoded = self.tokenizer(df['comment_text'].tolist())
            preds = np.asarray(self.model.predict(encoded))
            # Keep the first output per row as a flat vector so it can be
            # stored in a single column.
            df['probabilities'] = preds.reshape(len(df), -1)[:, 0]
            df['toxic'] = np.where(df['probabilities'] > 0.5, 1, 0)
            return df
        except Exception as e:
            # Best effort by contract: callers receive None on failure.
            print(e)
            return None
src/constants.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# All paths are resolved against the directory the process was started from.
# NOTE(review): this makes the paths depend on the working directory at
# import time -- confirm the app is always launched from the repository root.
ROOT_DIR = os.getcwd()

# Names of the artefacts inside the serving directory.
MODEL_DIR_NAME = "serving_model"
MODEL_NAME = "roberta-fine-tuned-2"
TOKENIZER_FILE_NAME = "tokenizer"

# The model and the tokenizer live side by side in the serving directory.
_SERVING_DIR = os.path.join(ROOT_DIR, MODEL_DIR_NAME)
MODEL_PATH = os.path.join(_SERVING_DIR, MODEL_NAME)
TOKENIZER_PATH = os.path.join(_SERVING_DIR, TOKENIZER_FILE_NAME)

# Sequence length the prediction classes pass to the tokenizer as max_length.
MAX_LEN = 192
BUFFER_SIZE = 2048
src/single_predict.py CHANGED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import transformers
5
+ from transformers import AutoTokenizer
6
+ import os
7
+ from src.constants import *
8
+ import re
9
+
10
+
11
class SinglePrediction:
    """Score a single comment with the fine-tuned toxicity model."""

    def __init__(self):
        """Load the model from ``MODEL_PATH`` and the tokenizer from ``TOKENIZER_PATH``."""
        self.model = tf.keras.models.load_model(MODEL_PATH)
        # Kept under a private name: an instance attribute called
        # ``tokenizer`` would shadow the ``tokenizer`` method below, so its
        # padding/truncation settings would never be applied.
        self._hf_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

    def tokenizer(self, text: str):
        """Encode ``text`` for the model.

        Returns:
            A plain ``dict`` of TensorFlow tensors, padded/truncated to
            ``MAX_LEN`` tokens and without token type ids.
        """
        tokens = self._hf_tokenizer(
            text,
            max_length=MAX_LEN,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_tensors="tf",
            return_token_type_ids=False,
        )
        return dict(tokens)

    def predict(self, text: str):
        """Return the model's score for ``text``.

        Newlines are replaced by spaces and surrounding whitespace is
        stripped before tokenization.

        Returns:
            The first output of the model for the comment as a ``float``,
            or ``None`` when anything goes wrong; the error is printed.
        """
        try:
            cleaned = re.sub('\n', ' ', text).strip()
            encoded = self.tokenizer(cleaned)
            # One comment in, so the score is the first value of the first row.
            return float(self.model.predict(encoded)[0][0])
        except Exception as e:
            # Best effort by contract: callers receive None on failure.
            print(e)
            return None