files updated

Browse files

Files changed (10) hide show

.github/workflows/hf_push.yml +19 -0
README.md +11 -0
app.py +46 -0
docs/eda.md +5 -1
docs/experiments.md +3 -0
mkdocs.yml +1 -1
src/__init__.py +3 -0
src/batch_predict.py +45 -0
src/constants.py +10 -0
src/single_predict.py +33 -0

.github/workflows/hf_push.yml CHANGED Viewed

	@@ -0,0 +1,19 @@

+# Mirror this repository to the Hugging Face Space on every push to master
+# (or on a manual run).
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [master]
+  workflow_dispatch:
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # Full history and LFS objects are needed for the push below.
+          fetch-depth: 0
+          lfs: true
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # The checkout is on `master`, so a bare `main` refspec has no local
+        # source ref. Push the checked-out commit to the Space's `main` branch.
+        run: git push https://shivansh-ka:$HF_TOKEN@huggingface.co/spaces/shivansh-ka/Toxic-Comment-Classifier HEAD:main

README.md CHANGED Viewed

	@@ -1 +1,12 @@











1	# Multilingual-Toxic-Comment-Classifier

+---
+title: Toxic Comment Classifier
+emoji: 🌍
+colorFrom: blue
+colorTo: yellow
+sdk: streamlit
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
 # Multilingual-Toxic-Comment-Classifier

app.py CHANGED Viewed

	@@ -0,0 +1,46 @@

+import streamlit as st
+import pandas as pd
+from src import *
+single = SinglePrediction()
+batch = BatchPrediction()
+def single_predict(text):
+    st.success(f'{text} :thumbsup:')
+    preds = single.predict(text)
+    #st.plotly_chart(preds, theme=None, use_container_width=True)
+def batch_predict(data):
+    if batch.data_validation(data):
+        st.success(f'Data Validation Successfull :thumbsup:')
+        preds = batch.predict(data)
+        return preds.to_csv(index=False).encode('utf-8')
+    else:
+        st.error(f'Data Validation Failed :thumbsdown:')
+st.title('Toxic Comment Classifier')
+menu = ["Single Value Prediciton","Batch Prediction"]
+choice = st.sidebar.radio("Menu",menu)
+if choice=="Single Value Prediciton":
+    st.subheader("Prediction")
+    #comment = st.text_input("Comment", 'Enter your comment here')
+    #trigger = st.button('Predict', on_click=single_predict(comment))
+    form = st.form("my_form")
+    comment = form.text_input("Enter comment")
+    form.form_submit_button("Predict",on_click=single_predict(comment))
+else:
+    st.subheader("Batch Prediction")
+    csv_file = st.file_uploader("Upload Image",type=['csv','parquet'])
+    if csv_file is not None:
+        csv = batch_predict(csv_file)
+        st.download_button(
+            label="Predict and Download",
+            data=csv,
+            file_name='prediction.csv',
+            mime='text/csv',
+        )

docs/eda.md CHANGED Viewed

	@@ -1 +1,5 @@
1	- # EDA

+# EDA
+```
+```

docs/experiments.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # EXPERIMENTS PERFORMED
2	+
3	+ ## Results

mkdocs.yml CHANGED Viewed

@@ -4,7 +4,7 @@ site_name: Toxic Comment Classifier
 nav:
   - Home: index.md
   - EDA: eda.md
-  #- Featurization: featurization.md
 site_author: Shivansh Kaushal
 site_description: >-

 nav:
   - Home: index.md
   - EDA: eda.md
+  - Experiments: experiments.md
 site_author: Shivansh Kaushal
 site_description: >-

src/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+
2	+ from src.batch_predict import *
3	+ from src.single_predict import *

src/batch_predict.py CHANGED Viewed

	@@ -0,0 +1,45 @@

+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import transformers
+from transformers import AutoTokenizer
+import os
+from src.constants import *
+import re
+class BatchPrediction:
+    def __init__(self):
+        self.model = tf.keras.models.load_model(MODEL_PATH)
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+    def tokenizer(self, text:str):
+        tokens = self.tokenizer(text,
+                                max_length=MAX_LEN,
+                                truncation=True,
+                                padding="max_length",
+                                add_special_tokens=True,
+                                return_tensors="tf",
+                                return_token_type_ids = False)
+        return dict(tokens)
+    def data_validation(data):
+        df = pd.read_csv(data)
+        status=True
+        for column in df.columns:
+            if column not in ['id', 'comment_text']:
+                status=False
+        return status
+    def predict(self, data):
+        try:
+            df = pd.read_csv(data)
+            df.dropna(inplace=True)
+            df = df.comment_text.apply(lambda x: re.sub('\n',' ',x).strip())
+            input = self.tokenizer(df.comment_text.values.tolist())
+            preds = self.model.predict(input)
+            df['probabilities'] = preds
+            df['toxic'] = np.where(df['probabilities']>0.5, 1, 0)
+            return df
+        except Exception as e:
+            print(e)

src/constants.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import os
+ROOT_DIR = os.getcwd()
+MODEL_DIR_NAME = "serving_model"
+MODEL_NAME = "roberta-fine-tuned-2"
+MODEL_PATH = os.path.join(ROOT_DIR, MODEL_DIR_NAME,MODEL_NAME)
+TOKENIZER_FILE_NAME = "tokenizer"
+TOKENIZER_PATH = os.path.join(ROOT_DIR, MODEL_DIR_NAME, TOKENIZER_FILE_NAME)
+MAX_LEN =192
+BUFFER_SIZE=2048

src/single_predict.py CHANGED Viewed

	@@ -0,0 +1,33 @@

+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import transformers
+from transformers import AutoTokenizer
+import os
+from src.constants import *
+import re
+class SinglePrediction:
+    def __init__(self):
+        self.model = tf.keras.models.load_model(MODEL_PATH)
+        self.tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+    def tokenizer(self, text:str):
+        tokens = self.tokenizer(text,
+                                max_length=MAX_LEN,
+                                truncation=True,
+                                padding="max_length",
+                                add_special_tokens=True,
+                                return_tensors="tf",
+                                return_token_type_ids = False)
+        return dict(tokens)
+    def predict(self, text:str):
+        try:
+            text = re.sub('\n',' ',text).strip()
+            input = self.tokenizer(text)
+            preds = self.model.predict(input)[0][0]
+            return preds
+        except Exception as e:
+            print(e)