In [None]:
!pip install nltk scikit-learn

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
import tensorflow as tf
#import tensorflow_gpu
import urllib
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy, AUC
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import string
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('wordnet2022')

In [None]:
def tf_tpu_or_gpu(device: str='gpu'):
    if device.lower() == 'gpu':
        print("Setting up GPU.....")
        device_name = tf.test.gpu_device_name()
        if "GPU" not in device_name:
            print("GPU device not found")
        print('Found GPU at: {}'.format(device_name))
        
        config = tf.compat.v1.ConfigProto() 
        config.gpu_options.allow_growth = True 
        sess = tf.compat.v1.Session(config=config) 
        tf.compat.v1.keras.backend.set_session(sess)
        
        print(config)
    
    elif device.lower() == 'tpu':
        print("Setting up TPU.....")
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        tpu_strategy = tf.distribute.TPUStrategy(tpu)
        print("REPLICAS: ", tpu_strategy.num_replicas_in_sync)
        
    else:
        raise Exception("Wrong Device Paramter Passed")

In [None]:
tf_tpu_or_gpu(device='tpu')

In [4]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
print('Running on TPU ', tpu.master())
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)
print("REPLICAS: ", tpu_strategy.num_replicas_in_sync)

Running on TPU  
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.
INFO:tensorflow:Initializing the TPU system: local
INFO:tensorflow:Finished initializing TPU system.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:0, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:1, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:2, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:TPU:3, TPU, 0, 0)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/

device_name = tf.test.gpu_device_name()
if "GPU" not in device_name:
    print("GPU device not found")
print('Found GPU at: {}'.format(device_name))

config = tf.compat.v1.ConfigProto() 
config.gpu_options.allow_growth = True 
sess = tf.compat.v1.Session(config=config) 
tf.compat.v1.keras.backend.set_session(sess)

In [5]:
class Config:
    URL = f"https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv"
    FILE_NAME = "toxic_comment_data.csv"
    VOCAB_SIZE = 200000
    OUTPUT_DIM = 1800
    BUFFER_SIZE = 160000
    BATCH_SIZE = 16*8
    EPOCHS = 10
    BASE_LOG_DIR = "log_dir"
    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR,"models")

In [6]:
data =urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)
data = pd.read_csv("/kaggle/working/toxic_comment_data.csv")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [8]:
data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [9]:
data[data.columns.to_list()[2:]].apply(pd.Series.value_counts).T

Unnamed: 0,0,1
toxic,144277,15294
severe_toxic,157976,1595
obscene,151122,8449
threat,159093,478
insult,151694,7877
identity_hate,158166,1405


In [10]:
for column in data.columns:
    if data[column].dtype != 'O':
        value_count = data[column].value_counts()
        print(f"{column} value count\n{'--'*10}")
        print(f"0: {value_count[0]} | {round((value_count[0]/data.shape[0])*100,2)} %\n"
              f"1: {value_count[1]} | {round((value_count[1]/data.shape[0])*100,2)} %\n")

toxic value count
--------------------
0: 144277 | 90.42 %
1: 15294 | 9.58 %

severe_toxic value count
--------------------
0: 157976 | 99.0 %
1: 1595 | 1.0 %

obscene value count
--------------------
0: 151122 | 94.71 %
1: 8449 | 5.29 %

threat value count
--------------------
0: 159093 | 99.7 %
1: 478 | 0.3 %

insult value count
--------------------
0: 151694 | 95.06 %
1: 7877 | 4.94 %

identity_hate value count
--------------------
0: 158166 | 99.12 %
1: 1405 | 0.88 %



In [None]:
data["text_len"] = data["comment_text"].apply(lambda x: len(x.split()))
data[data["text_len"]==data["text_len"].max()]['comment_text']

In [11]:
X = data['comment_text']
y = data[data.columns[2:]].values

In [12]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [13]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

### Text Preprocessing

In [15]:
class Text_Cleaner:
    def __init__(self, data):
        self.data = data
        self.STOPWORDS = stopwords.words('english')
        self.wordnet = WordNetLemmatizer()
        
    def new_line_code(self, x:str)->str:
        pattern = "\n"
        x = re.sub(pattern,' ', x).strip().lower()
        return x

    def remove_punctuations(self, x:str)->str:
        x = x.translate(str.maketrans('','',string.punctuation))
        return x

    def remove_stopwords(self, x:str)->str:
        sent=[]
        for word in x.split():
            if word not in self.STOPWORDS:
                sent.append(word)
        return ' '.join(sent)

    def lemmatization(self, x:str)->str:
        sent=[]
        for word in x.split():
            sent.append(self.wordnet.lemmatize(word))
        return ' '.join(sent)
    
    def clean_text(self):
        self.data = self.data.apply(self.new_line_code)
        self.data = self.data.apply(self.remove_punctuations)
        self.data = self.data.apply(self.remove_stopwords)
        self.data = self.data.apply(self.lemmatization)
        self.data = self.data.apply(lambda x: x.strip())
        return self.data

In [16]:
X = Text_Cleaner(X).clean_text()

In [17]:
X

0         explanation edits made username hardcore metal...
1         daww match background colour im seemingly stuc...
2         hey man im really trying edit war guy constant...
3         cant make real suggestion improvement wondered...
4                       sir hero chance remember page thats
                                ...                        
159566    second time asking view completely contradicts...
159567       ashamed horrible thing put talk page 128611993
159568    spitzer umm there actual article prostitution ...
159569    look like actually put speedy first version de...
159570    really dont think understand came idea bad rig...
Name: comment_text, Length: 159571, dtype: object

### Model Building

In [None]:
vectorizer = TextVectorization(max_tokens=Config.VOCAB_SIZE,
                               output_sequence_length=Config.OUTPUT_DIM,
                               output_mode='int')
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)

In [37]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(Config.BUFFER_SIZE)
dataset = dataset.batch(Config.BATCH_SIZE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [38]:
train = dataset.take(int(len(dataset)*0.8))
val = dataset.skip(int(len(dataset)*0.8)).take(int(len(dataset)*0.2))
#test = dataset.skip(int(len(dataset)*0.9)).take(int(len(dataset)*0.1))

In [35]:
def create_model():
    
    LAYERS = [
              Embedding(Config.VOCAB_SIZE+1, 32),
              Bidirectional(LSTM(64, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
              Bidirectional(LSTM(32)),
              Dense(128, activation='relu'),
              Dropout(0.1),
              Dense(256, activation='relu'),
              Dropout(0.1),
              Dense(128, activation='relu'),
              Dense(6, activation='sigmoid')]
    
    model = Sequential(LAYERS)
    return model

In [34]:
def callbacks(base_dir="."):
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
    ckpt_file = os.path.join(Config.CHECKPOINT_DIR,"model")
    os.makedirs(ckpt_file,exist_ok=True)

    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
      filepath = ckpt_file,
      save_best_only = True)

    callback_list = [early_stopping,
                     ckpt_cb]
    return callback_list
callbacks_list = callbacks()

In [36]:
with tpu_strategy.scope():
    model = create_model()
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss=tf.keras.losses.binary_crossentropy,
                  metrics=AUC(multi_label=True, num_labels=6))

In [39]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_4 (Bidirectio  (None, None, 128)        49664     
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 256)              

In [24]:
len(train)

997

In [40]:
history = model.fit(train, 
                    epochs=Config.EPOCHS,
                    steps_per_epoch=len(train),
                    validation_data=val,
                    callbacks=callbacks_list)

Epoch 1/10


2023-05-03 14:42:57.854226: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.
2023-05-03 14:42:58.165317: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add_42/ReadVariableOp.




2023-05-03 15:05:36.690047: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.
2023-05-03 15:05:36.851778: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Add/ReadVariableOp.


INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 2/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 3/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 4/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 5/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 6/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 7/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 8/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 9/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets


Epoch 10/10



INFO:tensorflow:Assets written to: log_dir/models/model/assets


INFO:tensorflow:Assets written to: log_dir/models/model/assets




In [42]:
def model_evaluation(model, vectorizer: TextVectorization, pred_data: pd.Series, y_true):
    #pred_data = Text_Cleaner(pred_data).clean_text()
    #vectorized_text = vectorizer(pred_data)
    y_pred = model.predict(pred_data)
    try:
        precision = precision_score(y_true, (y_pred>0.5).astype(int), average="macro")
        recall = recall_score(y_true, (y_pred>0.5).astype(int), average="macro")
        f1 = f1_score(y_true, (y_pred>0.5).astype(int), average="macro")
        auc = roc_auc_score(y_true, y_pred, average="macro")
    except Exception as e:
        print(e)
        
    print(f"Precision: {precision}\n"
          f"Recall: {recall}\n"
          f"F1-Score: {f1}\n"
          f"ROC-AUC-Score: {auc}")
    return (precision, recall, f1, auc)

In [None]:
model.evaluate(test)

In [41]:
model.save("model_4.h5")

In [55]:
x_train = np.concatenate([x for x, y in train])
y_train = np.concatenate([y for x, y in train])
result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_train, y_true=y_train)

Precision: 0.034067329786671804
Recall: 0.03396435372259718
F1-Score: 0.03375883387877523
ROC-AUC-Score: 0.4963643308231378


In [53]:
x_val = np.concatenate([x for x, y in val])
y_val = np.concatenate([y for x, y in val])
result_train=model_evaluation(model=model, vectorizer=vectorizer, pred_data=x_val, y_true=y_val)

Precision: 0.03615509646190422
Recall: 0.03674059129986899
F1-Score: 0.03625622443975915
ROC-AUC-Score: 0.4868083116383068
