In [None]:
import os
import re
import string
import urllib
import urllib.request
import warnings

# Silence library warnings before the heavy third-party imports below run.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
#import tensorflow_gpu
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import precision_score, recall_score
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
from tensorflow.keras.models import Sequential
# Fetch the NLTK corpora requested by this notebook, in the original order.
for _nltk_resource in ('stopwords', 'omw-1.4', 'wordnet', 'wordnet2022'):
    nltk.download(_nltk_resource)

In [None]:
def tf_tpu_or_gpu(device: str='gpu'):
 if device.lower() == 'gpu':
 print("Setting up GPU.....")
 device_name = tf.test.gpu_device_name()
 if "GPU" not in device_name:
 print("GPU device not found")
 print('Found GPU at: {}'.format(device_name))
 config = tf.compat.v1.ConfigProto() 
 config.gpu_options.allow_growth = True 
 sess = tf.compat.v1.Session(config=config) 
 tf.compat.v1.keras.backend.set_session(sess)
 print(config)
 
 elif device.lower() == 'tpu':
 print("Setting up TPU.....")
 tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
 print('Running on TPU ', tpu.master())
 tf.config.experimental_connect_to_cluster(tpu)
 tf.tpu.experimental.initialize_tpu_system(tpu)
 tpu_strategy = tf.distribute.TPUStrategy(tpu)
 print("REPLICAS: ", tpu_strategy.num_replicas_in_sync)

 else:
 raise Exception("Wrong Device Paramter Passed")

In [None]:
tf_tpu_or_gpu(device='tpu')

In [None]:
class Config:
    """Central place for the notebook's tunable constants (data source, sizes, paths)."""

    # Remote CSV that is downloaded and the local file name it is stored under.
    URL = "https://raw.githubusercontent.com/nicknochnack/CommentToxicity/main/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv"
    FILE_NAME = "toxic_comment_data.csv"

    # Text vectorisation: vocabulary cap and fixed output sequence length.
    VOCAB_SIZE = 200_000
    OUTPUT_DIM = 1_800

    # tf.data pipeline: shuffle buffer and batch size.
    BUFFER_SIZE = 160_000
    BATCH_SIZE = 8 * 16

    # Training length.
    EPOCHS = 10

    # Output locations for logs and model checkpoints.
    BASE_LOG_DIR = "log_dir"
    CHECKPOINT_DIR = os.path.join(BASE_LOG_DIR, "models")

In [None]:
# Download the CSV only when it is not already on disk, so Restart & Run All
# does not re-fetch it every time.
if not os.path.exists(Config.FILE_NAME):
    urllib.request.urlretrieve(Config.URL, filename=Config.FILE_NAME)

# Read from the same path the download wrote to. The previous hard-coded
# "/kaggle/working/..." path only matched when that was the working directory.
data = pd.read_csv(Config.FILE_NAME)
data.head()

In [None]:
# Features: the raw comment text column.
X = data.loc[:, 'comment_text']
# Targets: every column from position 2 onward as a NumPy array
# (presumably the six toxicity labels - confirm against the CSV header).
y = data.iloc[:, 2:].values

In [None]:
# Display the comment-text Series selected above for a quick visual check.
X

In [None]:
# Display the label array extracted from the DataFrame.
y

In [None]:
class Text_Cleaner:
    """Normalise a pandas Series of raw text for vectorisation.

    Each entry goes through, in order: newline removal + lower-casing,
    punctuation removal, English stop-word removal, WordNet lemmatisation,
    and a final whitespace strip.

    Attributes:
        data: the Series being cleaned; replaced by the cleaned Series after
            ``clean_text()`` runs.
        STOPWORDS: collection of English stop words used for filtering.
        wordnet: lemmatiser exposing ``lemmatize(word)``.
    """

    # Built once: deletes every character in ``string.punctuation``.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, data):
        self.data = data
        # A frozenset makes each ``word in STOPWORDS`` check O(1) instead of a
        # linear scan of the list for every token of every comment.
        self.STOPWORDS = frozenset(stopwords.words('english'))
        self.wordnet = WordNetLemmatizer()

    def new_line_code(self, x: str) -> str:
        """Replace newlines with spaces, trim the ends and lower-case the text."""
        return x.replace("\n", ' ').strip().lower()

    def remove_punctuations(self, x: str) -> str:
        """Delete every ASCII punctuation character from ``x``."""
        return x.translate(self._PUNCT_TABLE)

    def remove_stopwords(self, x: str) -> str:
        """Drop whitespace-separated tokens that appear in ``self.STOPWORDS``."""
        # NOTE(review): punctuation is stripped before this step, so stop words
        # that contain an apostrophe presumably no longer match - confirm intent.
        stop_words = self.STOPWORDS
        return ' '.join(word for word in x.split() if word not in stop_words)

    def lemmatization(self, x: str) -> str:
        """Lemmatise every whitespace-separated token with ``self.wordnet``."""
        lemmatize = self.wordnet.lemmatize
        return ' '.join(lemmatize(word) for word in x.split())

    def _clean_one(self, x: str) -> str:
        """Run the full cleaning pipeline on a single string."""
        x = self.new_line_code(x)
        x = self.remove_punctuations(x)
        x = self.remove_stopwords(x)
        x = self.lemmatization(x)
        return x.strip()

    def clean_text(self):
        """Clean every entry of ``self.data`` and return the cleaned Series.

        Missing values become empty strings and other non-string entries are
        converted with ``str`` so that a stray NaN does not abort the whole run.
        The result is also stored back on ``self.data``.
        """
        # One pass over the Series instead of five separate ``apply`` calls.
        self.data = self.data.fillna("").astype(str).apply(self._clean_one)
        return self.data

In [None]:
# Replace the raw comments with their cleaned form. X is overwritten, so
# re-running this cell alone cleans the already-cleaned text again.
X = Text_Cleaner(X).clean_text()

In [None]:
# Display the cleaned comments to verify the text-cleaning step.
X

In [None]:
# Map each cleaned comment to a fixed-length sequence of integer token ids.
vectorizer = TextVectorization(
    max_tokens=Config.VOCAB_SIZE,
    output_sequence_length=Config.OUTPUT_DIM,
    output_mode='int',
)
# Learn the vocabulary from the cleaned comments, then encode them.
vectorizer.adapt(X.values)
# A stray trailing backtick on this line made the cell a SyntaxError; removed.
vectorized_text = vectorizer(X.values)

In [None]:
# Build the input pipeline: (token ids, labels) -> cache -> shuffle -> batch -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# The train/validation split is taken from this dataset with take()/skip().
# With the default reshuffle on every iteration, each epoch would draw a
# different permutation, so validation examples would leak into training.
# Shuffle once and keep that order for every pass.
dataset = dataset.shuffle(Config.BUFFER_SIZE, reshuffle_each_iteration=False)
dataset = dataset.batch(Config.BATCH_SIZE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# 80/20 split measured in batches. The validation set takes *all* remaining
# batches: int(n*0.8) + int(n*0.2) can be smaller than n, which previously
# dropped the last batch silently. No separate test split is held out.
n_batches = len(dataset)
n_train_batches = int(n_batches * 0.8)
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches)

In [None]:
def callbacks(base_dir="."):
    """Create the Keras callbacks used during training.

    Args:
        base_dir: directory under which ``Config.CHECKPOINT_DIR`` is placed.
            The default ``"."`` keeps checkpoints in the working directory.
            (The argument was previously accepted but ignored.)

    Returns:
        A list ``[EarlyStopping, ModelCheckpoint]``: training stops after two
        epochs without ``val_loss`` improvement, and only the best model is
        written to ``<base_dir>/<CHECKPOINT_DIR>/model``.
    """
    ckpt_path = os.path.join(base_dir, Config.CHECKPOINT_DIR, "model")
    os.makedirs(ckpt_path, exist_ok=True)

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=ckpt_path,
        save_best_only=True,
    )
    return [early_stopping, ckpt_cb]


callbacks_list = callbacks()

In [None]:
def create_model():
    """Assemble the (uncompiled) Sequential classifier.

    Architecture: a 32-dimensional embedding over ``Config.VOCAB_SIZE + 1``
    ids, two stacked bidirectional LSTMs (64 units returning sequences, then
    32 units), three ReLU dense layers (128, 256, 128) and a 6-unit sigmoid
    output layer.

    Returns:
        The constructed ``Sequential`` model.
    """
    hidden_widths = (128, 256, 128)
    model = Sequential([
        Embedding(Config.VOCAB_SIZE + 1, 32),
        Bidirectional(LSTM(64, return_sequences=True, activation='tanh')),
        Bidirectional(LSTM(32)),
        *[Dense(width, activation='relu') for width in hidden_widths],
        Dense(6, activation='sigmoid'),
    ])
    return model

In [None]:
# Create and compile the model inside the strategy scope so its variables are
# placed according to the distribution strategy.
with tpu_strategy.scope():
    model = create_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss=tf.keras.losses.binary_crossentropy,
        # ``AUC`` was referenced without ever being imported (NameError);
        # use the fully-qualified name and pass metrics as a list.
        metrics=[tf.keras.metrics.AUC(multi_label=True, num_labels=6)],
    )
model.summary()

In [None]:
# Train on the training batches, validating on `val` after every epoch.
# The callbacks list supplies early stopping and best-model checkpointing.
history = model.fit(train, 
 epochs=Config.EPOCHS,
 steps_per_epoch=len(train),
 validation_data=val,
 callbacks=callbacks_list)

In [None]:
def model_evaluation(model, pred_data, y_true):
    """Score ``model`` on ``pred_data`` against ``y_true`` with macro-averaged metrics.

    Args:
        model: object exposing ``predict(pred_data)`` that returns per-label scores.
        pred_data: model inputs (the notebook passes a NumPy array of token ids).
        y_true: ground-truth binary label matrix aligned with ``pred_data``.

    Returns:
        Tuple ``(precision, recall, f1, auc)``. Precision, recall and F1 are
        computed on scores thresholded at 0.5; ROC-AUC uses the raw scores and
        is ``nan`` when scikit-learn cannot compute it.
    """
    y_score = model.predict(pred_data)
    # Threshold once instead of recomputing the same array for every metric.
    y_label = (y_score > 0.5).astype(int)

    precision = precision_score(y_true, y_label, average="macro")
    recall = recall_score(y_true, y_label, average="macro")
    f1 = f1_score(y_true, y_label, average="macro")

    # Only the AUC is best-effort. The old blanket ``except Exception`` hid the
    # real error and then crashed on the print below with an unbound variable.
    try:
        auc = roc_auc_score(y_true, y_score, average="macro")
    except ValueError as e:
        # Presumably raised when a label column holds a single class - confirm.
        print(e)
        auc = float("nan")

    print(f"Precision: {precision}\n"
          f"Recall: {recall}\n"
          f"F1-Score: {f1}\n"
          f"ROC-AUC-Score: {auc}")
    return (precision, recall, f1, auc)

In [None]:
# Persist the trained model to "model_3.h5" (relative to the working directory).
model.save("model_3.h5")

In [None]:
# Iterate the training split ONCE so every feature batch stays paired with its
# own label batch. Two separate passes over a shuffled dataset are not
# guaranteed to return batches in the same order, which would misalign x and y.
train_batches = list(train)
x_train = np.concatenate([features for features, _ in train_batches])
y_train = np.concatenate([labels for _, labels in train_batches])
result_train = model_evaluation(model=model, pred_data=x_train, y_true=y_train)

In [None]:
# Single pass over the validation split keeps features and labels aligned
# (separate passes over a shuffled dataset may return different orders).
val_batches = list(val)
x_val = np.concatenate([features for features, _ in val_batches])
y_val = np.concatenate([labels for _, labels in val_batches])
# Stored under its own name; it used to overwrite ``result_train``.
result_val = model_evaluation(model=model, pred_data=x_val, y_true=y_val)