# -*- coding: utf-8 -*-
"""Detection_model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18hnebi4AGf55vyqvnxcZhk3oJWcZEyCu

Fine-tunes ``bert-base-uncased`` as a sequence classifier on a tab-separated
file of (label, message) rows and evaluates it on a 20% hold-out split.
"""

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    TFBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

# Because `names=` is given without `header=`, pandas treats the first line of
# the file as data, not as a header.
# NOTE(review): if dataset.tsv does start with a header line, also pass
# `header=0`, otherwise that line becomes a training example -- confirm.
df = messages = pd.read_csv(
    '/content/dataset.tsv',
    sep='\t',
    names=["label", "message"],
)

# Bare expressions only display in a notebook; print so a script run shows them.
print(df.head())
print(df.shape)

# independent feature
X = list(df['message'])

# dependent feature
# Encode every distinct label as an integer class id. Ids follow the sorted
# order of the label values, so a two-class dataset yields 0/1 -- the same
# assignment `get_dummies(..., drop_first=True)` would give.
# `pd.get_dummies` is not used here: it names its columns after the label
# values (there is no 'label' column to index), and recent pandas versions
# return booleans from it rather than integers.
label_categories = pd.Categorical(df['label'])
Y = label_categories.codes.astype(int).tolist()

# Size the classification head from the data; never below the library's
# default of two classes.
num_labels = max(2, len(label_categories.categories))

# train-test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0
)

# pip install transformers
# we use bert tokenizer for our bert base model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Each split is padded to the longest sequence in that split.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encoding = tokenizer(X_test, truncation=True, padding=True)

# Pair the tokenizer outputs (as a feature dict) with the integer labels.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), Y_train)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encoding), Y_test)
)

# Define your training arguments
# NOTE(review): with evaluation_strategy="steps", leaving eval_steps as None
# does not obviously disable periodic evaluation -- transformers appears to
# fall back to the logging interval; confirm for the installed version.
training_args = TFTrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",  # You might also set this to "epoch"
    eval_steps=None,
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# The model has to be created under the distribution strategy's scope.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels,
    )

# NOTE(review): TFTrainer is deprecated in recent transformers releases in
# favour of Keras `model.fit`; confirm the pinned version still provides it.
trainer = TFTrainer(
    model=model,                 # the instantiated model to be trained
    args=training_args,          # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,   # evaluation dataset
)

trainer.train()

# Metrics on the held-out split.
eval_metrics = trainer.evaluate(test_dataset)
print(eval_metrics)
import numpy as np

# Run inference on the held-out set once and reuse the result; every
# `trainer.predict` call is another full pass over the test data.
prediction_output = trainer.predict(test_dataset)

# The prediction result is a (predictions, label_ids, metrics) tuple.
# Index 1 is `label_ids`, i.e. the ground-truth labels echoed back, so the
# model's scores must be taken from `predictions` instead.
logits = np.asarray(prediction_output.predictions)
print(logits.shape)

# Turn per-class scores into one predicted class id per example.
output = np.argmax(logits, axis=-1)

# to create confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, output)
print(cm)

# saving our model
trainer.save_model('detection_model')