"""Detection_model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18hnebi4AGf55vyqvnxcZhk3oJWcZEyCu
"""
|
|
|
import pandas as pd

# Load the tab-separated dataset. Column names are supplied through `names`,
# so pandas treats the first line of the file as data rather than as a header.
# `messages` is kept as a second name bound to the same DataFrame.
df = messages = pd.read_csv(
    '/content/dataset.tsv',
    sep='\t',
    names=["label", "message"],
)

# Notebook-style inspection of the first rows. `head` has to be called:
# the bare attribute is only the bound method, not the preview.
df.head()

# (number of rows, number of columns) of the loaded table.
df.shape
|
|
|
|
|
# Message texts, used as the model inputs.
X = list(df['message'])

# Class labels exactly as they appear in the file.
Y = list(df['label'])

# Indicator-encode the labels once, dropping the first category.
# `dtype=int` keeps the indicators numeric: since pandas 2.0 the default
# dtype of `get_dummies` is bool, which would produce boolean labels.
label_dummies = pd.get_dummies(Y, drop_first=True, dtype=int)
label_dummies

# NOTE(review): the dummy columns are named after the label *values*, so
# selecting 'label' only works if 'label' is one of the values found in the
# label column (e.g. a header row read as data) - confirm against the dataset.
Y = list(label_dummies['label'])

Y
|
|
|
|
|
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.20,
)
|
|
|
|
|
|
|
|
|
from transformers import BertTokenizer

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits with the same settings: padding and truncation are both
# enabled so every sequence in a split ends up with a uniform length.
train_encodings = tokenizer(
    X_train,
    padding=True,
    truncation=True,
)
test_encoding = tokenizer(
    X_test,
    padding=True,
    truncation=True,
)

train_encodings
|
|
|
import tensorflow as tf

# Each dataset element is a (features, label) pair: the tokenizer output is
# converted to a plain dict and sliced in step with the label list.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), Y_train)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encoding), Y_test)
)

train_dataset
|
|
|
from transformers import (
    TFBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

# Settings for the fine-tuning run.
training_args = TFTrainingArguments(
    # Where run artifacts are written, and how many checkpoints to keep.
    output_dir="./output",
    save_total_limit=2,
    # Evaluate on a step schedule; the step interval itself is left unset.
    evaluation_strategy="steps",
    eval_steps=None,
    # Training length and per-device batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
|
|
|
# Instantiate the pretrained classifier inside the scope of the strategy
# exposed by the training arguments.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

# Bundle the model, its settings and both datasets into a trainer.
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
|
|
|
# Fine-tune the model on the training dataset.
trainer.train()

# Compute evaluation metrics on the held-out dataset.
trainer.evaluate(test_dataset)

# Run prediction once and reuse the result; every `predict` call is another
# full pass over the test dataset.
prediction_output = trainer.predict(test_dataset)
prediction_output

# Shape of the raw model outputs - expected to be (samples, classes).
prediction_output.predictions.shape

# Element [1] of the prediction result is `label_ids`, i.e. the true labels,
# so using it as the model output would compare the labels with themselves.
# The predicted class is the index of the largest score in `predictions`.
output = prediction_output.predictions.argmax(axis=-1)

from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, output)
cm

# Persist the fine-tuned model to the 'detection_model' directory.
trainer.save_model('detection_model')