# -*- coding: utf-8 -*-
"""Detection_model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18hnebi4AGf55vyqvnxcZhk3oJWcZEyCu
"""

import pandas as pd
# Load the TSV dataset: one labelled message per row.
df = pd.read_csv('/content/dataset.tsv', sep='\t', names=["label", "message"])
df.head()

df.shape

# Independent feature: the message text
X = list(df['message'])

# Dependent feature: the label
Y = list(df['label'])

pd.get_dummies(Y, drop_first=True)

# Binary-encode the labels. With drop_first=True the surviving 0/1 column is
# named after the remaining label category, so select it by position.
Y = list(pd.get_dummies(Y, drop_first=True).iloc[:, 0])

Y
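# Illustration with hypothetical labels: two categories such as 'ham'/'spam'
# collapse into a single 0/1 indicator column once drop_first=True is applied.
pd.get_dummies(['ham', 'spam', 'ham'], drop_first=True)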

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

# pip install transformers

# Use the BERT tokenizer that matches the bert-base-uncased model fine-tuned below.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the train and test messages, truncating long ones and padding the rest
# to a common length.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

train_encodings
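# Illustrative only: encoding a single made-up message shows the fields the BERT
# model consumes: 'input_ids', 'token_type_ids' and 'attention_mask'.
sample = tokenizer("This is a sample message.", truncation=True, padding=True)
print(sample.keys())
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))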

import tensorflow as tf

# Wrap the tokenized inputs and labels in tf.data datasets for the trainer.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), Y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), Y_test))

train_dataset

from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Training arguments. Note: TFTrainer/TFTrainingArguments are deprecated in recent
# transformers releases, so this requires a version that still ships them.
training_args = TFTrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",   # evaluate periodically; "epoch" is the other common choice
    eval_steps=None,               # None falls back to the default logging interval
    save_total_limit=2,            # keep at most two checkpoints
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Instantiate the classification model inside the distribution strategy scope
# provided by the training arguments.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

trainer = TFTrainer(
    model=model,                  # the model to be fine-tuned
    args=training_args,           # training arguments defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset     # evaluation dataset
)

trainer.train()

trainer.evaluate(test_dataset)

# predict() returns a (predictions, label_ids, metrics) tuple; the raw logits are
# in the first element, and an argmax over them gives the predicted class ids.
predictions = trainer.predict(test_dataset)

predictions[0].shape

import numpy as np
output = np.argmax(predictions[0], axis=1)

# Build a confusion matrix of true vs. predicted labels on the test set.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, output)
cm
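# Quick sanity check derived from the confusion matrix: overall accuracy equals
# the trace (correct predictions) divided by the total number of test messages.
accuracy = cm.trace() / cm.sum()
accuracy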

# Save the fine-tuned model
trainer.save_model('detection_model')
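# Minimal inference sketch (assumptions: save_model() above wrote the weights and
# config to 'detection_model', and the original 'bert-base-uncased' tokenizer is
# reused here because the tokenizer itself was not saved alongside the model).
reloaded = TFBertForSequenceClassification.from_pretrained('detection_model')
sample_inputs = tokenizer(["This is a sample message."], truncation=True, padding=True, return_tensors="tf")
pred_class = tf.argmax(reloaded(sample_inputs).logits, axis=-1).numpy()
pred_class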