Jashveenraj committed on
Commit
00b7179
1 Parent(s): 7cb23f3

Upload detection_model.py

Browse files
Files changed (1) hide show
  1. detection_model.py +90 -0
detection_model.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# -*- coding: utf-8 -*-
"""Detection_model.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18hnebi4AGf55vyqvnxcZhk3oJWcZEyCu
"""

import pandas as pd

# The TSV is parsed with explicit column names, so pandas treats every line of
# the file (including a header line, if the file has one) as a data row.
df = pd.read_csv('/content/dataset.tsv', sep='\t', names=["label", "message"])

# If the first row merely repeats the column names it is a header, not a
# sample: drop it so that "label" is never treated as one of the classes.
if not df.empty and df.iloc[0].tolist() == list(df.columns):
    df = df.iloc[1:].reset_index(drop=True)

# Rows lacking a label or a message cannot be tokenised or trained on.
df = df.dropna(subset=["label", "message"]).reset_index(drop=True)

# Alias kept from the original notebook (`df = messages = ...`).
messages = df

print(df.head())
print(df.shape)

# independent feature
X = list(df['message'])

# dependent feature
# Encode the class names as integers 0..k-1 in sorted order. For a two-class
# dataset this is the same 0/1 column that one-hot encoding with
# drop_first=True would produce. The previous code indexed the dummy frame
# with ['label'], but dummy columns are named after the class *values*, so
# that lookup either raised KeyError or picked up a stray header row.
label_codes, label_classes = pd.factorize(df['label'], sort=True)
Y = label_codes.tolist()

# Show which integer stands for which original label.
print(dict(enumerate(label_classes)))

# train-test split: hold out 20% of the samples, with a fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

# pip install transformers

# we use bert tokenizer for our bert base model
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Each split is tokenised on its own, with truncation and padding enabled.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encoding = tokenizer(X_test, truncation=True, padding=True)

import tensorflow as tf

# Pair the tokenizer output (as a plain dict of features) with the labels.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), Y_train)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encoding), Y_test)
)

from transformers import (
    TFBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

# Training configuration.
# NOTE(review): with evaluation_strategy="steps", eval_steps=None presumably
# falls back to a library default rather than disabling periodic evaluation;
# confirm against the installed transformers version.
training_args = TFTrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",  # could also be "epoch"
    eval_steps=None,
    save_total_limit=2,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# The model is created inside the distribution strategy scope supplied by the
# training arguments.
with training_args.strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

trainer = TFTrainer(
    model=model,                  # the model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,    # evaluation dataset
)

trainer.train()

# Report the evaluation metrics on the held-out split.
print(trainer.evaluate(test_dataset))

# Run inference once and reuse the result; each predict() call performs a
# full pass over the test set.
prediction_output = trainer.predict(test_dataset)
print(prediction_output.predictions.shape)

# `predictions` holds the per-class scores (logits); the predicted class is
# the index of the largest score. Element [1] of the prediction output is
# `label_ids` (the ground truth), so using it here would compare the labels
# with themselves and always yield a perfect confusion matrix.
output = prediction_output.predictions.argmax(axis=-1)

# to create confusion matrix
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, output)
print(cm)

# saving our model
trainer.save_model('detection_model')