AliArshad committed on
Commit
be8976e
1 Parent(s): 44fecf0

Create code.py

Browse files
Files changed (1) hide show
  1. code.py +157 -0
code.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np
import pandas as pd
import torch
from google.colab import drive
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)
from transformers import (
    Trainer,
    TrainingArguments,
    XLNetForSequenceClassification,
    XLNetTokenizer,
)

# Mount Google Drive so the input spreadsheet can be read and outputs written.
drive.mount('/content/drive', force_remount=True)

# Path to the Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Keep only the columns used below
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of new_df.
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Collapse the fine-grained severities into two classes
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe',
}

# Numeric encoding of the two classes
label_mapping = {'non-severe': 0, 'severe': 1}

filtered_df['Severity Label'] = (
    filtered_df['Severity Label'].map(severity_mapping).map(label_mapping)
)

# Any severity not listed in severity_mapping (and any missing value, which
# passes the != 'normal' filter) maps to NaN. Drop those rows together with
# rows lacking a description or project, then force plain int labels and str
# texts so every remaining row has a usable label, text and project.
filtered_df = (
    filtered_df
    .dropna(subset=['Short Description', 'Severity Label', 'Project'])
    .astype({'Severity Label': int, 'Short Description': str})
)

####

# Initialize the XLNet tokenizer (the model itself is created per test project)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Define parameters
max_len = 100  # Max sequence length (in tokens) after truncation/padding
batch_size = 32
epochs = 5

# Evaluation results, keyed by the project that was held out as the test set
evaluation_results = {}

class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Defined once, outside the loop, because it does not depend on the fold.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences, as
        # returned by the tokenizer call below; labels: one label per example.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a 'labels' entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# Iterate through each unique project as the test set (leave-one-project-out)
for test_project in filtered_df['Project'].unique():

    # Select data for the current test project; everything else is training data
    is_test_row = filtered_df['Project'] == test_project
    test_data = filtered_df[is_test_row]
    train_data = filtered_df[~is_test_row]

    # With a single project there is nothing left to train on; skip the fold
    # instead of failing inside the trainer.
    if train_data.empty:
        print(f"Skipping project '{test_project}': no training data from other projects")
        continue

    # Reinitialize the model for each test project so folds do not share weights
    model = XLNetForSequenceClassification.from_pretrained(
        'xlnet-base-cased',
        num_labels=2,  # binary classification
    )

    # Split train and test data into texts and labels
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    # Create PyTorch datasets
    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                  # output directory
        num_train_epochs=epochs,                 # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=batch_size,   # batch size for evaluation
        warmup_steps=500,                        # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                       # strength of weight decay
        logging_dir='./logs',                    # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model to Google Drive. A '/' in the project name would be
    # read as a directory separator, so it is replaced in the file name.
    safe_project_name = str(test_project).replace('/', '_')
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{safe_project_name}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate evaluation metrics.
    # zero_division=0 keeps the score at 0 without a warning when a class is
    # never predicted or absent; labels=[0, 1] keeps the confusion matrix 2x2
    # even if the test project contains only one class.
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds, zero_division=0)
    recall = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds, labels=[0, 1])

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix,
    }

    # Drop this fold's model and trainer before the next fold creates new
    # ones, so two models are not held in memory at once.
    del trainer, model
    torch.cuda.empty_cache()

# Print evaluation results for all test projects
for project, results in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project}':")
    for metric, value in results.items():
        if metric == 'Confusion Matrix':
            # The matrix spans several lines, so print it under its own heading
            print(f"{metric}:")
            print(value)
        else:
            print(f"{metric}: {value}")
    print("------------------------------")

# Convert evaluation results to a DataFrame (one row per test project)
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')
df_results.index.name = 'Project'

# A spreadsheet cell cannot hold an array object, so write each confusion
# matrix as an explicit nested-list string such as "[[3, 1], [0, 4]]".
# The column check also covers the case where no project was evaluated.
if 'Confusion Matrix' in df_results.columns:
    df_results['Confusion Matrix'] = df_results['Confusion Matrix'].map(
        lambda matrix: str(np.asarray(matrix).tolist())
    )

# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")