File size: 5,598 Bytes
be8976e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import gc

import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Mount Google Drive so the input spreadsheet and the output paths are reachable
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to your Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Keep only the columns used below: text, raw severity, and the project
# that defines the cross-project train/test splits
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity.
# The explicit .copy() makes filtered_df an independent frame, so the label
# assignments below modify it directly instead of writing through a slice of
# new_df (which is what triggers pandas' SettingWithCopyWarning).
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Raw severity -> binary class name
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}

# Binary class name -> numeric class id
label_mapping = {'non-severe': 0, 'severe': 1}

# Apply both mappings in one pass; any severity that is not a key of
# severity_mapping ends up as NaN
filtered_df['Severity Label'] = (
    filtered_df['Severity Label'].map(severity_mapping).map(label_mapping)
)

# Drop rows that cannot be used for training: unmapped severities (NaN label)
# and reports with no description. Left in place, NaN labels would only fail
# later, inside the training step.
filtered_df = filtered_df.dropna(subset=['Short Description', 'Severity Label'])

# With the NaNs gone the labels can be stored as plain integers (0 / 1)
filtered_df['Severity Label'] = filtered_df['Severity Label'].astype(int)


####

# Initialize XLNet tokenizer (the model itself is created per test project)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')


# Define parameters
max_len = 100  # Max sequence length
batch_size = 32
epochs = 5

# Initialize evaluation results dictionary (test project -> metrics dict)
evaluation_results = {}

# Dataset wrapper used for both the train and the test split. It is defined
# once, ahead of the loop, instead of being re-created on every iteration.
class CustomDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence, as
            produced by the tokenizer calls in the loop below.
        labels: One class id per example, in the same order as the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors plus its ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an integer (long) tensor: a float label would fail in the
        # classification loss, and int() raises immediately on a NaN label.
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)


# References to the previous fold's model/trainer; released before each new fold
model = None
trainer = None

# Iterate through each unique project as the test set (leave-one-project-out)
for test_project in filtered_df['Project'].unique():

    # Select data for the current test project; all other projects form the train set
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    # A fold needs data on both sides: with a single project there is nothing
    # to train on, and a NaN project key matches no test rows.
    if train_data.empty or test_data.empty:
        print(f"Skipping test project '{test_project}': empty train or test split")
        continue

    # Drop the previous fold's model and trainer before loading a new model so
    # both are not held in memory at once. The last fold's objects stay bound
    # after the loop, as before.
    model = None
    trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Reinitialize the model for each test project so no weights carry over between folds
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)  # Define num_labels for binary classification

    # Split train and test data into texts and labels.
    # astype(str) guards against non-string cells (e.g. descriptions that Excel
    # stored as numbers), which the tokenizer would reject; note that a missing
    # value becomes the literal string 'nan'.
    train_texts = train_data['Short Description'].astype(str).tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].astype(str).tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data (every sequence truncated/padded to max_len)
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    # Create PyTorch datasets
    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        num_train_epochs=epochs,  # total number of training epochs
        per_device_train_batch_size=batch_size,  # batch size per device during training
        per_device_eval_batch_size=batch_size,   # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
    )

    # Define trainer (no eval dataset: the held-out project is scored via predict() below)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model weights to Google Drive, one file per held-out project
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{test_project}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions
    # predict() may return a tuple of arrays; the class logits come first
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)

    # Calculate evaluation metrics.
    # zero_division=0 keeps the score at 0.0 (what sklearn returns by default)
    # without a warning when a fold has no predicted or no true positives.
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds, zero_division=0)
    recall = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    mcc = matthews_corrcoef(test_labels, preds)
    # labels=[0, 1] pins the matrix to 2x2 even if a project contains one class only
    conf_matrix = confusion_matrix(test_labels, preds, labels=[0, 1])

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }

# Dump the metrics of every held-out project to stdout; the confusion matrix
# is printed on its own lines below its name, all other metrics inline.
separator = "------------------------------"
for project_name, project_metrics in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project_name}':")
    for metric_name, metric_value in project_metrics.items():
        if metric_name == 'Confusion Matrix':
            print(f"{metric_name}:")
            print(metric_value)
            continue
        print(f"{metric_name}: {metric_value}")
    print(separator)

# One row per test project, one column per metric
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Persist the table as an Excel workbook on Google Drive
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")