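# Source: code.py, uploaded by AliArshad (commit be8976e, "Create code.py"; 5.6 kB, no virus detected).
# The lines above the imports were file-viewer page text (raw / history / blame links), kept here as a comment.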
import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
# Mount Google Drive so the input workbook (and later the outputs) are reachable
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to the Excel file in Google Drive that holds the bug reports
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Keep only the columns used downstream: report text, severity and project
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity. The explicit .copy() makes
# filtered_df an independent frame, so the column assignments below never
# write into a view of new_df (no SettingWithCopyWarning).
filtered_df = new_df[new_df['Severity Label'] != 'normal'].copy()

# Collapse the fine-grained severities into two classes
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe',
}

# Numeric class ids expected by the classifier
label_mapping = {'non-severe': 0, 'severe': 1}

# Apply both mappings in one pass: raw severity -> class name -> class id
filtered_df['Severity Label'] = (
    filtered_df['Severity Label'].map(severity_mapping).map(label_mapping)
)

# Series.map() yields NaN for every severity that is not a key of
# severity_mapping. Such rows (and rows without a description) cannot be
# tokenized or used as classification targets, so drop them instead of
# letting NaN / float labels reach the model.
filtered_df = filtered_df.dropna(subset=['Short Description', 'Severity Label'])

# With the NaNs gone the labels can be stored as real integers, and the
# descriptions are forced to str so the tokenizer never receives a number.
filtered_df['Severity Label'] = filtered_df['Severity Label'].astype(int)
filtered_df['Short Description'] = filtered_df['Short Description'].astype(str)
####
# Hyper-parameters shared by every fold
epochs = 5        # number of training epochs per fold
batch_size = 32   # per-device batch size for training and evaluation
max_len = 100     # max sequence length passed to the tokenizer

# XLNet tokenizer, loaded once and reused for all folds
# (the classification model itself is created later, per test project)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Per-project evaluation metrics, keyed by the held-out project name
evaluation_results = dict()
import gc


# Create PyTorch datasets
class CustomDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with their labels for use by the Trainer.

    Declared once, before the fold loop, instead of being re-created on
    every iteration.

    Args:
        encodings: mapping returned by the tokenizer; each value is indexable
            by example position.
        labels: sequence of class ids, aligned with the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the target under 'labels'
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


# Iterate through each unique project as the test set
for test_project in filtered_df['Project'].unique():
    # Drop the previous fold's model and trainer before a new model is
    # loaded; otherwise both generations are alive at the same time and
    # peak accelerator memory roughly doubles.
    model = None
    trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Select data for the current test project
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    # A fold needs examples on both sides: training data is empty when the
    # file holds a single project, and test data is empty for a NaN project
    # (NaN never compares equal to itself).
    if train_data.empty or test_data.empty:
        print(f"Skipping project '{test_project}': empty train or test split")
        continue

    # Reinitialize the model for each test project (binary classification)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

    # Split train and test data into plain Python lists
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data to a fixed length of max_len
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model weights to Google Drive, one file per held-out project
    model_save_path = '/content/drive/My Drive/XLNet_model_project_{}.pt'.format(test_project)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")

    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=1)

    # Calculate evaluation metrics. zero_division=0 keeps the value sklearn
    # already returns for undefined precision/recall/F1 but without the
    # warning; labels=[0, 1] forces a 2x2 confusion matrix even when a
    # project (or the predictions) contain only one class.
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds, zero_division=0)
    recall = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds, labels=[0, 1])

    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix,
    }
# Report the metrics of every held-out project on stdout
for project_name, project_metrics in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project_name}':")
    for metric_name, metric_value in project_metrics.items():
        if metric_name == 'Confusion Matrix':
            # The matrix spans several lines, so it goes below its heading
            print(f"{metric_name}:")
            print(metric_value)
            continue
        print(f"{metric_name}: {metric_value}")
    print("------------------------------")

# One row per project, one column per metric
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Persist the table as an Excel workbook in Google Drive
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")