import numpy as np
import pandas as pd
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix
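# Note (added): on a fresh Colab runtime the transformers library may need
# installing first, e.g. `!pip install transformers`; exact version pinning
# is left to the reader.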
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Path to the Excel file in Google Drive
file_path = '/content/drive/My Drive/filtered_data.xlsx'

# Read the Excel file into a pandas DataFrame
df = pd.read_excel(file_path)

# Select only the necessary columns
selected_columns = ['Short Description', 'Severity Label', 'Project']
new_df = df[selected_columns].copy()

# Exclude bug reports with 'normal' severity
filtered_df = new_df[new_df['Severity Label'] != 'normal']

# Collapse the remaining severities into a binary scheme
severity_mapping = {
    'blocker': 'severe',
    'critical': 'severe',
    'major': 'severe',
    'trivial': 'non-severe',
    'minor': 'non-severe'
}

# Replace severity labels according to the mapping
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(severity_mapping)

# Map the string labels to numeric representations
label_mapping = {'non-severe': 0, 'severe': 1}
filtered_df.loc[:, 'Severity Label'] = filtered_df['Severity Label'].map(label_mapping)
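# Optional sanity check (added): inspect the class balance per project before
# training; heavily skewed projects can make precision/recall unstable.
print(filtered_df.groupby('Project')['Severity Label'].value_counts())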
#### Model setup and leave-one-project-out evaluation ####

# Initialize the XLNet tokenizer (the model is re-initialized per fold below)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Define parameters
max_len = 100   # max sequence length
batch_size = 32
epochs = 5

# Initialize evaluation results dictionary
evaluation_results = {}
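# Note (added): the loop below performs leave-one-project-out evaluation:
# each project serves once as the held-out test set while all remaining
# projects form the training set, and the model is re-initialized per fold
# so no weights leak between folds.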
# Iterate through each unique project as the test set
for test_project in filtered_df['Project'].unique():
    # Reinitialize the model for each test project (binary classification)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

    # Select data for the current test project
    test_data = filtered_df[filtered_df['Project'] == test_project]
    train_data = filtered_df[filtered_df['Project'] != test_project]

    # Split train and test data into texts and labels
    train_texts = train_data['Short Description'].tolist()
    train_labels = train_data['Severity Label'].tolist()
    test_texts = test_data['Short Description'].tolist()
    test_labels = test_data['Severity Label'].tolist()

    # Tokenize train and test data
    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)
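    # Aside (added): padding='max_length' pads every example to max_len tokens;
    # dynamic padding (e.g. transformers' DataCollatorWithPadding) would likely
    # be faster, but the fixed-length setup above is kept as written.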
    # Create PyTorch datasets
    class CustomDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = CustomDataset(train_encodings, train_labels)
    test_dataset = CustomDataset(test_encodings, test_labels)
    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',                   # output directory
        num_train_epochs=epochs,                  # total number of training epochs
        per_device_train_batch_size=batch_size,   # batch size per device during training
        per_device_eval_batch_size=batch_size,    # batch size for evaluation
        warmup_steps=500,                         # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,                        # strength of weight decay
        logging_dir='./logs',                     # directory for storing logs
    )
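    # Note (added): no evaluation strategy is configured because evaluation
    # happens manually after training; if a Colab GPU runs out of memory at
    # this batch size, lowering batch_size (or max_len) is the usual first fix.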
    # Define the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model weights to Google Drive
    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{test_project}.pt'
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to '{model_save_path}'")
    # Evaluate the model on the held-out project
    predictions = trainer.predict(test_dataset)
    # Depending on the transformers version, XLNet can return extra outputs
    # (e.g. mems) alongside the logits, making predictions.predictions a
    # tuple whose first element holds the logits; handle both cases.
    logits = predictions.predictions[0] if isinstance(predictions.predictions, tuple) else predictions.predictions
    preds = np.argmax(logits, axis=1)

    # Calculate evaluation metrics (the positive class is 'severe' = 1)
    accuracy = accuracy_score(test_labels, preds)
    precision = precision_score(test_labels, preds)
    recall = recall_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)
    mcc = matthews_corrcoef(test_labels, preds)
    conf_matrix = confusion_matrix(test_labels, preds)
    # Store evaluation results for the current test project
    evaluation_results[test_project] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'MCC': mcc,
        'Confusion Matrix': conf_matrix
    }
# Print evaluation results for all test projects
for project, results in evaluation_results.items():
    print(f"Evaluation results for Test Project '{project}':")
    for metric, value in results.items():
        if metric != 'Confusion Matrix':
            print(f"{metric}: {value}")
        else:
            print(f"{metric}:")
            print(value)
    print("------------------------------")
# Convert evaluation results to a DataFrame (one row per test project)
df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

# Save results to an Excel file
excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
df_results.to_excel(excel_file_name)
print(f"Results saved to '{excel_file_name}'")