Spaces:

AliArshad
/

SeverityPrediction

Runtime error

App Files Files Community

SeverityPrediction / code.py

AliArshad

Create code.py

be8976e 9 months ago

raw

history blame

No virus

5.6 kB

	import numpy as np
	import pandas as pd
	import torch
	from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

	# Mount Google Drive; the dataset is read from it and the outputs are
	# written back to it further down the script.
	from google.colab import drive
	drive.mount('/content/drive', force_remount=True)

	# Path to the Excel file in Google Drive
	file_path = '/content/drive/My Drive/filtered_data.xlsx'

	# Read the Excel file into a pandas DataFrame
	df = pd.read_excel(file_path)

	# Keep only the columns used by the rest of the script
	selected_columns = ['Short Description', 'Severity Label', 'Project']
	new_df = df[selected_columns].copy()

	# Collapse the raw severities into two classes. 'normal' is deliberately
	# absent from the mapping: those reports are excluded from the experiment.
	severity_mapping = {
	    'blocker': 'severe',
	    'critical': 'severe',
	    'major': 'severe',
	    'trivial': 'non-severe',
	    'minor': 'non-severe',
	}

	# Numeric class ids for the two classes
	label_mapping = {'non-severe': 0, 'severe': 1}

	# Map raw severity -> class name -> class id on a standalone Series.
	# Series.map() yields NaN for every value missing from the mapping
	# ('normal' and any other unexpected label).
	numeric_labels = new_df['Severity Label'].map(severity_mapping).map(label_mapping)

	# Keep only rows that have both a usable label and a description.
	keep_rows = numeric_labels.notna() & new_df['Short Description'].notna()

	# .copy() makes filtered_df an independent frame, so the column
	# assignments below cannot hit a view of new_df (SettingWithCopyWarning).
	filtered_df = new_df.loc[keep_rows].copy()

	# Explicit integer dtype: the labels are later turned into tensors.
	filtered_df['Severity Label'] = numeric_labels[keep_rows].astype(int)

	# The descriptions are later passed to the tokenizer as text; make sure
	# cells that Excel stored as numbers are strings as well.
	filtered_df['Short Description'] = filtered_df['Short Description'].astype(str)


	####

	# Training hyper-parameters
	epochs = 5
	batch_size = 32
	max_len = 100 # Max sequence length


	# Metrics collected per held-out project (filled in by the loop below)
	evaluation_results = {}

	# Load the pretrained XLNet tokenizer (the model itself is created later)
	tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

	# Dataset wrapper pairing tokenizer output with labels. Declared once,
	# ahead of the loop, because it does not depend on the current project.
	class CustomDataset(torch.utils.data.Dataset):
	    """Serve tokenized examples and their labels as tensors.

	    Args:
	        encodings: mapping of field name -> per-example sequences, as
	            returned by the tokenizer call below.
	        labels: sequence of class ids, one per example.
	    """

	    def __init__(self, encodings, labels):
	        self.encodings = encodings
	        self.labels = labels

	    def __getitem__(self, idx):
	        # One tensor per encoding field, plus the target under 'labels'.
	        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
	        item['labels'] = torch.tensor(self.labels[idx])
	        return item

	    def __len__(self):
	        return len(self.labels)

	# Iterate through each unique project as the test set; all remaining
	# projects form the training set for that round.
	for test_project in filtered_df['Project'].unique():

	    # Select data for the current test project
	    test_data = filtered_df[filtered_df['Project'] == test_project]
	    train_data = filtered_df[filtered_df['Project'] != test_project]

	    # With a single project in the data there is nothing to train on.
	    if train_data.empty:
	        print(f"Skipping project '{test_project}': no training data from other projects")
	        continue

	    # Reinitialize the model for each test project so every round starts
	    # from the same pretrained weights (num_labels=2: binary classification).
	    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

	    # Split train and test data into plain Python lists
	    train_texts = train_data['Short Description'].tolist()
	    train_labels = train_data['Severity Label'].tolist()
	    test_texts = test_data['Short Description'].tolist()
	    test_labels = test_data['Severity Label'].tolist()

	    # Tokenize train and test data, padded/truncated to max_len
	    train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=max_len)
	    test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=max_len)

	    # Create PyTorch datasets
	    train_dataset = CustomDataset(train_encodings, train_labels)
	    test_dataset = CustomDataset(test_encodings, test_labels)

	    # Define training arguments
	    training_args = TrainingArguments(
	        output_dir='./results',                  # output directory
	        num_train_epochs=epochs,                 # total number of training epochs
	        per_device_train_batch_size=batch_size,  # batch size per device during training
	        per_device_eval_batch_size=batch_size,   # batch size for evaluation
	        warmup_steps=500,                        # number of warmup steps for learning rate scheduler
	        weight_decay=0.01,                       # strength of weight decay
	        logging_dir='./logs',                    # directory for storing logs
	    )

	    # Define trainer
	    trainer = Trainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	    )

	    # Train the model
	    trainer.train()

	    # Save the model to Google Drive. A '/' inside the project name would
	    # otherwise be interpreted as a directory separator.
	    safe_project = str(test_project).replace('/', '_')
	    model_save_path = f'/content/drive/My Drive/XLNet_model_project_{safe_project}.pt'
	    torch.save(model.state_dict(), model_save_path)
	    print(f"Model saved to '{model_save_path}'")

	    # Evaluate the model: pick the highest-scoring class per example
	    predictions = trainer.predict(test_dataset)
	    preds = np.argmax(predictions.predictions, axis=1)

	    # Calculate evaluation metrics. zero_division=0 keeps the score at 0
	    # (without a warning) when a class is never predicted or never present.
	    accuracy = accuracy_score(test_labels, preds)
	    precision = precision_score(test_labels, preds, zero_division=0)
	    recall = recall_score(test_labels, preds, zero_division=0)
	    f1 = f1_score(test_labels, preds, zero_division=0)
	    mcc = matthews_corrcoef(test_labels, preds)
	    # Fixed label order so the matrix is always 2x2, even when the
	    # held-out project contains only one of the two classes.
	    conf_matrix = confusion_matrix(test_labels, preds, labels=[0, 1])

	    # Store evaluation results for the current test project
	    evaluation_results[test_project] = {
	        'Accuracy': accuracy,
	        'Precision': precision,
	        'Recall': recall,
	        'F1-score': f1,
	        'MCC': mcc,
	        'Confusion Matrix': conf_matrix,
	    }

	# Dump the collected metrics to stdout, one section per held-out project
	for project, results in evaluation_results.items():
	    print(f"Evaluation results for Test Project '{project}':")
	    for metric, value in results.items():
	        if metric == 'Confusion Matrix':
	            # The matrix spans several lines, so it goes below its heading
	            print(f"{metric}:")
	            print(value)
	        else:
	            print(f"{metric}: {value}")
	    print("------------------------------")

	# One row per project, one column per metric
	df_results = pd.DataFrame.from_dict(evaluation_results, orient='index')

	# Persist the table as an Excel workbook on Google Drive
	excel_file_name = '/content/drive/My Drive/evaluation_results_XLNet.xlsx'
	df_results.to_excel(excel_file_name)
	print(f"Results saved to '{excel_file_name}'")