# reshark / dashboard / modules / classification.py
# Source: Hugging Face Hub (kbberendsen), commit 320450f — "update docker and data locations" (7.45 kB)
# Importing required packages
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for final models.
# NOTE: .isin() already yields a boolean mask, so the redundant '== True'
# comparison was dropped.
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]
# Clean text
def clean_text(text):
    """Normalize a string: keep only alphanumeric characters, collapse
    whitespace runs, and lowercase the result."""
    # Replace every run of non-alphanumeric characters with one space
    alnum_only = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # split()/join collapses repeated spaces and trims both ends
    return ' '.join(alnum_only.split()).lower()
def combine_text(df_single_lead):
    """Build a cleaned 'combined_text' feature column from the position,
    sector and company-size columns.

    Works on a copy so the caller's DataFrame — typically a slice of the
    module-level test frame — is never mutated in place, which avoids pandas
    SettingWithCopy warnings and surprising side effects.

    Returns the copy with the added 'combined_text' column.
    """
    df_single_lead = df_single_lead.copy()
    # Cast to string so numeric / missing values concatenate safely
    for col in ('current_position', 'industry_sector', 'n_employees'):
        df_single_lead[col] = df_single_lead[col].astype('str')
    # Combine the text columns into a single feature string
    df_single_lead['combined_text'] = (
        df_single_lead['current_position'] + ' '
        + df_single_lead['industry_sector'] + ' '
        + df_single_lead['n_employees'] + ' employees'
    )
    # Normalize the combined text (lowercase, alphanumerics only)
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead
# Function to test model performance
def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    """Run batched inference with a sequence-classification model.

    Parameters
    ----------
    model : callable returning an object with a ``.logits`` tensor
        (e.g. a transformers ``AutoModelForSequenceClassification``).
    tokenizer : callable producing ``input_ids`` / ``attention_mask`` tensors.
    X_test : pandas Series of input texts.
    y_test : pandas Series of true labels. Unused; kept for backward
        compatibility with existing callers.
    batch_size : number of texts encoded per forward pass, to bound memory.

    Returns
    -------
    (predicted_labels, predicted_probabilities) where labels is a flat list of
    argmax class indices and probabilities is a list of per-class softmax
    lists, both aligned with ``X_test``.
    """
    text_test = X_test.to_list()
    num_samples = len(text_test)
    # Ceiling division: number of batches needed to cover all samples
    num_batches = (num_samples + batch_size - 1) // batch_size
    predicted_labels_test = []
    predicted_proba_test = []
    # Inference only: disable gradient tracking so no autograd graph is built,
    # saving memory and compute.
    with torch.no_grad():
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, num_samples)
            batch_text = text_test[start_idx:end_idx]
            # Encode the batch as padded/truncated PyTorch tensors
            encoded_data = tokenizer(batch_text, padding=True, truncation=True,
                                     return_tensors='pt')
            # Forward pass through the model
            logits = model(encoded_data['input_ids'],
                           attention_mask=encoded_data['attention_mask']).logits
            # Argmax over classes gives the predicted labels for the batch
            predicted_labels_test.extend(logits.argmax(dim=1).tolist())
            # Softmax turns logits into per-class probabilities
            predicted_proba_test.extend(F.softmax(logits, dim=-1).tolist())
    return predicted_labels_test, predicted_proba_test
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    """Return a dict of classification metrics (each rounded to 3 decimals):
    weighted F1, F1, accuracy, recall and precision of *predictions* against
    *true_labels*."""
    # zero_division=np.nan keeps recall/precision as NaN (instead of warning)
    # when there are no positive predictions or labels.
    return {
        'F1 weighted': round(f1_score(true_labels, predictions, average='weighted'), 3),
        'F1': round(f1_score(true_labels, predictions), 3),
        'Accuracy': round(accuracy_score(true_labels, predictions), 3),
        'Recall': round(recall_score(true_labels, predictions, zero_division=np.nan), 3),
        'Precision': round(precision_score(true_labels, predictions, zero_division=np.nan), 3),
    }
# Loading the tuned XGBoost model from its pickle artifact.
# NOTE(review): pickle.load executes arbitrary code from the file — safe only
# because these are locally produced model artifacts, never untrusted input.
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)
# Loading the fitted tf-idf vectorizer that pairs with the XGB model
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
# Loading the DistilBERT tokenizer (original comments had tokenizer/model swapped)
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')
# Loading the DistilBERT sequence-classification model (binary head)
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2', num_labels=2)
# Classify single lead data
def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
    """Classify lead(s) from a held-out test campaign and rank them.

    Parameters
    ----------
    CAMPAIGN_ID : campaign to select from the held-out test campaigns.
    LEAD_ID : lead to classify (ignored when ``full_campaign`` is True).
    proba_cutoff : percentage threshold (0-100); probabilities at or above it
        are predicted as class 1 (selected).
    model_type : 'XGB' (tf-idf + XGBoost) or 'BERT' (DistilBERT).
    full_campaign : classify the whole campaign instead of a single lead.

    Returns
    -------
    (df, df_123, df_performance_metrics) : ranked prediction table, the top-3
    ranked leads that pass the cutoff, and a metrics table.

    Raises
    ------
    ValueError : if ``model_type`` is not 'XGB' or 'BERT'.
    """
    if full_campaign:
        # Select full campaign data (.copy() so later column assignments do
        # not hit a view of the module-level frame)
        df = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID].copy()
    else:
        # Select single lead data
        df = df_final_testing[(df_final_testing['campaign_id'] == CAMPAIGN_ID)
                              & (df_final_testing['lead_id'] == LEAD_ID)].copy()
    # True labels
    true_labels = df['employee_is_selected'].tolist()
    # Combine text columns into one cleaned feature string
    df = combine_text(df)
    # Vectorize text with the fitted tf-idf vectorizer (used by the XGB model)
    tfidf_matrix = vectorizer.transform(df['combined_text'])
    # Select model and compute probabilities of class 1 (selected)
    if model_type == 'XGB':
        model = xgb_model_tuned_2
        predictions = model.predict(tfidf_matrix)
        predictions_proba_1 = model.predict_proba(tfidf_matrix)[:, 1].tolist()
    elif model_type == 'BERT':
        predictions, predicted_test_proba = model_predict(
            model=distil_bert_model_tuned_2,
            tokenizer=distil_bert_tokenizer_tuned_2,
            X_test=df['combined_text'],
            y_test=df['employee_is_selected'])
        predictions_proba_1 = [proba[1] for proba in predicted_test_proba]
    else:
        # Fail fast with a clear message instead of a NameError further down
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'XGB' or 'BERT')")
    # Apply the probability cutoff to obtain hard 0/1 predictions
    cutoff_predictions = [1 if probability >= (proba_cutoff / 100) else 0
                          for probability in predictions_proba_1]
    # Build the ranked output table (sorted by probability, descending)
    df['cutoff_prediction'] = cutoff_predictions
    df['prediction_proba_1'] = predictions_proba_1
    df = df.sort_values(by='prediction_proba_1', ascending=False)
    df['ranking'] = range(1, len(df) + 1)
    df['prediction_proba_1'] = df['prediction_proba_1'].round(3)
    df = df[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector',
             'employee_is_selected', 'cutoff_prediction']].sort_values(
                 by='prediction_proba_1', ascending=False)
    # Top-3 ranked leads that also pass the cutoff
    df_123 = df[(df['ranking'].isin([1, 2, 3])) & (df['cutoff_prediction'] == 1)].sort_values(by='ranking')
    # Performance metrics of the cutoff predictions vs the true labels
    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])
    return df, df_123, df_performance_metrics