# reshark / dashboard / modules / classification.py
# Source: Hugging Face Hub (kbberendsen), commit 320450f — "update docker and data locations" (7.45 kB)
# Importing required packages
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for final models.
# NOTE: .isin() already yields a boolean mask, so the redundant '== True'
# comparison was dropped.
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]
# Clean text
def clean_text(text):
    """Normalize a string: keep only alphanumeric characters, collapse
    whitespace runs, and lowercase the result."""
    # Replace every run of non-alphanumeric characters with one space
    alnum_only = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # split()/join collapses repeated spaces and trims both ends
    return ' '.join(alnum_only.split()).lower()
def combine_text(df_single_lead):
    """Build a cleaned 'combined_text' feature column from the position,
    sector and company-size columns.

    Works on a copy so the caller's DataFrame — typically a slice of the
    module-level test frame — is never mutated in place, which avoids pandas
    SettingWithCopy warnings and surprising side effects.

    Returns the copy with the added 'combined_text' column.
    """
    df_single_lead = df_single_lead.copy()
    # Cast to string so numeric / missing values concatenate safely
    for col in ('current_position', 'industry_sector', 'n_employees'):
        df_single_lead[col] = df_single_lead[col].astype('str')
    # Combine the text columns into a single feature string
    df_single_lead['combined_text'] = (
        df_single_lead['current_position'] + ' '
        + df_single_lead['industry_sector'] + ' '
        + df_single_lead['n_employees'] + ' employees'
    )
    # Normalize the combined text (lowercase, alphanumerics only)
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead
# Function to test model performance
def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    """Run batched inference with a sequence-classification model.

    Parameters
    ----------
    model : callable returning an object with a ``.logits`` tensor
        (e.g. a transformers ``AutoModelForSequenceClassification``).
    tokenizer : callable producing ``input_ids`` / ``attention_mask`` tensors.
    X_test : pandas Series of input texts.
    y_test : pandas Series of true labels. Unused; kept for backward
        compatibility with existing callers.
    batch_size : number of texts encoded per forward pass, to bound memory.

    Returns
    -------
    (predicted_labels, predicted_probabilities) where labels is a flat list of
    argmax class indices and probabilities is a list of per-class softmax
    lists, both aligned with ``X_test``.
    """
    text_test = X_test.to_list()
    num_samples = len(text_test)
    # Ceiling division: number of batches needed to cover all samples
    num_batches = (num_samples + batch_size - 1) // batch_size
    predicted_labels_test = []
    predicted_proba_test = []
    # Inference only: disable gradient tracking so no autograd graph is built,
    # saving memory and compute.
    with torch.no_grad():
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, num_samples)
            batch_text = text_test[start_idx:end_idx]
            # Encode the batch as padded/truncated PyTorch tensors
            encoded_data = tokenizer(batch_text, padding=True, truncation=True,
                                     return_tensors='pt')
            # Forward pass through the model
            logits = model(encoded_data['input_ids'],
                           attention_mask=encoded_data['attention_mask']).logits
            # Argmax over classes gives the predicted labels for the batch
            predicted_labels_test.extend(logits.argmax(dim=1).tolist())
            # Softmax turns logits into per-class probabilities
            predicted_proba_test.extend(F.softmax(logits, dim=-1).tolist())
    return predicted_labels_test, predicted_proba_test
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    """Return a dict of classification metrics (each rounded to 3 decimals):
    weighted F1, F1, accuracy, recall and precision of *predictions* against
    *true_labels*."""
    # zero_division=np.nan keeps recall/precision as NaN (instead of warning)
    # when there are no positive predictions or labels.
    return {
        'F1 weighted': round(f1_score(true_labels, predictions, average='weighted'), 3),
        'F1': round(f1_score(true_labels, predictions), 3),
        'Accuracy': round(accuracy_score(true_labels, predictions), 3),
        'Recall': round(recall_score(true_labels, predictions, zero_division=np.nan), 3),
        'Precision': round(precision_score(true_labels, predictions, zero_division=np.nan), 3),
    }
# Loading the tuned XGBoost model from its pickle artifact.
# NOTE(review): pickle.load executes arbitrary code from the file — safe only
# because these are locally produced model artifacts, never untrusted input.
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)
# Loading the fitted tf-idf vectorizer that pairs with the XGB model
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
# Loading the DistilBERT tokenizer (original comments had tokenizer/model swapped)
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')
# Loading the DistilBERT sequence-classification model (binary head)
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2', num_labels=2)
# Classify single lead data
def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
    """Classify lead(s) from a held-out test campaign and rank them.

    Parameters
    ----------
    CAMPAIGN_ID : campaign to select from the held-out test campaigns.
    LEAD_ID : lead to classify (ignored when ``full_campaign`` is True).
    proba_cutoff : percentage threshold (0-100); probabilities at or above it
        are predicted as class 1 (selected).
    model_type : 'XGB' (tf-idf + XGBoost) or 'BERT' (DistilBERT).
    full_campaign : classify the whole campaign instead of a single lead.

    Returns
    -------
    (df, df_123, df_performance_metrics) : ranked prediction table, the top-3
    ranked leads that pass the cutoff, and a metrics table.

    Raises
    ------
    ValueError : if ``model_type`` is not 'XGB' or 'BERT'.
    """
    if full_campaign:
        # Select full campaign data (.copy() so later column assignments do
        # not hit a view of the module-level frame)
        df = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID].copy()
    else:
        # Select single lead data
        df = df_final_testing[(df_final_testing['campaign_id'] == CAMPAIGN_ID)
                              & (df_final_testing['lead_id'] == LEAD_ID)].copy()
    # True labels
    true_labels = df['employee_is_selected'].tolist()
    # Combine text columns into one cleaned feature string
    df = combine_text(df)
    # Vectorize text with the fitted tf-idf vectorizer (used by the XGB model)
    tfidf_matrix = vectorizer.transform(df['combined_text'])
    # Select model and compute probabilities of class 1 (selected)
    if model_type == 'XGB':
        model = xgb_model_tuned_2
        predictions = model.predict(tfidf_matrix)
        predictions_proba_1 = model.predict_proba(tfidf_matrix)[:, 1].tolist()
    elif model_type == 'BERT':
        predictions, predicted_test_proba = model_predict(
            model=distil_bert_model_tuned_2,
            tokenizer=distil_bert_tokenizer_tuned_2,
            X_test=df['combined_text'],
            y_test=df['employee_is_selected'])
        predictions_proba_1 = [proba[1] for proba in predicted_test_proba]
    else:
        # Fail fast with a clear message instead of a NameError further down
        raise ValueError(f"Unknown model_type: {model_type!r} (expected 'XGB' or 'BERT')")
    # Apply the probability cutoff to obtain hard 0/1 predictions
    cutoff_predictions = [1 if probability >= (proba_cutoff / 100) else 0
                          for probability in predictions_proba_1]
    # Build the ranked output table (sorted by probability, descending)
    df['cutoff_prediction'] = cutoff_predictions
    df['prediction_proba_1'] = predictions_proba_1
    df = df.sort_values(by='prediction_proba_1', ascending=False)
    df['ranking'] = range(1, len(df) + 1)
    df['prediction_proba_1'] = df['prediction_proba_1'].round(3)
    df = df[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector',
             'employee_is_selected', 'cutoff_prediction']].sort_values(
                 by='prediction_proba_1', ascending=False)
    # Top-3 ranked leads that also pass the cutoff
    df_123 = df[(df['ranking'].isin([1, 2, 3])) & (df['cutoff_prediction'] == 1)].sort_values(by='ranking')
    # Performance metrics of the cutoff predictions vs the true labels
    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])
    return df, df_123, df_performance_metrics