kbberendsen's picture
update support texts and light gbm model names
850b42f
raw
history blame contribute delete
9.22 kB
# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score
# Loading data
# NOTE(review): filename contains a typo ("assingment") — kept as-is to match
# the actual file on disk.
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')
# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
# Boolean mask keeps only rows belonging to the held-out campaigns
# (the `==True` comparison is redundant but left unchanged).
df_final_testing = df[df['campaign_id'].isin(campaign_ids)==True]
# Clean text
def clean_text(text):
    """Normalize raw text: drop non-alphanumerics, collapse whitespace, lowercase.

    Parameters
    ----------
    text : str
        Raw input string.

    Returns
    -------
    str
        Cleaned, lowercased string with single spaces between tokens.
    """
    # Every run of characters outside [a-zA-Z0-9] becomes a single space.
    alnum_only = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # split()/join collapses repeated spaces and trims leading/trailing ones.
    return ' '.join(alnum_only.split()).lower()
def combine_text(df_single_lead):
    """Build a cleaned 'combined_text' column from position, sector and headcount.

    Parameters
    ----------
    df_single_lead : pd.DataFrame
        Rows for one lead; must contain 'current_position', 'industry_sector'
        and 'n_employees' columns.

    Returns
    -------
    pd.DataFrame
        A copy of the input with an added, cleaned 'combined_text' column.
    """
    # Work on a copy so callers' slices of the original frame are not mutated
    # in place (the original triggered pandas SettingWithCopyWarning when
    # passed a view of df_final_testing). Both call sites reassign the return
    # value, so this stays backward compatible.
    df_single_lead = df_single_lead.copy()
    # Cast to str so NaN/numeric values concatenate safely.
    for col in ('current_position', 'industry_sector', 'n_employees'):
        df_single_lead[col] = df_single_lead[col].astype('str')
    # Combine text columns into one free-text feature.
    df_single_lead['combined_text'] = (
        df_single_lead['current_position'] + ' '
        + df_single_lead['industry_sector'] + ' '
        + df_single_lead['n_employees'] + ' employees'
    )
    # Clean text (apply takes the function directly; the lambda was redundant).
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    """Return a dict of classification metrics, each rounded to 3 decimals.

    Parameters
    ----------
    predictions : array-like
        Binary predicted labels.
    true_labels : array-like
        Binary ground-truth labels.

    Returns
    -------
    dict
        Keys: 'F1 weighted', 'F1', 'Accuracy', 'Recall', 'Precision'.
    """
    # zero_division=np.nan keeps recall/precision as NaN instead of raising a
    # warning/0.0 when a class has no predicted or true samples.
    return {
        'F1 weighted': round(f1_score(true_labels, predictions, average='weighted'), 3),
        'F1': round(f1_score(true_labels, predictions), 3),
        'Accuracy': round(accuracy_score(true_labels, predictions), 3),
        'Recall': round(recall_score(true_labels, predictions, zero_division=np.nan), 3),
        'Precision': round(precision_score(true_labels, predictions, zero_division=np.nan), 3),
    }
# Loading LGBM models
# NOTE: pickle.load executes arbitrary code from the file — only load model
# artifacts from trusted sources.
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)
with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)
with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)
# Loading LGBM vectorizer (TF-IDF fitted alongside model 1; shared by all models)
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
# Rank whole campaign (per lead group) to obtain max and min scores used for
# scaling prediction scores.
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    """Rank all employees of a campaign, grouped per lead, and evaluate the ranker.

    Predicts a relevance score for every employee of each lead group that has
    more than one employee, scales scores to [0, 1] with the campaign-wide
    min/max, thresholds them at rank_cutoff percent, and computes
    classification metrics plus the average NDCG@k over lead groups.

    Parameters
    ----------
    CAMPAIGN_ID : int
        Campaign to evaluate; must exist in df_final_testing.
    ranker : fitted LightGBM model, default lgbm_model_3
        Model whose .predict() produces one score per employee.
    rank_cutoff : int, default 50
        Threshold (0-100, percent of the scaled score) for the binary label.

    Returns
    -------
    tuple
        (max predicted score, min predicted score, metrics DataFrame).

    Raises
    ------
    ValueError
        If the campaign has no lead group with more than one employee
        (the original code crashed on max([]) in that case).
    """
    campaign_predictions = []
    campaign_true_labels = []
    campaign_ndcg_scores = []
    # k defaults to 3; initialised before the loop so the summary row below is
    # always defined (the original hit UnboundLocalError if no group qualified).
    k = 3

    campaign_data = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID]

    # Iterate over query groups (lead ids); ranking only makes sense for
    # groups with more than one employee.
    for lead_id in np.unique(campaign_data['lead_id']):
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]
        if len(single_lead_data) <= 1:
            continue
        single_lead_data = combine_text(single_lead_data)
        # Vectorize the combined text and predict relevance scores.
        single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
        single_lead_pred = ranker.predict(single_lead_tfidf)
        campaign_predictions.extend(single_lead_pred)
        campaign_true_labels.extend(single_lead_data['employee_is_selected'])
        # NDCG@k per lead group: k is 3 unless the group has fewer than 4 rows.
        k = len(single_lead_data) if len(single_lead_data) < 4 else 3
        ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']],
                               y_score=[single_lead_pred], k=k)
        campaign_ndcg_scores.append(ndcg_lead)

    if not campaign_predictions:
        raise ValueError(
            f'Campaign {CAMPAIGN_ID} has no lead group with more than one employee.')

    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)
    # Scale predicted scores to [0, 1] with the campaign-wide min/max; guard
    # against a zero range (all predictions identical) to avoid div-by-zero.
    score_range = campaign_predictions_max - campaign_predictions_min
    if score_range == 0:
        score_range = 1
    campaign_predictions_scaled = [
        (prediction - campaign_predictions_min) / score_range
        for prediction in campaign_predictions
    ]
    # Binarize with the rank_cutoff threshold (given in percent).
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0
                          for prediction in campaign_predictions_scaled]

    # Performance metrics from the binary cutoff predictions.
    performance_metrics = compute_metrics(true_labels=campaign_true_labels,
                                          predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(
        performance_metrics, orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    # Average NDCG across lead groups.
    # NOTE(review): the reported k is the last processed group's k, matching
    # the original implementation — confirm this is the intended summary value.
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = pd.DataFrame.from_dict(
        {'NDCG@k score': ndcg_avg, 'k': k}, orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])

    # Merge performance metrics and average NDCG score.
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg],
                                 ignore_index=True)
    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
# Rank single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    """Rank the employees of one lead and return display-ready DataFrames.

    Parameters
    ----------
    CAMPAIGN_ID : int
        Campaign the lead belongs to.
    LEAD_ID : int
        Lead (query group) to rank.
    rank_cutoff : int, default 50
        Threshold (0-100, percent of the scaled score) for the binary label.
    ranker : fitted LightGBM model or str, default lgbm_model_3
        Either a model object or a UI name ("Light GBM 1/2/3").

    Returns
    -------
    tuple
        (ranked lead DataFrame, top-3-above-cutoff DataFrame,
         NDCG DataFrame, campaign-level metrics DataFrame).
    """
    # The UI passes model names as strings; map them to the loaded models.
    if ranker == "Light GBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light GBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light GBM 3":
        ranker = lgbm_model_3

    # Selecting single lead data and combine text columns used for ranking.
    single_lead_data = df_final_testing[
        (df_final_testing['campaign_id'] == CAMPAIGN_ID)
        & (df_final_testing['lead_id'] == LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)

    # Preprocess the text features and predict scores for the single lead.
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred

    # k is 3 unless the lead has fewer than 4 employees.
    k = min(len(single_lead_data), 3)
    # BUGFIX: compute NDCG *before* sorting — the original paired the sorted
    # 'employee_is_selected' column with the unsorted prediction array,
    # misaligning labels and scores.
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']],
                            y_score=[single_lead_pred], k=k), 3)

    # Scale predicted scores to [0, 1] using the campaign-wide min/max.
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(
        CAMPAIGN_ID, ranker, rank_cutoff)
    score_range = campaign_max_value - campaign_min_value
    if score_range == 0:
        score_range = 1  # guard: all campaign scores identical
    single_lead_data['scaled_predicted_score'] = (
        (single_lead_data['predicted_score'] - campaign_min_value) / score_range)

    # Define binary predictions based on the rank_cutoff value (in percent).
    single_lead_data['cutoff_prediction'] = [
        1 if prediction >= (rank_cutoff / 100) else 0
        for prediction in single_lead_data['scaled_predicted_score']]

    # Rank employees (1 = best) and create the output dataframe.
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = range(1, len(single_lead_data) + 1)
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[[
        'ranking', 'scaled_predicted_score', 'current_position', 'industry_sector',
        'employee_is_selected', 'cutoff_prediction', 'predicted_score']]

    # Top-3 employees that also clear the cutoff.
    df_123 = single_lead_data[
        (single_lead_data['ranking'].isin([1, 2, 3]))
        & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')

    # NDCG summary table.
    df_ndcg = pd.DataFrame.from_dict({'NDCG@k score': ndcg, 'k': k},
                                     orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])

    return single_lead_data, df_123, df_ndcg, df_campaign_rank