# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score
# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')
# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]
# Clean text
def clean_text(text):
    # Use a regular expression to replace non-alphanumeric characters with spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())
    # Lowercase the text
    cleaned_text = cleaned_text.lower()
    return cleaned_text
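# Illustrative behaviour of clean_text (hypothetical input string):
# clean_text('Senior Engineer @ Acme, Inc.') -> 'senior engineer acme inc'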
def combine_text(df_single_lead):
    # Work on a copy so the slice passed in is not modified in place (avoids pandas SettingWithCopyWarning)
    df_single_lead = df_single_lead.copy()
    # Changing column types to string
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')
    # Combine text columns
    df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'
    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead
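# Illustrative result (hypothetical row): current_position='Data Scientist', industry_sector='Tech',
# n_employees='250' yields combined_text 'data scientist tech 250 employees'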
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision
    }
    return performance_metrics
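# Illustrative call with hypothetical labels:
# compute_metrics(predictions=[1, 0, 1], true_labels=[1, 0, 0])
# -> {'F1 weighted': 0.667, 'F1': 0.667, 'Accuracy': 0.667, 'Recall': 1.0, 'Precision': 0.5}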
# Loading LGBM models
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)
with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)
with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)
# Loading the text vectorizer (TF-IDF) used with the LGBM models
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
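# Quick sanity check of the loaded artifacts (illustrative sketch; the input string below is hypothetical):
# features = vectorizer.transform(['data scientist tech 250 employees'])  # sparse TF-IDF matrix with one row
# lgbm_model_3.predict(features)  # array with one ranking score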
# Rank a whole campaign (per lead group) to obtain the max and min scores used for scaling prediction scores
# Function to properly test a model on the test set by calculating a score per query group
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    # Create empty lists to store predictions and true labels for each query group (lead id groups)
    campaign_predictions = []
    campaign_true_labels = []
    campaign_ndcg_scores = []
    campaign_data = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID]
    query_group_ids = campaign_data['lead_id']
    # Iterate over query groups (in this case lead ids)
    lead_ids = np.unique(query_group_ids)
    for lead_id in lead_ids:
        # Filter the data for the specific lead_id
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]
        # Only predict a ranking if the lead contains more than 1 employee
        if len(single_lead_data) > 1:
            single_lead_data = combine_text(single_lead_data)
            # Preprocess the text features for the single lead
            single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
            # Predict single lead scores
            single_lead_pred = ranker.predict(single_lead_tfidf)
            # Store predictions and true labels
            campaign_predictions.extend(single_lead_pred)
            campaign_true_labels.extend(single_lead_data['employee_is_selected'])
            # Store lead NDCG score
            # k is 3 unless the single lead data has fewer than 4 items
            if len(single_lead_data) < 4:
                k = len(single_lead_data)
            else:
                k = 3
            ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k)
            campaign_ndcg_scores.append(ndcg_lead)
    # Get max and min value of campaign prediction scores
    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)
    # Scale predicted scores between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_predictions_scaled = [(prediction - campaign_predictions_min) / (campaign_predictions_max - campaign_predictions_min) for prediction in campaign_predictions]
    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0 for prediction in campaign_predictions_scaled]
    # Get performance metrics using the binary cutoff_predictions
    performance_metrics = compute_metrics(true_labels=campaign_true_labels, predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])
    # Get average NDCG score (the reported k is the k of the last lead group processed)
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
    df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg, orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])
    # Merge performance metrics and average NDCG score
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg], ignore_index=True)
    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
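# Example usage (illustrative; campaign 8 is one of the held-out test campaigns, the other arguments are the defaults):
# max_score, min_score, df_campaign_metrics = rank_campaign(CAMPAIGN_ID=8, ranker=lgbm_model_3, rank_cutoff=50)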
# Rank a single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    # Map model names (as passed from the app interface) to the loaded models
    if ranker == "Light GBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light GBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light GBM 3":
        ranker = lgbm_model_3
    # Select single lead data and combine the text columns used for ranking
    single_lead_data = df_final_testing[(df_final_testing['campaign_id'] == CAMPAIGN_ID) & (df_final_testing['lead_id'] == LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)
    # Preprocess the text features for the single lead
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
    # Predict single lead scores
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred
    # Scale the predicted score between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
    single_lead_data['scaled_predicted_score'] = (single_lead_data['predicted_score'] - campaign_min_value) / (campaign_max_value - campaign_min_value)
    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0 for prediction in single_lead_data['scaled_predicted_score']]
    single_lead_data['cutoff_prediction'] = cutoff_predictions
    # Rank employees (highest predicted score first) and create the output dataframe
    ranked_list = [i + 1 for i in range(len(single_lead_data['predicted_score']))]
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = ranked_list
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction', 'predicted_score']]
    # Top 3 dataframe: top-ranked employees that also pass the cutoff
    df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3])) & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')
    # k is 3 unless the single lead data has fewer than 4 items
    if len(single_lead_data) < 4:
        k = len(single_lead_data)
    else:
        k = 3
    # Compute NDCG score; use the scores from the re-sorted dataframe so y_true and y_score stay row-aligned
    # (the scores were rounded to 3 decimals above, which in practice should not change the ranking)
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_data['predicted_score']], k=k), 3)
    df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
    df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])
    return single_lead_data, df_123, df_ndcg, df_campaign_rank
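# Minimal usage sketch (illustrative; campaign 8 is a held-out test campaign, the lead id below is a hypothetical placeholder):
# ranked_employees, df_top3, df_ndcg, df_campaign_rank = rank_single_lead(CAMPAIGN_ID=8, LEAD_ID=42, rank_cutoff=50, ranker='Light GBM 3')
# print(ranked_employees.head())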