# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score
# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for the final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]
# Clean text
def clean_text(text):
    # Use a regular expression to remove non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())
    # Lowercase the text
    cleaned_text = cleaned_text.lower()
    return cleaned_text
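# Illustrative example (hypothetical input):
# clean_text('VP, Sales & Marketing @ Acme!') -> 'vp sales marketing acme'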
# Combine the text columns used for ranking into a single cleaned string per row
def combine_text(df_single_lead):
    # Work on a copy to avoid pandas SettingWithCopyWarning when a sliced frame is passed in
    df_single_lead = df_single_lead.copy()
    # Changing column types
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')
    # Combine text columns
    df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'
    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead
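# Illustrative example (hypothetical values): a row with current_position='Senior Engineer',
# industry_sector='Technology' and n_employees=500 gets the combined_text
# 'senior engineer technology 500 employees'.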
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision
    }
    return performance_metrics
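# Illustrative example (hypothetical labels and predictions):
# compute_metrics(predictions=[1, 0, 1, 0], true_labels=[1, 0, 0, 0])
# -> {'F1 weighted': 0.767, 'F1': 0.667, 'Accuracy': 0.75, 'Recall': 1.0, 'Precision': 0.5}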
# Loading LGBM models
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)
with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)
with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)

# Loading LGBM vectorizer
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
# Rank a whole campaign (per lead group) to obtain the max and min scores used for scaling prediction scores
# Function to properly test a model on the test set by calculating a score per group
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    # Create empty lists to store predictions and true labels for each query group (lead id groups)
    campaign_predictions = []
    campaign_true_labels = []
    campaign_ndcg_scores = []
    campaign_data = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID]
    query_group_ids = campaign_data['lead_id']
    # Iterate over query groups (in this case lead ids)
    lead_ids = np.unique(query_group_ids)
    for lead_id in lead_ids:
        # Filter the data for the specific lead_id
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]
        # Only predict a ranking if the lead contains more than 1 employee
        if len(single_lead_data) > 1:
            single_lead_data = combine_text(single_lead_data)
            # Preprocess the text features for the single lead
            single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
            # Predict single lead scores
            single_lead_pred = ranker.predict(single_lead_tfidf)
            # Store predictions and true labels
            campaign_predictions.extend(single_lead_pred)
            campaign_true_labels.extend(single_lead_data['employee_is_selected'])
            # Store lead NDCG score
            # k is 3 unless the single lead data has fewer than 4 items
            if len(single_lead_data) < 4:
                k = len(single_lead_data)
            else:
                k = 3
            ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k)
            campaign_ndcg_scores.append(ndcg_lead)
    # Get max and min value of campaign prediction scores
    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)
    # Scale predicted scores between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_predictions_scaled = [(prediction - campaign_predictions_min) / (campaign_predictions_max - campaign_predictions_min) for prediction in campaign_predictions]
    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0 for prediction in campaign_predictions_scaled]
    # Get performance metrics using the binary cutoff_predictions
    performance_metrics = compute_metrics(true_labels=campaign_true_labels, predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])
    # Get average NDCG score (k here is the value from the last lead group processed, typically 3)
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
    df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg, orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])
    # Merge performance metrics and average NDCG score
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg], ignore_index=True)
    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
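# Illustrative call (sketch): campaign 8 is one of the held-out test campaigns defined above.
# max_score, min_score, df_metrics = rank_campaign(CAMPAIGN_ID=8, ranker=lgbm_model_3, rank_cutoff=50)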
# Rank a single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    # Map model names (e.g. from a UI dropdown) to the loaded models
    if ranker == "Light GBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light GBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light GBM 3":
        ranker = lgbm_model_3
    # Select the single lead data and combine the text columns used for ranking
    single_lead_data = df_final_testing[(df_final_testing['campaign_id'] == CAMPAIGN_ID) & (df_final_testing['lead_id'] == LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)
    # Preprocess the text features for the single lead
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
    # Predict single lead scores
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred
    # Scale predicted scores between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
    single_lead_data['scaled_predicted_score'] = (single_lead_data['predicted_score'] - campaign_min_value) / (campaign_max_value - campaign_min_value)
    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0 for prediction in single_lead_data['scaled_predicted_score']]
    single_lead_data['cutoff_prediction'] = cutoff_predictions
    # Rank employees and create the output dataframe
    ranked_list = [i + 1 for i in range(len(single_lead_data['predicted_score']))]
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = ranked_list
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction', 'predicted_score']]
    # Top 3 dataframe
    df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3])) & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')
    # k is 3 unless the single lead data has fewer than 4 items
    if len(single_lead_data) < 4:
        k = len(single_lead_data)
    else:
        k = 3
    # Compute NDCG score; take the scores from the sorted dataframe so true labels and scores stay aligned
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_data['predicted_score']], k=k), 3)
    df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
    df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])
    return single_lead_data, df_123, df_ndcg, df_campaign_rank
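
# Illustrative usage sketch (assumes the parquet data and pickled models above are available):
# pick an arbitrary multi-employee lead from held-out campaign 123 and rank it.
if __name__ == "__main__":
    lead_counts = df_final_testing.loc[df_final_testing['campaign_id'] == 123, 'lead_id'].value_counts()
    example_lead_id = lead_counts[lead_counts > 1].index[0]  # hypothetical example lead
    lead_ranking, df_top3, df_ndcg, df_campaign_rank = rank_single_lead(
        CAMPAIGN_ID=123, LEAD_ID=example_lead_id, rank_cutoff=50, ranker="Light GBM 3")
    print(lead_ranking)
    print(df_campaign_rank)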