# Importing required packages
import pickle
import re

import numpy as np
import pandas as pd
import lightgbm as lgbm
from sklearn.metrics import (accuracy_score, f1_score, ndcg_score,
                             precision_score, recall_score)

# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for the final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)].copy()


# Clean text
def clean_text(text):
    # Use a regular expression to replace non-alphanumeric characters with spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())
    # Lowercase the text
    cleaned_text = cleaned_text.lower()
    return cleaned_text


# Combine the text columns used for ranking into a single cleaned string
def combine_text(df_single_lead):
    # Work on a copy to avoid SettingWithCopyWarning on filtered slices
    df_single_lead = df_single_lead.copy()
    # Changing column types
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')
    # Combine text columns
    df_single_lead['combined_text'] = (df_single_lead['current_position'] + ' '
                                       + df_single_lead['industry_sector'] + ' '
                                       + df_single_lead['n_employees'] + ' employees')
    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead


# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
    }
    return performance_metrics


# Loading LGBM models
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)
with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)
with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)

# Loading the TF-IDF vectorizer used by the LGBM models
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)
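
# Illustrative example of the text preprocessing (the row values below are
# hypothetical; this is only a sketch of what combine_text()/clean_text()
# produce before the TF-IDF step):
example_row = pd.DataFrame({
    'current_position': ['Senior Data Engineer'],
    'industry_sector': ['IT & Services'],
    'n_employees': [250],
})
print(combine_text(example_row)['combined_text'].iloc[0])
# -> 'senior data engineer it services 250 employees'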

# Rank a whole campaign (per lead group) to obtain the max and min scores used
# for scaling prediction scores. Also used to properly test a model on the
# test set by calculating scores per query group.
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    # Create empty lists to store predictions, true labels and NDCG scores
    # for each query group (lead_id groups)
    campaign_predictions = []
    campaign_true_labels = []
    campaign_ndcg_scores = []

    campaign_data = df_final_testing[df_final_testing['campaign_id'] == CAMPAIGN_ID]
    query_group_ids = campaign_data['lead_id']

    # Iterate over query groups (in this case lead ids)
    lead_ids = np.unique(query_group_ids)
    for lead_id in lead_ids:
        # Filter the data for the specific lead_id
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]
        # Only predict a ranking if the lead contains more than 1 employee
        if len(single_lead_data) > 1:
            single_lead_data = combine_text(single_lead_data)
            # Preprocess the text features for the single lead
            single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])
            # Predict single lead scores
            single_lead_pred = ranker.predict(single_lead_tfidf)
            # Store predictions and true labels
            campaign_predictions.extend(single_lead_pred)
            campaign_true_labels.extend(single_lead_data['employee_is_selected'])
            # Store the lead NDCG score; k is 3 unless the lead has fewer than 4 employees
            if len(single_lead_data) < 4:
                k = len(single_lead_data)
            else:
                k = 3
            ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']],
                                   y_score=[single_lead_pred], k=k)
            campaign_ndcg_scores.append(ndcg_lead)

    # Get max and min value of the campaign prediction scores
    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)

    # Scale predicted scores between 0 and 1 using the max and min predicted
    # scores of the whole campaign
    campaign_predictions_scaled = [(prediction - campaign_predictions_min)
                                   / (campaign_predictions_max - campaign_predictions_min)
                                   for prediction in campaign_predictions]

    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0
                          for prediction in campaign_predictions_scaled]

    # Get performance metrics using the binary cutoff_predictions
    performance_metrics = compute_metrics(true_labels=campaign_true_labels,
                                          predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics,
                                                    orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    # Get the average NDCG score (k reflects the last lead that was ranked)
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
    df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg,
                                              orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])

    # Merge performance metrics and the average NDCG score
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg],
                                 ignore_index=True)

    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
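
# Quick sanity check (illustrative usage; assumes the parquet file and pickled
# models above are available). Campaign 8 is one of the held-out campaign_ids;
# the returned frame holds the classification metrics plus the average NDCG@k.
_max_score, _min_score, _df_rank = rank_campaign(CAMPAIGN_ID=8,
                                                 ranker=lgbm_model_3,
                                                 rank_cutoff=50)
print(_df_rank)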

# Rank a single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    # Allow the ranker to be passed as a model name string (e.g. from a UI dropdown)
    if ranker == "Light GBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light GBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light GBM 3":
        ranker = lgbm_model_3

    # Selecting the single lead data and combining the text columns used for ranking
    single_lead_data = df_final_testing[(df_final_testing['campaign_id'] == CAMPAIGN_ID)
                                        & (df_final_testing['lead_id'] == LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)

    # Preprocess the text features for the single lead
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])

    # Predict single lead scores
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred

    # Scale predicted scores between 0 and 1 using the max and min predicted
    # scores of the whole campaign
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
    single_lead_data['scaled_predicted_score'] = ((single_lead_data['predicted_score'] - campaign_min_value)
                                                  / (campaign_max_value - campaign_min_value))

    # Define binary predictions based on the rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff / 100) else 0
                          for prediction in single_lead_data['scaled_predicted_score']]
    single_lead_data['cutoff_prediction'] = cutoff_predictions

    # k is 3 unless the lead has fewer than 4 employees
    if len(single_lead_data) < 4:
        k = len(single_lead_data)
    else:
        k = 3

    # Compute the NDCG score before sorting, so the true labels and the
    # predicted scores stay aligned row for row
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']],
                            y_score=[single_lead_pred], k=k), 3)
    df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
    df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])

    # Rank employees (1 = highest predicted score) and create the output dataframe
    ranked_list = [i + 1 for i in range(len(single_lead_data))]
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = ranked_list
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position',
                                         'industry_sector', 'employee_is_selected',
                                         'cutoff_prediction', 'predicted_score']]

    # Top 3 dataframe: the highest-ranked employees that also pass the cutoff
    df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3]))
                              & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')

    return single_lead_data, df_123, df_ndcg, df_campaign_rank
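
# Example usage (illustrative): rank the employees of one lead in a held-out
# campaign. The campaign id comes from campaign_ids above; the lead id is
# picked programmatically because the actual lead ids are data-dependent.
example_campaign_id = 8
lead_sizes = df_final_testing.loc[df_final_testing['campaign_id'] == example_campaign_id,
                                  'lead_id'].value_counts()
example_lead_id = lead_sizes[lead_sizes > 1].index[0]  # a lead with more than one employee

ranked_employees, top_3, df_ndcg, df_campaign_metrics = rank_single_lead(
    CAMPAIGN_ID=example_campaign_id,
    LEAD_ID=example_lead_id,
    rank_cutoff=50,        # scaled scores >= 0.50 are predicted as selected
    ranker="Light GBM 3",
)
print(top_3)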