# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, ndcg_score


# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')


# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]


# Clean text
def clean_text(text):
    # Use a regular expression to replace non-alphanumeric characters with spaces
    cleaned_text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    # Remove multiple consecutive spaces and leading/trailing spaces
    cleaned_text = ' '.join(cleaned_text.split())

    # Lowercase the text
    cleaned_text = cleaned_text.lower()

    return cleaned_text
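
# Illustrative example of clean_text (the input string below is made up):
#   clean_text('Sr. Software Engineer / AI & ML') -> 'sr software engineer ai ml'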


def combine_text(df_single_lead):
    # Work on a copy so that slicing upstream does not trigger SettingWithCopy warnings
    df_single_lead = df_single_lead.copy()

    # Cast columns to string so they can be concatenated
    df_single_lead['current_position'] = df_single_lead['current_position'].astype('str')
    df_single_lead['industry_sector'] = df_single_lead['industry_sector'].astype('str')
    df_single_lead['n_employees'] = df_single_lead['n_employees'].astype('str')

    # Combine text columns
    df_single_lead['combined_text'] = df_single_lead['current_position'] + ' ' + df_single_lead['industry_sector'] + ' ' + df_single_lead['n_employees'] + ' employees'

    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    
    return df_single_lead
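
# Illustrative example of combine_text (column values are made up): a row with
# current_position='Head of Sales', industry_sector='Retail' and n_employees=250
# ends up with combined_text = 'head of sales retail 250 employees'.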


# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    f1_weighted = round(f1_score(true_labels, predictions, average='weighted'), 3)
    f1 = round(f1_score(true_labels, predictions), 3)
    accuracy = round(accuracy_score(true_labels, predictions), 3)
    recall = round(recall_score(true_labels, predictions, zero_division=np.nan), 3)
    precision = round(precision_score(true_labels, predictions, zero_division=np.nan), 3)
    performance_metrics = {
        'F1 weighted': f1_weighted,
        'F1': f1,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision
        }
    
    return performance_metrics
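
# Illustrative sketch of compute_metrics on made-up binary labels:
#   compute_metrics(predictions=[1, 0, 1, 1], true_labels=[1, 0, 0, 1])
#   -> {'F1 weighted': 0.733, 'F1': 0.8, 'Accuracy': 0.75, 'Recall': 1.0, 'Precision': 0.667}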


# Loading LGBM models
with open('models/lgbm_model_1/lgbm_model_1.pkl', 'rb') as model_file:
    lgbm_model_1 = pickle.load(model_file)

with open('models/lgbm_model_2/lgbm_model_2.pkl', 'rb') as model_file:
    lgbm_model_2 = pickle.load(model_file)

with open('models/lgbm_model_3/lgbm_model_3.pkl', 'rb') as model_file:
    lgbm_model_3 = pickle.load(model_file)

# Loading LGBM vectorizer
with open('models/lgbm_model_1/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)


# Rank a whole campaign (per lead group) to obtain the max and min scores used for scaling prediction scores
# This also serves to test a model on the test set by calculating scores per query group
def rank_campaign(CAMPAIGN_ID, ranker=lgbm_model_3, rank_cutoff=50):
    # Create empty lists to store predictions and true labels for each query group (lead id groups)
    campaign_predictions = []
    campaign_predictions_cutoff = []
    campaign_true_labels = []
    campaign_ndcg_scores = []
    
    campaign_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID)]
    query_group_ids = campaign_data['lead_id']

    # Iterate over query groups (in this case lead ids)
    lead_ids = np.unique(query_group_ids)
    for lead_id in lead_ids:
        # Filter the data for the specific lead_id
        single_lead_data = campaign_data[campaign_data['lead_id'] == lead_id]

        # Only predict a ranking if the lead contains more than 1 employee
        if len(single_lead_data) > 1:
            single_lead_data = combine_text(single_lead_data)

            # Preprocess the text features for the single lead
            single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])

            # Predict single lead scores
            single_lead_pred = ranker.predict(single_lead_tfidf)

            # Store predictions and true labels
            campaign_predictions.extend(single_lead_pred)
            campaign_true_labels.extend(single_lead_data['employee_is_selected'])

            # Store lead NDCG score; k is 3 unless the lead has fewer than 4 employees
            k = min(len(single_lead_data), 3)
            ndcg_lead = ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k)
            campaign_ndcg_scores.append(ndcg_lead)

    # Get max and min value of campaign prediction scores
    campaign_predictions_max = max(campaign_predictions)
    campaign_predictions_min = min(campaign_predictions)

    # Scale predicted score between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_predictions_scaled = [(prediction - campaign_predictions_min) / (campaign_predictions_max - campaign_predictions_min) for prediction in campaign_predictions]
    
    # Define binary predictions based on rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in campaign_predictions_scaled]

    # Get performance metrics using binary cutoff_predictions
    performance_metrics = compute_metrics(true_labels=campaign_true_labels, predictions=cutoff_predictions)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Value'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    # Get average NDCG score
    # Note: the reported k comes from the last lead processed (3 unless that lead had fewer than 4 employees)
    ndcg_avg = round(sum(campaign_ndcg_scores) / len(campaign_ndcg_scores), 3)
    df_campaign_ndcg = {'NDCG@k score': ndcg_avg, 'k': k}
    df_campaign_ndcg = pd.DataFrame.from_dict(data=df_campaign_ndcg, orient='index', columns=['Value'])
    df_campaign_ndcg.reset_index(inplace=True, names=['Metric'])

    # Merge performance metrics and average NDCG score
    df_campaign_rank = pd.concat([df_performance_metrics, df_campaign_ndcg], ignore_index=True)

    return campaign_predictions_max, campaign_predictions_min, df_campaign_rank
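
# Illustrative call (campaign 8 is one of the held-out campaigns defined above):
#   max_score, min_score, df_campaign_rank = rank_campaign(CAMPAIGN_ID=8, rank_cutoff=50)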


# Rank single lead
def rank_single_lead(CAMPAIGN_ID, LEAD_ID, rank_cutoff=50, ranker=lgbm_model_3):
    # The ranker can be passed as a fitted model or as one of the string aliases below
    if ranker == "Light GBM 1":
        ranker = lgbm_model_1
    elif ranker == "Light GBM 2":
        ranker = lgbm_model_2
    elif ranker == "Light GBM 3":
        ranker = lgbm_model_3

    # Select the single lead data and combine the text columns used for ranking
    single_lead_data = df_final_testing[(df_final_testing['campaign_id']==CAMPAIGN_ID) & (df_final_testing['lead_id']==LEAD_ID)]
    single_lead_data = combine_text(single_lead_data)

    # Preprocess the text features for the single lead
    single_lead_tfidf = vectorizer.transform(single_lead_data['combined_text'])

    # Predict single lead
    single_lead_pred = ranker.predict(single_lead_tfidf)
    single_lead_data['predicted_score'] = single_lead_pred

    # Scale predicted score between 0 and 1 using the max and min predicted scores of the whole campaign
    campaign_max_value, campaign_min_value, df_campaign_rank = rank_campaign(CAMPAIGN_ID, ranker, rank_cutoff)
    single_lead_data['scaled_predicted_score'] = (single_lead_data['predicted_score'] - campaign_min_value) / (campaign_max_value - campaign_min_value)

    # Define binary predictions based on rank_cutoff value
    cutoff_predictions = [1 if prediction >= (rank_cutoff/100) else 0 for prediction in single_lead_data['scaled_predicted_score']]
    single_lead_data['cutoff_prediction'] = cutoff_predictions
    
    # Rank employees (1 = highest predicted score) and create output dataframe
    single_lead_data = single_lead_data.sort_values(by='predicted_score', ascending=False)
    single_lead_data['ranking'] = range(1, len(single_lead_data) + 1)
    single_lead_data['scaled_predicted_score'] = single_lead_data['scaled_predicted_score'].round(3)
    single_lead_data['predicted_score'] = single_lead_data['predicted_score'].round(3)
    single_lead_data = single_lead_data[['ranking', 'scaled_predicted_score', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction', 'predicted_score']]

    # Top 3 dataframe (employees ranked 1-3 that also pass the cutoff)
    df_123 = single_lead_data[(single_lead_data['ranking'].isin([1, 2, 3])) & (single_lead_data['cutoff_prediction'] == 1)].sort_values(by='ranking')
    
    # k is 3 unless the lead has fewer than 4 employees
    k = min(len(single_lead_data), 3)

    # Compute NDCG score
    ndcg = round(ndcg_score(y_true=[single_lead_data['employee_is_selected']], y_score=[single_lead_pred], k=k), 3)

    df_ndcg_data = {'NDCG@k score': ndcg, 'k': k}
    df_ndcg = pd.DataFrame.from_dict(data=df_ndcg_data, orient='index', columns=['Value'])
    df_ndcg.reset_index(inplace=True, names=['Metric'])

    # Optionally print the overall NDCG score
    # print(f'NDCG Score on Test Data: {ndcg:.4f}')
    return single_lead_data, df_123, df_ndcg, df_campaign_rank
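

# Minimal usage sketch, assuming the parquet dump and pickled models loaded above are
# available on disk. Campaign 8 is one of the held-out campaigns; the lead id is chosen
# here purely for illustration (the lead with the most employees in that campaign).
if __name__ == '__main__':
    example_campaign_id = 8
    example_lead_id = (
        df_final_testing.loc[df_final_testing['campaign_id'] == example_campaign_id, 'lead_id']
        .value_counts()
        .idxmax()
    )
    ranked_lead, df_top3, df_ndcg, df_campaign_rank = rank_single_lead(example_campaign_id, example_lead_id)
    print(ranked_lead)
    print(df_ndcg)
    print(df_campaign_rank)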