File size: 7,451 Bytes
f83431c
 
 
 
 
 
 
 
 
 
 
320450f
f83431c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Importing required packages
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Read the complete lead dataset from the parquet dump
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')


# Hold out three fixed campaigns as test examples for the final models
campaign_ids = [8, 123, 256]
df_final_testing = df.loc[df['campaign_id'].isin(campaign_ids)]


# Clean text
def clean_text(text):
    """Normalise a string to lowercase ASCII letters and digits.

    Every run of characters other than ``a-z``, ``A-Z`` and ``0-9`` is
    replaced by a single space, surrounding whitespace is stripped, and the
    result is lower-cased.
    """
    # Splitting on whitespace and re-joining collapses repeated spaces and
    # drops any leading/trailing space left behind by the substitution.
    tokens = re.sub(r'[^a-zA-Z0-9]+', ' ', text).split()
    return ' '.join(tokens).lower()


def combine_text(df_single_lead):
    """Build the cleaned ``combined_text`` column used as model input.

    The position, industry sector and employee count of each row are cast to
    strings, joined as ``"<position> <sector> <n_employees> employees"`` and
    passed through ``clean_text``.

    Args:
        df_single_lead: DataFrame with ``current_position``,
            ``industry_sector`` and ``n_employees`` columns.

    Returns:
        A copy of the input in which those three columns are strings and a
        ``combined_text`` column has been added. The input frame itself is
        left unmodified.
    """
    # Work on a copy: callers typically pass a filtered slice of a larger
    # frame, and assigning columns on such a slice mutates the caller's
    # object and triggers pandas' SettingWithCopyWarning.
    leads = df_single_lead.copy()

    # Changing column types so the columns can be concatenated as text
    for column in ('current_position', 'industry_sector', 'n_employees'):
        leads[column] = leads[column].astype('str')

    # Combine text columns, then clean the combined text
    combined = (
        leads['current_position'] + ' '
        + leads['industry_sector'] + ' '
        + leads['n_employees'] + ' employees'
    )
    leads['combined_text'] = combined.apply(clean_text)

    return leads


# Run batched inference with a sequence-classification model
def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    """Predict labels and class probabilities for a collection of texts.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes a ``logits`` tensor with one row per text
            and one column per class.
        tokenizer: Callable invoked on a list of strings with
            ``padding=True, truncation=True, return_tensors='pt'`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        X_test: Object providing ``to_list()`` (e.g. a pandas Series) that
            yields the texts to score.
        y_test: Not used for prediction; kept so existing callers that pass
            the true labels keep working.
        batch_size: Number of texts per forward pass, to prevent large
            memory allocation.

    Returns:
        A tuple ``(predicted_labels, predicted_probabilities)``: a list with
        the argmax class index per text, and a list with one list of softmax
        probabilities per text. Both are empty when ``X_test`` is empty.
    """
    text_test = X_test.to_list()

    predicted_labels_test = []
    predicted_proba_test = []

    # Inference only: without no_grad every forward pass records an autograd
    # graph, which wastes memory and time for no benefit.
    with torch.no_grad():
        for start_idx in range(0, len(text_test), batch_size):
            # Slicing past the end is safe, so the last batch may be shorter
            batch_text = text_test[start_idx:start_idx + batch_size]

            # Encode the batch
            encoded_data = tokenizer(batch_text, padding=True, truncation=True, return_tensors='pt')

            # Forward pass through the model
            logits = model(encoded_data['input_ids'], attention_mask=encoded_data['attention_mask']).logits

            # Predicted label = class with the highest logit
            predicted_labels_test.extend(logits.argmax(dim=1).tolist())

            # Softmax over the class dimension turns logits into probabilities
            predicted_proba_test.extend(F.softmax(logits, dim=-1).tolist())

    return predicted_labels_test, predicted_proba_test


# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    """Score predictions against the true labels.

    Returns a dict keyed by ``'F1 weighted'``, ``'F1'``, ``'Accuracy'``,
    ``'Recall'`` and ``'Precision'``, each value rounded to three decimals.
    Recall and precision are computed with ``zero_division=np.nan``.
    """
    raw_scores = {
        'F1 weighted': f1_score(true_labels, predictions, average='weighted'),
        'F1': f1_score(true_labels, predictions),
        'Accuracy': accuracy_score(true_labels, predictions),
        'Recall': recall_score(true_labels, predictions, zero_division=np.nan),
        'Precision': precision_score(true_labels, predictions, zero_division=np.nan),
    }

    # Round every score in one place instead of per metric
    return {metric: round(score, 3) for metric, score in raw_scores.items()}


# Loading XGB model
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load these artifacts from a trusted location.
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)

# Loading XGB vectorizer (a pickle as well, so the same trust caveat applies)
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)


# Loading BERT tokenizer
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')

# Loading BERT model (sequence classifier with 2 output labels)
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2', num_labels=2)


# Classify a single lead, or a full campaign, from the held-out test data
def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
    """Score held-out rows with the chosen model, rank them and evaluate.

    Args:
        CAMPAIGN_ID: Value matched against the ``campaign_id`` column of
            ``df_final_testing``.
        LEAD_ID: Value matched against the ``lead_id`` column; ignored when
            ``full_campaign`` is true.
        proba_cutoff: Threshold in percent. Rows whose probability of class 1
            is at least ``proba_cutoff / 100`` get ``cutoff_prediction`` 1,
            all others 0.
        model_type: ``'XGB'`` (TF-IDF + pickled XGB model) or ``'BERT'``
            (fine-tuned DistilBERT classifier).
        full_campaign: When true, score every row of the campaign instead of
            only the rows of one lead.

    Returns:
        A tuple ``(df, df_123, df_performance_metrics)``:
        ``df`` holds the scored rows ordered by descending probability with
        the columns ``ranking``, ``prediction_proba_1`` (rounded to 3
        decimals), ``current_position``, ``industry_sector``,
        ``employee_is_selected`` and ``cutoff_prediction``;
        ``df_123`` holds the rows ranked 1-3 that also pass the cutoff;
        ``df_performance_metrics`` has one row per metric with the columns
        ``Metric`` and ``Score``.

    Raises:
        ValueError: If ``model_type`` is neither ``'XGB'`` nor ``'BERT'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError later
    if model_type not in ('XGB', 'BERT'):
        raise ValueError(f"model_type must be 'XGB' or 'BERT', got {model_type!r}")

    # Select the full campaign, or narrow it down to a single lead
    selection = df_final_testing['campaign_id'] == CAMPAIGN_ID
    if not full_campaign:
        selection = selection & (df_final_testing['lead_id'] == LEAD_ID)

    # Copy so the columns added below never write into a slice of the
    # shared df_final_testing frame
    leads = df_final_testing[selection].copy()

    # True labels, in the same row order as the predictions computed below
    true_labels = leads['employee_is_selected'].tolist()

    # Combining text columns
    leads = combine_text(leads)

    if model_type == 'XGB':
        # TF-IDF features are only needed by the XGB model
        tfidf_matrix = vectorizer.transform(leads['combined_text'])
        # Prediction probabilities of being 1 (selected)
        predictions_proba_1 = xgb_model_tuned_2.predict_proba(tfidf_matrix)[:, 1].tolist()
    else:
        _, predicted_test_proba = model_predict(model=distil_bert_model_tuned_2,
                                                tokenizer=distil_bert_tokenizer_tuned_2,
                                                X_test=leads['combined_text'],
                                                y_test=leads['employee_is_selected'])
        # Prediction probabilities of being 1 (selected)
        predictions_proba_1 = [class_proba[1] for class_proba in predicted_test_proba]

    # Derive the 0/1 predictions from the proba_cutoff percentage
    threshold = proba_cutoff / 100
    cutoff_predictions = [1 if probability >= threshold else 0 for probability in predictions_proba_1]

    leads['cutoff_prediction'] = cutoff_predictions
    leads['prediction_proba_1'] = predictions_proba_1

    # Sort exactly once, stably, on the unrounded probabilities. Sorting
    # again after rounding could reorder rows that tie at 3 decimals, so the
    # ranking column would no longer match the row order.
    leads = leads.sort_values(by='prediction_proba_1', ascending=False, kind='mergesort')
    leads['ranking'] = range(1, len(leads) + 1)
    leads['prediction_proba_1'] = leads['prediction_proba_1'].round(3)

    leads = leads[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector',
                   'employee_is_selected', 'cutoff_prediction']].copy()

    # Top three ranked rows that also pass the cutoff (already in rank order)
    df_123 = leads[(leads['ranking'] <= 3) & (leads['cutoff_prediction'] == 1)]

    # Metrics compare the cutoff predictions with the true labels; both lists
    # are in the original (pre-sort) row order
    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    return leads, df_123, df_performance_metrics