Spaces:
Sleeping
Sleeping
File size: 7,451 Bytes
f83431c 320450f f83431c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# Importing required packages
import pickle
import pandas as pd
import re
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Read the complete lead dataset from the parquet dump
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')
# Keep three campaigns apart as test examples for the final models
campaign_ids = [8, 123, 256]
is_held_out = df['campaign_id'].isin(campaign_ids)
df_final_testing = df[is_held_out]
# Clean text
def clean_text(text):
    """Normalize *text* to lowercase alphanumeric tokens joined by single spaces.

    Every run of characters other than ASCII letters and digits is replaced
    by one space, surrounding whitespace is dropped, and the result is
    lowercased.
    """
    # Anything that is not an ASCII letter or digit acts as a token separator
    spaced = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    # split() without arguments discards empty tokens, which trims both ends
    tokens = spaced.split()
    return ' '.join(tokens).lower()
def combine_text(df_single_lead):
    """Add a cleaned ``combined_text`` column built from three lead columns.

    The ``current_position``, ``industry_sector`` and ``n_employees`` columns
    are cast to strings, joined with spaces, suffixed with ``' employees'``
    and passed through ``clean_text``.

    The frame is modified in place and the same object is returned.
    """
    # Cast the source columns to strings so they can be concatenated
    for column in ('current_position', 'industry_sector', 'n_employees'):
        df_single_lead[column] = df_single_lead[column].astype('str')
    # Concatenate in a fixed order: position, sector, head count
    merged = (
        df_single_lead['current_position']
        + ' '
        + df_single_lead['industry_sector']
        + ' '
        + df_single_lead['n_employees']
        + ' employees'
    )
    # Store the normalized text
    df_single_lead['combined_text'] = merged.apply(clean_text)
    return df_single_lead
# Function to test model performance
def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    """Run batched inference and return predicted labels and probabilities.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``;
            the returned object must expose a ``logits`` tensor.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, return_tensors='pt')``; the result must provide
            ``'input_ids'`` and ``'attention_mask'``.
        X_test: Object with a ``to_list()`` method yielding the input texts.
        y_test: Not used for prediction; kept so existing callers keep working.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A ``(labels, probabilities)`` tuple: ``labels`` holds the argmax class
        index per text, ``probabilities`` the softmax output per text as a
        list of floats.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Local import: the file only imports torch.nn.functional at module level
    import torch

    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    text_test = X_test.to_list()
    predicted_labels_test = []
    predicted_proba_test = []

    # Inference only: disabling autograd avoids building a graph per batch
    with torch.no_grad():
        # Process the texts in batches to prevent large memory allocation
        for start_idx in range(0, len(text_test), batch_size):
            batch_text = text_test[start_idx:start_idx + batch_size]
            # Encode the batch
            encoded_data = tokenizer(batch_text, padding=True, truncation=True, return_tensors='pt')
            # Forward pass through the model
            logits = model(encoded_data['input_ids'], attention_mask=encoded_data['attention_mask']).logits
            # Class index with the highest logit for each text
            predicted_labels_test.extend(logits.argmax(dim=1).tolist())
            # Softmax over the class dimension turns logits into probabilities
            predicted_proba_test.extend(F.softmax(logits, dim=-1).tolist())

    return predicted_labels_test, predicted_proba_test
# Calculate performance metrics
def compute_metrics(predictions, true_labels):
    """Return classification scores for *predictions* rounded to 3 decimals.

    The result is a dict with the keys ``'F1 weighted'``, ``'F1'``,
    ``'Accuracy'``, ``'Recall'`` and ``'Precision'``. Recall and precision
    are computed with ``zero_division=np.nan``.
    """
    def rounded(score):
        # All reported scores share the same precision
        return round(score, 3)

    return {
        'F1 weighted': rounded(f1_score(true_labels, predictions, average='weighted')),
        'F1': rounded(f1_score(true_labels, predictions)),
        'Accuracy': rounded(accuracy_score(true_labels, predictions)),
        'Recall': rounded(recall_score(true_labels, predictions, zero_division=np.nan)),
        'Precision': rounded(precision_score(true_labels, predictions, zero_division=np.nan)),
    }
# Load the tuned XGB model
# NOTE: pickle.load can execute code stored in the file; only load trusted artifacts
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)
# Load the vectorizer stored alongside the XGB model
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
# Load the BERT tokenizer
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')
# Load the BERT sequence-classification model with two output labels
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2',
    num_labels=2,
)
# Classify single lead data
def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
    """Score, rank and evaluate held-out leads with the selected model.

    Args:
        CAMPAIGN_ID: Value matched against the ``campaign_id`` column of
            ``df_final_testing``.
        LEAD_ID: Value matched against the ``lead_id`` column; ignored when
            ``full_campaign`` is set.
        proba_cutoff: Threshold in percent. A row is predicted as selected (1)
            when its class-1 probability is >= ``proba_cutoff / 100``.
        model_type: ``'XGB'`` (TF-IDF features + pickled XGB model) or
            ``'BERT'`` (tokenizer + sequence-classification model).
        full_campaign: When set, score every row of the campaign instead of a
            single lead.

    Returns:
        A tuple of three DataFrames:
        1. all scored rows ordered by descending class-1 probability, with the
           columns ``ranking``, ``prediction_proba_1`` (rounded to 3 decimals),
           ``current_position``, ``industry_sector``, ``employee_is_selected``
           and ``cutoff_prediction``;
        2. the rows ranked 1-3 whose ``cutoff_prediction`` is 1;
        3. the metrics from ``compute_metrics`` as ``Metric`` / ``Score`` rows.

    Raises:
        ValueError: If ``model_type`` is neither ``'XGB'`` nor ``'BERT'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError later on
    if model_type not in ('XGB', 'BERT'):
        raise ValueError(f"model_type must be 'XGB' or 'BERT', got {model_type!r}")

    # Select the full campaign, or narrow it down to a single lead
    selection = df_final_testing['campaign_id'] == CAMPAIGN_ID
    if not full_campaign:
        selection = selection & (df_final_testing['lead_id'] == LEAD_ID)
    # Explicit copy: columns are added below, which must not target a slice
    leads = df_final_testing[selection].copy()

    # True labels, in the same row order as the predictions computed below
    true_labels = leads['employee_is_selected'].tolist()

    # Combining text columns
    leads = combine_text(leads)

    # Probability of class 1 (selected) for every row
    if model_type == 'XGB':
        # Vectorize text with the tfidf vectorizer (only needed for XGB)
        tfidf_matrix = vectorizer.transform(leads['combined_text'])
        predictions_proba_1 = xgb_model_tuned_2.predict_proba(tfidf_matrix)[:, 1].tolist()
    else:
        _, predicted_test_proba = model_predict(model=distil_bert_model_tuned_2,
                                                tokenizer=distil_bert_tokenizer_tuned_2,
                                                X_test=leads['combined_text'],
                                                y_test=leads['employee_is_selected'])
        predictions_proba_1 = [class_probas[1] for class_probas in predicted_test_proba]

    # Derive binary predictions from the proba_cutoff percentage
    threshold = proba_cutoff / 100
    cutoff_predictions = [1 if probability >= threshold else 0 for probability in predictions_proba_1]

    # Attach predictions and rank rows by descending probability
    leads['cutoff_prediction'] = cutoff_predictions
    leads['prediction_proba_1'] = predictions_proba_1
    # Stable sort on the unrounded values keeps tied rows in a deterministic
    # order, so the ranking column always matches the row order
    leads = leads.sort_values(by='prediction_proba_1', ascending=False, kind='stable')
    leads['ranking'] = list(range(1, len(leads) + 1))
    # Round only after ranking so that rounding cannot introduce new ties
    leads['prediction_proba_1'] = leads['prediction_proba_1'].round(3)
    leads = leads[['ranking', 'prediction_proba_1', 'current_position', 'industry_sector', 'employee_is_selected', 'cutoff_prediction']]

    # Top three rows, restricted to those predicted as selected
    df_123 = leads[(leads['ranking'].isin([1, 2, 3])) & (leads['cutoff_prediction'] == 1)]

    # Metrics compare the cutoff predictions with the true labels
    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    return leads, df_123, df_performance_metrics
|