# Importing required packages
import pickle
import re

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Loading data
parquet_file = 'data/data_dump_ai_assingment.parquet'
df = pd.read_parquet(parquet_file, engine='pyarrow')

# Setting 3 random campaigns aside as testing examples for final models
campaign_ids = [8, 123, 256]
df_final_testing = df[df['campaign_id'].isin(campaign_ids)]

# Compiled once at import time; matches every run of characters that is not
# an ASCII letter or digit.
_NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9]+')


def clean_text(text):
    """Normalise a free-text string for vectorisation/tokenisation.

    Every run of characters other than ASCII letters and digits is replaced
    by a single space, surrounding whitespace is stripped, and the result is
    lower-cased.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned_text = _NON_ALPHANUMERIC.sub(' ', text)
    # split()/join collapses repeated spaces and trims both ends.
    return ' '.join(cleaned_text.split()).lower()


def combine_text(df_single_lead):
    """Build the ``combined_text`` feature column for a frame of leads.

    The columns ``current_position``, ``industry_sector`` and ``n_employees``
    are cast to strings, concatenated as
    ``"<position> <sector> <n_employees> employees"`` and passed through
    :func:`clean_text`.

    Args:
        df_single_lead: DataFrame with the three columns named above. It is
            not modified; the function works on a copy.

    Returns:
        A copy of the input with the three source columns cast to ``str`` and
        an additional ``combined_text`` column.
    """
    # Work on a copy: the input is typically a filtered slice of a larger
    # frame, and assigning columns on it in place triggers pandas'
    # SettingWithCopyWarning and mutates the caller's object.
    df_single_lead = df_single_lead.copy()

    # Changing column types
    for column in ('current_position', 'industry_sector', 'n_employees'):
        df_single_lead[column] = df_single_lead[column].astype('str')

    # Combine text columns
    df_single_lead['combined_text'] = (
        df_single_lead['current_position']
        + ' ' + df_single_lead['industry_sector']
        + ' ' + df_single_lead['n_employees']
        + ' employees'
    )

    # Clean text
    df_single_lead['combined_text'] = df_single_lead['combined_text'].apply(clean_text)
    return df_single_lead


def model_predict(model, tokenizer, X_test, y_test, batch_size=32):
    """Run a sequence-classification model over texts in mini-batches.

    Args:
        model: Model that is called as
            ``model(input_ids, attention_mask=...)`` and returns an object
            with a ``logits`` attribute.
        tokenizer: Callable that encodes a list of strings and returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        X_test: Object with a ``to_list()`` method yielding the texts
            (e.g. a pandas Series).
        y_test: Accepted for interface compatibility; the labels are not
            used during prediction.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        A tuple ``(predicted_labels, predicted_probabilities)``: the argmax
        class index per text, and the softmax probabilities per text as a
        list of per-class floats.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    text_test = X_test.to_list()

    predicted_labels_test = []
    predicted_proba_test = []

    # Inference only: switch off dropout and gradient tracking so that no
    # autograd graph is kept alive per batch (the point of batching here is
    # to limit memory use).
    model.eval()
    with torch.no_grad():
        # Split the test data into batches to prevent large memory allocation
        for start_idx in range(0, len(text_test), batch_size):
            batch_text = text_test[start_idx:start_idx + batch_size]

            # Encode the batch
            encoded_data = tokenizer(batch_text, padding=True, truncation=True,
                                     return_tensors='pt')

            # Forward pass through the model
            logits = model(encoded_data['input_ids'],
                           attention_mask=encoded_data['attention_mask']).logits

            # Predicted label = index of the largest logit
            predicted_labels_test.extend(logits.argmax(dim=1).tolist())

            # Softmax turns logits into per-class probabilities
            predicted_proba_test.extend(F.softmax(logits, dim=-1).tolist())

    return predicted_labels_test, predicted_proba_test


def compute_metrics(predictions, true_labels):
    """Compute rounded classification metrics for binary predictions.

    Args:
        predictions: Predicted labels.
        true_labels: Ground-truth labels, in the same order.

    Returns:
        Dict with the keys ``'F1 weighted'``, ``'F1'``, ``'Accuracy'``,
        ``'Recall'`` and ``'Precision'``, each rounded to 3 decimals.
        Recall and precision are computed with ``zero_division=np.nan``.
    """
    return {
        'F1 weighted': round(f1_score(true_labels, predictions, average='weighted'), 3),
        'F1': round(f1_score(true_labels, predictions), 3),
        'Accuracy': round(accuracy_score(true_labels, predictions), 3),
        'Recall': round(recall_score(true_labels, predictions, zero_division=np.nan), 3),
        'Precision': round(precision_score(true_labels, predictions, zero_division=np.nan), 3),
    }


# NOTE: pickle.load executes arbitrary code contained in the file. Only load
# model artifacts that were produced by a trusted source.

# Loading XGB model
with open('models/xgb_tuned_2/xgb_model_tuned_2.pkl', 'rb') as model_file:
    xgb_model_tuned_2 = pickle.load(model_file)

# Loading XGB vectorizer
with open('models/xgb_tuned_2/vectorizer.pkl', 'rb') as model_file:
    vectorizer = pickle.load(model_file)

# Loading fine-tuned DistilBERT artifacts (tokenizer and model)
# Loading BERT tokenizer
distil_bert_tokenizer_tuned_2 = AutoTokenizer.from_pretrained('models/distil_bert_tuned_2')

# Loading BERT model
distil_bert_model_tuned_2 = AutoModelForSequenceClassification.from_pretrained(
    'models/distil_bert_tuned_2', num_labels=2)


def classify(CAMPAIGN_ID, LEAD_ID, proba_cutoff=50, model_type='XGB', full_campaign=False):
    """Score and rank leads from the held-out test campaigns.

    Args:
        CAMPAIGN_ID: Value matched against the ``campaign_id`` column of
            ``df_final_testing``.
        LEAD_ID: Value matched against the ``lead_id`` column; ignored when
            ``full_campaign`` is true.
        proba_cutoff: Threshold in percent (it is divided by 100). A lead is
            predicted as selected (1) when its probability of class 1 is at
            least this threshold.
        model_type: ``'XGB'`` (TF-IDF vectorizer + ``xgb_model_tuned_2``) or
            ``'BERT'`` (``distil_bert_model_tuned_2``).
        full_campaign: When true, score every lead of the campaign instead of
            a single lead.

    Returns:
        A tuple of three DataFrames:

        * all scored leads ordered by descending probability, with the
          columns ``ranking``, ``prediction_proba_1`` (rounded to 3
          decimals), ``current_position``, ``industry_sector``,
          ``employee_is_selected`` and ``cutoff_prediction``;
        * the subset of those rows ranked 1-3 whose ``cutoff_prediction``
          is 1;
        * the metrics from ``compute_metrics`` with the columns ``Metric``
          and ``Score``.

    Raises:
        ValueError: If ``model_type`` is neither ``'XGB'`` nor ``'BERT'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError later.
    if model_type not in ('XGB', 'BERT'):
        raise ValueError(
            f"model_type must be 'XGB' or 'BERT', got {model_type!r}")

    # Select the full campaign, or a single lead within it
    selection_mask = df_final_testing['campaign_id'] == CAMPAIGN_ID
    if not full_campaign:
        selection_mask = selection_mask & (df_final_testing['lead_id'] == LEAD_ID)
    # Copy so the columns added below never touch df_final_testing.
    leads = df_final_testing[selection_mask].copy()

    # True labels (kept in the original row order, same as the predictions)
    true_labels = leads['employee_is_selected'].tolist()

    # Combining text columns
    leads = combine_text(leads)

    # Prediction probabilities of being 1 (selected)
    if model_type == 'XGB':
        # Vectorize text with tfidf vectorizer (only needed for this model)
        tfidf_matrix = vectorizer.transform(leads['combined_text'])
        predictions_proba_1 = xgb_model_tuned_2.predict_proba(tfidf_matrix)[:, 1].tolist()
    else:
        _, predicted_test_proba = model_predict(
            model=distil_bert_model_tuned_2,
            tokenizer=distil_bert_tokenizer_tuned_2,
            X_test=leads['combined_text'],
            y_test=leads['employee_is_selected'])
        predictions_proba_1 = [class_proba[1] for class_proba in predicted_test_proba]

    # Alter predictions based on proba_cutoff value
    threshold = proba_cutoff / 100
    cutoff_predictions = [1 if probability >= threshold else 0
                          for probability in predictions_proba_1]

    # Create dataframe columns and ranking. The ranking is assigned on the
    # unrounded probabilities; a stable sort keeps ties in input order.
    leads['cutoff_prediction'] = cutoff_predictions
    leads['prediction_proba_1'] = predictions_proba_1
    leads = leads.sort_values(by='prediction_proba_1', ascending=False, kind='stable')
    leads['ranking'] = np.arange(1, len(leads) + 1)
    leads['prediction_proba_1'] = leads['prediction_proba_1'].round(3)

    # Rows are already in ranking order. They are deliberately not re-sorted
    # on the rounded probability, which could reorder ties against `ranking`.
    leads = leads[['ranking', 'prediction_proba_1', 'current_position',
                   'industry_sector', 'employee_is_selected', 'cutoff_prediction']]

    # Top-3 ranked leads that also pass the cutoff
    df_123 = leads[(leads['ranking'] <= 3) & (leads['cutoff_prediction'] == 1)]

    performance_metrics = compute_metrics(cutoff_predictions, true_labels)
    df_performance_metrics = pd.DataFrame.from_dict(
        performance_metrics, orient='index', columns=['Score'])
    df_performance_metrics.reset_index(inplace=True, names=['Metric'])

    return leads, df_123, df_performance_metrics