import os
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from torch.utils.tensorboard import SummaryWriter
from tabulate import tabulate
# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)
# Define paths
model_path = "model/logistic_model.joblib"
vectorizer_path = "model/vectorizer.joblib"
log_dir = "logs"
metrics_log_path = os.path.join(log_dir, "metrics_log.txt")
# Create directories if they don't exist
os.makedirs("model", exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
# Initialize TensorBoard writer
writer = SummaryWriter(log_dir=log_dir)
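# The logged scalars can be inspected afterwards with: tensorboard --logdir logs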
# Step 1: Load and preprocess dataset
file_path = "dataset/rwandan_names.csv"
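# The CSV is assumed to hold at least 'name' and 'gender' columns, with
# 'gender' as string labels (the metrics below use pos_label="female").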
logging.info("Loading dataset from file...")
data = pd.read_csv(file_path)
logging.info("Dataset loaded. Checking for missing values...")
# Drop rows with NaN values in 'name' or 'gender' columns
data = data.dropna(subset=['name', 'gender'])
logging.info(f"Dataset loaded with {len(data)} records after dropping NaNs.")
# Step 2: Split dataset into training, validation, and test sets
logging.info("Splitting dataset into training, validation, and test sets...")
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
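# This yields an 80/10/10 train/validation/test split: 20% is held out first,
# then divided evenly between validation and test.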
logging.info(f"Training set size: {len(train_data)}")
logging.info(f"Validation set size: {len(val_data)}")
logging.info(f"Test set size: {len(test_data)}")
# Step 3: Initialize vectorizer and transform names
logging.info("Initializing vectorizer and transforming training data...")
vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 3))
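# Character n-grams capture spelling patterns; e.g. for an illustrative name
# "Keza" (input is lowercased by CountVectorizer by default), ngram_range=(2, 3)
# yields the features "ke", "ez", "za", "kez", "eza".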
X_train = vectorizer.fit_transform(train_data['name'])
logging.info("Training data transformed.")
logging.info("Transforming validation and test data...")
X_val = vectorizer.transform(val_data['name'])
X_test = vectorizer.transform(test_data['name'])
logging.info("Validation and test data transformation complete.")
# Extract target variables
y_train = train_data['gender']
y_val = val_data['gender']
y_test = test_data['gender']
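# Note: scikit-learn estimators accept string class labels directly, so the
# raw 'gender' values can be used as-is without explicit encoding.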
# Step 4: Initialize and train the model
logging.info("Initializing and training the Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
logging.info("Model training complete.")
# Store metrics for final table display
metrics_summary = []
# Define a function to calculate and log metrics
def calculate_metrics(y_true, y_pred, dataset_type="Validation", step=0):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label="female", average="binary")
    recall = recall_score(y_true, y_pred, pos_label="female", average="binary")
    f1 = f1_score(y_true, y_pred, pos_label="female", average="binary")
    # Log to TensorBoard with a specified step
    writer.add_scalar(f"{dataset_type}/Accuracy", accuracy, step)
    writer.add_scalar(f"{dataset_type}/Precision", precision, step)
    writer.add_scalar(f"{dataset_type}/Recall", recall, step)
    writer.add_scalar(f"{dataset_type}/F1-score", f1, step)
    # Append metrics to summary list for table display
    metrics_summary.append([dataset_type, accuracy, precision, recall, f1])
    # Write metrics to the log file and emit an INFO log line
    with open(metrics_log_path, "a") as log_file:
        log_file.write(f"\n{dataset_type} Metrics:\n")
        log_file.write(f"Accuracy: {accuracy:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1-score: {f1:.4f}\n")
    logging.info(f"{dataset_type} Metrics - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
# Step 5: Calculate and log metrics for validation and test sets
logging.info("Calculating metrics for validation set...")
y_val_pred = model.predict(X_val)
calculate_metrics(y_val, y_val_pred, dataset_type="Validation", step=1)
logging.info("Calculating metrics for test set...")
y_test_pred = model.predict(X_test)
calculate_metrics(y_test, y_test_pred, dataset_type="Test", step=2)
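# Optional extra (not part of the original pipeline): a confusion matrix makes
# it easy to see which class the model misclassifies more often.
from sklearn.metrics import confusion_matrix
logging.info(f"Test confusion matrix (rows=true, cols=pred):\n{confusion_matrix(y_test, y_test_pred)}")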
# Step 6: Save model and vectorizer
logging.info("Saving model and vectorizer to disk...")
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)
logging.info(f"Model saved to {model_path}")
logging.info(f"Vectorizer saved to {vectorizer_path}")
logging.info(f"Metrics logged to {metrics_log_path}")
# Print summary table
print("\nFinal Metrics Summary:")
print(tabulate(metrics_summary, headers=["Dataset", "Accuracy", "Precision", "Recall", "F1-Score"], floatfmt=".4f"))
# Close TensorBoard writer
writer.close()
logging.info("Training and logging completed.")