# sks01dev's picture
# Upload 15 files
# d64524a verified
#!/usr/bin/env python
# coding: utf-8
# This is a starter notebook for an updated module 5 of ML Zoomcamp
#
# The code is based on modules 3 and 4, which use the [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn) dataset.
# This script itself trains on the course lead scoring dataset loaded in `load_data`.
# Import the necessary libraries
import numpy as np
import pandas as pd
import sklearn
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
# Report the versions of the core libraries in use, one per line.
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)
# Load the data
def load_data(data_url=None) -> pd.DataFrame:
    """Read the training data from a CSV source into a DataFrame.

    Args:
        data_url: Path, URL, or file-like object accepted by
            ``pandas.read_csv``. When omitted (``None``), the course lead
            scoring CSV that this script trains on is used.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    if data_url is None:
        # Default source, kept so that existing ``load_data()`` calls
        # behave exactly as before.
        data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
    return pd.read_csv(data_url)
def train_model(df):
    """Fit a DictVectorizer + logistic regression pipeline on ``df``.

    Missing values are replaced with ``'NA'`` in the categorical feature
    and ``0`` in the numeric features before vectorization. The filling is
    done on a separate frame of the selected feature columns, so the
    caller's DataFrame is not modified.

    Args:
        df: DataFrame providing the feature columns ``lead_source``,
            ``number_of_courses_viewed`` and ``annual_income`` and the
            target column ``converted``.

    Returns:
        The fitted pipeline (``DictVectorizer`` followed by
        ``LogisticRegression(solver='liblinear')``).
    """
    # Preprocessing using DictVectorizer and training the Logistic Regression model
    categorical = ['lead_source']
    numeric = ['number_of_courses_viewed', 'annual_income']

    # Per-column fill values: 'NA' for categorical columns, 0 for numeric ones.
    fill_values = {
        **dict.fromkeys(categorical, 'NA'),
        **dict.fromkeys(numeric, 0),
    }
    # fillna returns a new frame, leaving the input DataFrame untouched.
    features = df[categorical + numeric].fillna(fill_values)
    train_dict = features.to_dict(orient='records')

    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver='liblinear'),
    )

    # the target variable
    y_train = df.converted
    pipeline.fit(train_dict, y_train)
    return pipeline
def save_model(filename, model):
    """Pickle ``model`` to ``filename`` and print a confirmation line.

    Args:
        filename: Path of the output file; it is opened in binary write mode.
        model: The object to serialize with ``pickle.dump``.
    """
    confirmation = f"Model saved to {filename}"

    with open(filename, 'wb') as handle:
        pickle.dump(model, handle)

    print(confirmation)
# Script driver: load the data, fit the pipeline, and pickle it to 'model.bin'.
# NOTE(review): these statements sit at module level, so they also run when
# the file is imported, not only when it is executed as a script.
df = load_data()
pipeline = train_model(df)
save_model('model.bin', pipeline)