|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import sklearn |
|
|
import pickle |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from sklearn.pipeline import make_pipeline |
|
|
from sklearn.feature_extraction import DictVectorizer |
|
|
|
|
|
|
|
|
# Report the library versions in use, one "name==version" entry per line,
# so the environment that produced the model can be reproduced.
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)
|
|
|
|
|
|
|
|
|
|
|
def load_data(data_url=None):
    """Load the lead-scoring dataset into a DataFrame.

    Args:
        data_url: Location of the CSV to read, passed straight to
            ``pd.read_csv`` (so a URL, a local path, or a file-like object
            all work). When omitted, the course lead-scoring CSV hosted on
            GitHub is downloaded.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    if data_url is None:
        # Default source kept identical to the original hard-coded location.
        data_url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"

    return pd.read_csv(data_url)
|
|
|
|
|
|
|
|
|
|
|
def train_model(df):
    """Fit a DictVectorizer + logistic-regression pipeline on ``df``.

    Features are the categorical column ``lead_source`` and the numeric
    columns ``number_of_courses_viewed`` and ``annual_income``; the target
    is ``df.converted``. Missing categorical values are replaced with the
    string ``'NA'`` and missing numeric values with ``0`` before fitting.

    The input DataFrame is left unmodified: missing values are filled on a
    copy of the feature columns only.

    Args:
        df: DataFrame containing the three feature columns and the
            ``converted`` target column.

    Returns:
        The fitted scikit-learn pipeline.
    """
    categorical = ['lead_source']
    numeric = ['number_of_courses_viewed', 'annual_income']

    # Work on an explicit copy so the fillna assignments below do not write
    # back into the caller's DataFrame.
    features = df[categorical + numeric].copy()
    features[categorical] = features[categorical].fillna('NA')
    features[numeric] = features[numeric].fillna(0)

    # DictVectorizer consumes one {column: value} dict per row.
    train_dict = features.to_dict(orient='records')

    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver='liblinear'),
    )

    y_train = df.converted
    pipeline.fit(train_dict, y_train)

    return pipeline
|
|
|
|
|
|
|
|
def save_model(filename, model):
    """Serialize ``model`` with pickle into ``filename`` and announce it.

    Args:
        filename: Path of the file to create or overwrite (opened in
            binary write mode).
        model: Any picklable object; here, the trained pipeline.
    """
    with open(filename, mode='wb') as sink:
        writer = pickle.Pickler(sink)
        writer.dump(model)

    print(f"Model saved to {filename}")
|
|
|
|
|
|
|
|
# End-to-end run: fetch the dataset, fit the pipeline, and persist it
# to 'model.bin' in the current working directory.
df = load_data()
pipeline = train_model(df)
save_model(filename='model.bin', model=pipeline)
|
|
|
|
|
|
|
|
|