# File size: 4,232 Bytes
# bfb4bc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os

import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import skops
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Load the training data and describe how each group of columns is prepared
# before it reaches the classifier.

df = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv")

# Columns are selected by dtype while "id" is still part of the frame.
# NOTE(review): if "loading" is stored as float64 it is matched by this
# selection as well as by its dedicated imputer below - confirm the
# duplicated column is intended.
float_columns = list(df.columns[df.dtypes == 'float64'])

# Mean-imputation steps for the numeric inputs.
imputer_steps = [
    ("loading_missing_value_imputer", SimpleImputer(strategy="mean"), ["loading"]),
    ("numerical_missing_value_imputer", SimpleImputer(strategy="mean"), float_columns),
]

# One one-hot encoder per categorical column, each named "<column>_encoder".
encoder_steps = [
    (f"{column}_encoder", OneHotEncoder(categories="auto"), [column])
    for column in ("attribute_0", "attribute_1", "product_code")
]

column_transformer_pipeline = ColumnTransformer(imputer_steps + encoder_steps)

# The row identifier is not used as a feature.
df = df.drop(columns=["id"])

# Column transformation followed by a depth-limited decision tree.
pipeline = Pipeline(
    steps=[
        ('transformation', column_transformer_pipeline),
        ('model', DecisionTreeClassifier(max_depth=4)),
    ]
)

# Features are every remaining column except the target, "failure".
y = df["failure"]
X = df.drop(columns=["failure"])

# Hold out a test split and fit the pipeline on the training split.
# No random_state is passed, so the split differs from run to run.

from sklearn.model_selection import train_test_split

splits = train_test_split(X, y)
X_train, X_test, y_train, y_test = splits
pipeline.fit(X_train, y_train)

# From here on skops takes over: a local repository is initialized,
# a model card is created, and the model is pushed to the
# Hugging Face Hub.
from skops import card, hub_utils
import pickle

model_path = "model.pkl"
local_repo = "decision-tree-playground-kaggle"

# serialize the fitted pipeline to disk
with open(model_path, "wb") as model_file:
    model_file.write(pickle.dumps(pipeline))

# initialize the local repository from the pickled model; the held-out
# features are handed over as the `data` argument
init_options = {
    "model": model_path,
    "requirements": [f"scikit-learn={sklearn.__version__}"],
    "dst": local_repo,
    "task": "tabular-classification",
    "data": X_test,
}
hub_utils.init(**init_options)

# initialize the model card, taking its metadata from the configuration
# found in the local repository
from pathlib import Path
model_card = card.Card(pipeline, metadata=card.metadata_from_config(Path(local_repo)))

## let's fill some information about the model
limitations = "This model is not ready to be used in production."
model_description = "This is a DecisionTreeClassifier model built for Kaggle Tabular Playground Series August 2022, trained on supersoaker production failures dataset."
model_card_authors = "huggingface"
# The file path is wrapped in quotes so the snippet embedded in the card is
# valid Python; unquoted, it would be read as an expression and fail to run.
get_started_code = (
    "import pickle\n"
    f"with open('{local_repo}/{model_path}', 'rb') as file:\n"
    "    clf = pickle.load(file)"
)

# pass this information to the card
model_card.add(
    get_started_code=get_started_code,
    model_card_authors=model_card_authors,
    limitations=limitations,
    model_description=model_description,
)

# we will now evaluate the model and write eval results to the card
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix

# let's make a prediction first: both metrics and the confusion matrix below
# read y_pred, so it has to be computed before any of them
y_pred = pipeline.predict(X_test)

model_card.add(eval_method="The model is evaluated using test split, on accuracy and F1 score with micro average.")
model_card.add_metrics(accuracy=accuracy_score(y_test, y_pred))
model_card.add_metrics(**{"f1 score": f1_score(y_test, y_pred, average="micro")})

# the classifier is the last step of the pipeline
model = pipeline.steps[-1][1]

# we will plot the tree and add the plot to our card
from sklearn.tree import plot_tree
plt.figure()
plot_tree(model, filled=True)
plt.savefig(f'{local_repo}/tree.png', format='png', bbox_inches="tight")

# build and draw the confusion matrix for the test split
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

# save the plot
plt.savefig(Path(local_repo) / "confusion_matrix.png")

# add figures to model card with their new sections as keys to the dictionary
# The image paths are given relative to the repository root, i.e. the folder
# README.md is saved into below. Both images were written directly into that
# folder, so prefixing them with the folder name again would make the links
# in the rendered card point at a location that does not exist.
model_card.add_plot(**{"Tree Plot": "tree.png", "Confusion Matrix": "confusion_matrix.png"})

# save the card
model_card.save(f"{local_repo}/README.md")

# we can now push the model!
# if the repository doesn't exist remotely on the Hugging Face Hub, it will be created when we set create_remote to True
repo_id = "scikit-learn/tabular-playground"

# The access token is read from the environment so it never has to be written
# into the script. When HF_TOKEN is not set this is None.
# NOTE(review): skops documents `token` as optional, falling back to an
# existing `huggingface-cli login` session - confirm for the installed version.
token = os.environ.get("HF_TOKEN")

hub_utils.push(
    repo_id=repo_id,
    source=local_repo,
    token=token,
    commit_message="pushing files to the repo from the example!",
    create_remote=True,
)