import pandas as pd | |
import numpy as np | |
from sklearn.cluster import KMeans | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.pipeline import Pipeline | |
def get_scaler(data_path="data.csv", *, n_clusters=20, random_state=None):
    """Load a CSV, cluster its numeric columns, and return data plus model.

    Despite the name, the returned estimator is the whole fitted pipeline
    (a ``StandardScaler`` step followed by a ``KMeans`` step), not only the
    scaler.

    Args:
        data_path: Path of the CSV file to load. Defaults to ``"data.csv"``,
            the location the function originally had hard-coded.
        n_clusters: Number of KMeans clusters to fit. Defaults to 20.
        random_state: Optional seed forwarded to ``KMeans``. The default
            ``None`` leaves centroid initialisation unseeded, so labels may
            differ between runs.

    Returns:
        A ``(data, pipeline)`` tuple: ``data`` is the loaded DataFrame with an
        added ``"cluster_label"`` column, and ``pipeline`` is the fitted
        ``Pipeline`` with steps named ``"scaler"`` and ``"kmeans"``.

    Note:
        Missing values in the numeric columns are not handled here; they are
        passed to the pipeline as-is.
    """
    data = pd.read_csv(data_path)

    song_cluster_pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "kmeans",
                KMeans(
                    n_clusters=n_clusters,
                    verbose=False,
                    random_state=random_state,
                ),
            ),
        ],
        verbose=False,
    )

    # Only numeric columns can be scaled and clustered; other dtypes are
    # excluded from the feature matrix but kept in the returned DataFrame.
    features = data.select_dtypes(np.number)

    data["cluster_label"] = song_cluster_pipeline.fit_predict(features)
    return data, song_cluster_pipeline