Spaces:
Sleeping
Sleeping
File size: 2,201 Bytes
51abf05 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 f844302 5a56710 51abf05 5a56710 51abf05 8c9dd24 5a56710 51abf05 5a56710 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import joblib
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
# Custom Transformer: UnitPriceTransformer
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Derive a single ``unit_price`` feature as ``sales / quantity``."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a one-column frame with ``sales`` divided by ``quantity``.

        The caller's frame is left untouched: ``assign`` builds the new
        column on a copy. The result keeps the row index of ``X`` and
        contains only the ``unit_price`` column.
        """
        with_price = X.assign(unit_price=X['sales'] / X['quantity'])
        return with_price[['unit_price']]
# Custom Transformer: KMeansAndLabelTransformer
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows on ``unit_price`` and label each row ``"<cluster>_<sub_category>"``.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters passed to the underlying ``KMeans``.

    Attributes
    ----------
    kmeans : KMeans
        The wrapped clustering model (``random_state=42``). The attribute
        name is kept as-is so instances pickled with the earlier version
        of this class still expose the attribute ``transform`` reads.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit the wrapped KMeans on the ``unit_price`` column of ``X``."""
        # ``set_params(n_clusters=...)`` only updates the attribute on this
        # wrapper; push the current value into the inner model so the
        # change actually takes effect when fitting.
        self.kmeans.set_params(n_clusters=self.n_clusters)
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a one-column frame ``distinct_cluster_label``.

        Each value is the predicted cluster id, an underscore, and the
        row's ``sub_category``. The result keeps the row index of ``X``.
        """
        # Wrap the predictions in a Series aligned to X so the string
        # concatenation is done by pandas. Adding a Python str directly to
        # a NumPy unicode array is not supported on older NumPy releases.
        cluster_ids = pd.Series(
            self.kmeans.predict(X[['unit_price']]), index=X.index
        )
        labels = cluster_ids.astype(str) + "_" + X['sub_category']
        return labels.to_frame('distinct_cluster_label')
# Custom Transformer: DynamicOneHotEncoder
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the ``distinct_cluster_label`` column into a DataFrame."""

    def __init__(self):
        # handle_unknown='ignore': labels not seen during fit do not raise
        # at transform time.
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Fit the wrapped encoder on the ``distinct_cluster_label`` column."""
        label_column = X[['distinct_cluster_label']]
        self.encoder.fit(label_column)
        return self

    def transform(self, X):
        """Return the dense one-hot encoding as a DataFrame.

        Columns are named by the encoder's ``get_feature_names_out``.
        No index is passed to the DataFrame constructor, so the result
        carries a fresh default index rather than the index of ``X``.
        """
        label_column = X[['distinct_cluster_label']]
        dense_matrix = self.encoder.transform(label_column).toarray()
        column_names = self.encoder.get_feature_names_out(['distinct_cluster_label'])
        return pd.DataFrame(dense_matrix, columns=column_names)
# Load the pipeline and model
# Both artifacts are read once, at import time, from paths relative to the
# current working directory; importing this module fails if either file is
# missing.
# NOTE(review): presumably the pickled pipeline contains instances of the
# transformer classes defined above, in which case those classes must stay
# defined under the same names before this point - confirm against the
# training script.
# NOTE: joblib.load unpickles its input; only load files from a trusted source.
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
model = joblib.load('best_model.pkl')
def make_prediction(input_features):
    """Run one record through the loaded pipeline and model.

    ``input_features`` is wrapped in a list and turned into a one-row
    DataFrame, transformed by the module-level ``pipeline``, and passed to
    the module-level ``model``. The first (only) prediction is returned.
    """
    single_row = pd.DataFrame([input_features])
    model_input = pipeline.transform(single_row)
    return model.predict(model_input)[0]
|