7sugiwa committed on
Commit
f844302
1 Parent(s): 0e2c5d9

Update prediction.py

Browse files
Files changed (1) hide show
  1. prediction.py +55 -1
prediction.py CHANGED
@@ -2,7 +2,61 @@
2
 
3
  import joblib
4
  import pandas as pd
5
- from transformers import UnitPriceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  # Load the preprocessing pipeline
8
  pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
 
2
 
3
  import joblib
4
  import pandas as pd
5
+ from sklearn.base import BaseEstimator, TransformerMixin
6
+ from sklearn.preprocessing import OneHotEncoder
7
+ from sklearn.cluster import KMeans
8
+ import pandas as pd
9
+ import joblib
10
+
11
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Add a ``unit_price`` column computed as ``sales / quantity``.

    The transformer is stateless: ``fit`` learns nothing from the data.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an added ``unit_price`` column.

        ``X`` must provide ``sales`` and ``quantity`` columns. Rows whose
        ``quantity`` is zero follow pandas division semantics (inf/NaN).
        """
        # Work on a copy so the caller's DataFrame is not modified as a
        # side effect of running the pipeline.
        X = X.copy()
        X['unit_price'] = X['sales'] / X['quantity']
        return X
18
+
19
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows on ``unit_price`` and label each row ``<cluster>_<sub_category>``.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters used by the wrapped ``KMeans`` model.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        # Kept under the name ``kmeans`` so previously pickled pipelines
        # still expose the attribute that ``transform`` reads.
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit the wrapped KMeans model on the ``unit_price`` column of ``X``."""
        # ``set_params(n_clusters=...)`` only updates the attribute on this
        # object; propagate it so the wrapped model honours the new value.
        self.kmeans.set_params(n_clusters=self.n_clusters)
        # Double brackets select a one-column DataFrame (2-D input).
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``distinct_cluster_label`` column.

        Each label is the predicted cluster id and the row's
        ``sub_category`` joined by an underscore, e.g. ``"1_Chairs"``.
        """
        cluster_labels = self.kmeans.predict(X[['unit_price']])

        X = X.copy()  # leave the caller's frame untouched
        # Build the labels in a single pass instead of a row-wise
        # ``DataFrame.apply`` and a temporary helper column.
        X['distinct_cluster_label'] = [
            f"{cluster}_{sub_category}"
            for cluster, sub_category in zip(cluster_labels, X['sub_category'])
        ]
        return X
44
+
45
+
46
+
47
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``distinct_cluster_label`` and drop the source columns.

    Categories are learned in ``fit``; labels not seen during fitting are
    encoded as all zeros (``handle_unknown='ignore'``).
    """

    def fit(self, X, y=None):
        """Learn the set of ``distinct_cluster_label`` categories from ``X``."""
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.encoder.fit(X[['distinct_cluster_label']])
        return self

    def transform(self, X):
        """Return ``X`` with one-hot columns appended and helper columns removed.

        The result has a fresh 0..n-1 index and no longer contains
        ``distinct_cluster_label``, ``sub_category`` or ``unit_price``.
        """
        encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
        encoded_df = pd.DataFrame(
            encoded_features,
            columns=self.encoder.get_feature_names_out(['distinct_cluster_label']),
        )
        # Align on a fresh positional index so the concat pairs rows
        # one-to-one. The non-inplace form leaves the caller's index intact.
        X = X.reset_index(drop=True)
        result = pd.concat([X, encoded_df], axis=1)
        # The encoded columns replace these inputs.
        return result.drop(columns=['distinct_cluster_label', 'sub_category', 'unit_price'])
60
 
61
  # Load the preprocessing pipeline
62
  pipeline = joblib.load('full_pipeline_with_unit_price.pkl')