7sugiwa committed on
Commit
0ec3d9b
1 Parent(s): e24448f

Upload 7 files

Browse files
full_pipeline_with_unit_price.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0d7e033c7f305bfd4da107e53afd53a73c5ab9af4a335fd39bc5bad88e546c4
3
- size 37369
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b44c045125a5dc057c0b560b07ef9db20adefc79884a33ffeffd239c8c03b2d4
3
+ size 34361
prediction.py CHANGED
@@ -1,9 +1,18 @@
1
- import joblib
2
  import pandas as pd
3
 
4
# Load the fitted preprocessing pipeline and the trained model from disk.
# Both paths are relative, so they resolve against the current working directory.
pipeline, model = (
    joblib.load(artifact_path)
    for artifact_path in ('full_pipeline_with_unit_price.pkl', 'best_model.pkl')
)
 
 
 
 
 
 
 
 
 
7
 
8
  def make_prediction(input_features):
9
  # Assuming input_features is a DataFrame with the correct structure
 
1
+ import pickle
2
  import pandas as pd
3
 
4
# Deserialize the three artifacts this module uses: the full pipeline, the
# preprocessor and the model. Paths are relative to the working directory.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): if these pickles contain the custom transformers from
# transformers.py, that module must be importable here - confirm.
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


pipeline = _load_pickle('full_pipeline_with_unit_price.pkl')
preprocessor = _load_pickle('preprocessor.pkl')
model = _load_pickle('best_model.pkl')
16
 
17
  def make_prediction(input_features):
18
  # Assuming input_features is a DataFrame with the correct structure
preprocessor.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52dec9ecf963a4d8a050f58ee02438db62121bb9a71419e85d067b0957429f0b
3
- size 4979
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac46af803735f9b18167add1df884ad73ac9c35f7fc0e2784cc6b3e389145113
3
+ size 2779
transformers.py CHANGED
@@ -1,10 +1,52 @@
1
  from sklearn.base import BaseEstimator, TransformerMixin
 
 
 
2
 
3
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Add a ``unit_price`` column computed as ``sales / quantity``.

    The transformer is stateless: ``fit`` learns nothing and returns ``self``.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``unit_price`` column added.

        ``X`` must provide ``sales`` and ``quantity`` columns.

        NOTE(review): a zero ``quantity`` is not guarded against here; confirm
        that upstream data excludes it.
        """
        # Work on a copy so the caller's DataFrame is not modified as a side
        # effect of running the transformer.
        X = X.copy()
        X['unit_price'] = X['sales'] / X['quantity']
        return X
10
- # Include other custom transformers as needed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from sklearn.base import BaseEstimator, TransformerMixin
2
+ from sklearn.preprocessing import OneHotEncoder
3
+ from sklearn.cluster import KMeans
4
+ import pandas as pd
5
 
6
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Add a ``unit_price`` column computed as ``sales / quantity``.

    The transformer is stateless: ``fit`` learns nothing and returns ``self``.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``X``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``unit_price`` column added.

        ``X`` must provide ``sales`` and ``quantity`` columns.

        NOTE(review): a zero ``quantity`` is not guarded against here; confirm
        that upstream data excludes it.
        """
        # Work on a copy so the caller's DataFrame is not modified as a side
        # effect of running the transformer.
        X = X.copy()
        X['unit_price'] = X['sales'] / X['quantity']
        return X
13
+
14
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows on ``unit_price`` and attach a combined cluster label.

    ``fit`` trains a KMeans model on the single ``unit_price`` column.
    ``transform`` adds a ``distinct_cluster_label`` column whose values have
    the form ``"<cluster id>_<sub_category>"``.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters passed to ``KMeans``.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        # Kept so the ``kmeans`` attribute exists before fitting, as it always
        # has; ``fit`` replaces it with an estimator built from the current
        # ``n_clusters``.
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit KMeans on ``X[['unit_price']]`` and return ``self``."""
        # Rebuild the estimator here: set_params(n_clusters=...) only updates
        # the attribute, so an estimator created in __init__ would keep the
        # stale cluster count.
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        # Double brackets keep a 2-D, single-feature frame for KMeans.
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``distinct_cluster_label`` added.

        ``X`` must provide ``unit_price`` and ``sub_category`` columns.
        """
        cluster_labels = self.kmeans.predict(X[['unit_price']])

        # Copy so the caller's frame is left untouched.
        X = X.copy()
        # Build "<cluster>_<sub_category>" per row directly, without a scratch
        # column or a row-wise DataFrame.apply.
        X['distinct_cluster_label'] = [
            f"{cluster}_{sub_category}"
            for cluster, sub_category in zip(cluster_labels, X['sub_category'])
        ]
        return X
39
+
40
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``distinct_cluster_label`` and drop the source columns.

    The categories are learned in ``fit``. The encoder is created with
    ``handle_unknown='ignore'``, so labels not seen during ``fit`` do not
    raise at transform time.
    """

    def fit(self, X, y=None):
        """Fit a ``OneHotEncoder`` on ``X[['distinct_cluster_label']]``."""
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.encoder.fit(X[['distinct_cluster_label']])
        return self

    def transform(self, X):
        """Return ``X`` with one-hot columns appended and source columns removed.

        The result has a fresh 0..n-1 index. The columns
        ``distinct_cluster_label``, ``sub_category`` and ``unit_price`` are
        dropped from it.
        """
        label_frame = X[['distinct_cluster_label']]
        encoded_features = self.encoder.transform(label_frame).toarray()
        encoded_df = pd.DataFrame(
            encoded_features,
            columns=self.encoder.get_feature_names_out(['distinct_cluster_label']),
        )
        # reset_index returns a new frame, so the caller's index is preserved;
        # the positional index is needed to line up with encoded_df in concat.
        X = X.reset_index(drop=True)
        result = pd.concat([X, encoded_df], axis=1)
        # These inputs are dropped from the output once the encoding exists.
        return result.drop(
            columns=['distinct_cluster_label', 'sub_category', 'unit_price']
        )