Spaces:

7sugiwa
/

profitboost

Sleeping

App Files Files Community

profitboost / transformers.py

7sugiwa

Upload 7 files

0ec3d9b verified 6 months ago

raw

history blame contribute delete

No virus

2.33 kB

	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.cluster import KMeans
	import pandas as pd

	class UnitPriceTransformer(BaseEstimator, TransformerMixin):
	    """Add a ``unit_price`` column computed as ``sales / quantity``.

	    The transformer is stateless: ``fit`` learns nothing and exists only so
	    the class can be used as a pipeline step.
	    """

	    def fit(self, X, y=None):
	        """Do nothing and return ``self``.

	        Args:
	            X: Input data; ignored.
	            y: Ignored; present for estimator API compatibility.

	        Returns:
	            This transformer instance.
	        """
	        return self

	    def transform(self, X):
	        """Return a copy of ``X`` with an added ``unit_price`` column.

	        Args:
	            X: DataFrame that provides ``sales`` and ``quantity`` columns.

	        Returns:
	            A new DataFrame holding every column of ``X`` plus
	            ``unit_price`` (``sales`` divided element-wise by ``quantity``).
	            Rows whose ``quantity`` is zero are not special-cased.
	        """
	        # Copy first so the caller's DataFrame is not modified as a side
	        # effect of transforming it.
	        X = X.copy()
	        X['unit_price'] = X['sales'] / X['quantity']
	        return X

	class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
	    """Cluster rows by ``unit_price`` and tag each row with a combined label.

	    ``fit`` trains a KMeans model on the ``unit_price`` column alone.
	    ``transform`` predicts a cluster for every row and adds a
	    ``distinct_cluster_label`` column of the form
	    ``"<cluster>_<sub_category>"``.

	    Args:
	        n_clusters: Number of clusters passed to KMeans.
	    """

	    def __init__(self, n_clusters=3):
	        self.n_clusters = n_clusters
	        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

	    def fit(self, X, y=None):
	        """Fit the KMeans model on the ``unit_price`` column.

	        Args:
	            X: DataFrame that provides a ``unit_price`` column.
	            y: Ignored; present for estimator API compatibility.

	        Returns:
	            This transformer instance.
	        """
	        # n_clusters may have been changed through set_params() after
	        # construction; push the current value onto the wrapped estimator
	        # so the change is not silently ignored.
	        self.kmeans.set_params(n_clusters=self.n_clusters)
	        # Double brackets keep a 2-D, single-feature frame for KMeans.
	        self.kmeans.fit(X[['unit_price']])
	        return self

	    def transform(self, X):
	        """Return a copy of ``X`` with a ``distinct_cluster_label`` column.

	        Args:
	            X: DataFrame that provides ``unit_price`` and ``sub_category``
	                columns.

	        Returns:
	            A new DataFrame holding every column of ``X`` plus
	            ``distinct_cluster_label``, the predicted cluster index and the
	            row's ``sub_category`` joined by an underscore.
	        """
	        cluster_labels = self.kmeans.predict(X[['unit_price']])

	        # Copy so the caller's DataFrame is left untouched.
	        X = X.copy()
	        # Build the labels in one pass over the values rather than with a
	        # row-wise DataFrame.apply and a temporary helper column.
	        X['distinct_cluster_label'] = [
	            f"{label}_{sub_category}"
	            for label, sub_category in zip(cluster_labels, X['sub_category'])
	        ]
	        return X

	class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
	    """One-hot encode ``distinct_cluster_label`` and drop the source columns.

	    The set of categories is learned in ``fit``. The encoder is created with
	    ``handle_unknown='ignore'`` so labels not seen during ``fit`` do not
	    raise at transform time.
	    """

	    def fit(self, X, y=None):
	        """Learn the categories of the ``distinct_cluster_label`` column.

	        Args:
	            X: DataFrame that provides a ``distinct_cluster_label`` column.
	            y: Ignored; present for estimator API compatibility.

	        Returns:
	            This transformer instance.
	        """
	        self.encoder = OneHotEncoder(handle_unknown='ignore')
	        self.encoder.fit(X[['distinct_cluster_label']])
	        return self

	    def transform(self, X):
	        """Replace the label column with its one-hot encoded columns.

	        Args:
	            X: DataFrame that provides ``distinct_cluster_label``,
	                ``sub_category`` and ``unit_price`` columns.

	        Returns:
	            A new DataFrame with a fresh 0..n-1 index containing the
	            remaining columns of ``X`` followed by the one-hot columns.
	            ``distinct_cluster_label``, ``sub_category`` and ``unit_price``
	            are removed.
	        """
	        encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
	        encoded_df = pd.DataFrame(
	            encoded_features,
	            columns=self.encoder.get_feature_names_out(['distinct_cluster_label']),
	        )
	        # encoded_df has a 0..n-1 index, so X needs the same index for the
	        # column-wise concat to line rows up. The non-inplace reset_index
	        # returns a new frame and leaves the caller's index intact.
	        X = X.reset_index(drop=True)
	        result = pd.concat([X, encoded_df], axis=1)
	        # These columns are consumed by the encoding and dropped from the output.
	        return result.drop(columns=['distinct_cluster_label', 'sub_category', 'unit_price'])