prodrec / train.py
Ishaan Shah
init
267e3a7
raw
history blame
1.3 kB
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pickle
# Load the training data and discard every row that has a missing value in
# any column (pandas' default dropna behaviour).
product_descriptions = pd.read_csv("./train.csv").dropna()

# Turn the "value" column into a TF-IDF matrix, ignoring English stop words.
# NOTE(review): "value" presumably holds the product description text --
# confirm against the train.csv schema.
vectorizer = TfidfVectorizer(stop_words="english")
X1 = vectorizer.fit_transform(product_descriptions["value"])

# Number of clusters to fit.
true_k = 10

# KMeans.fit returns the fitted estimator itself, so the fit can be chained.
# NOTE(review): n_init=1 with no random_state means cluster assignments can
# differ from run to run.
model = KMeans(
    n_clusters=true_k,
    init="k-means++",
    max_iter=100,
    n_init=1,
).fit(X1)
def show_recommendations(product):
    """Predict which cluster a product query string falls into.

    The query is vectorised with the module-level ``vectorizer`` and passed
    to the module-level ``model``.

    Args:
        product: A single query string, e.g. ``"red dress"``.

    Returns:
        The result of ``model.predict`` for the one-element batch; callers
        in this file take element ``[0]`` as the cluster index.
    """
    return model.predict(vectorizer.transform([product]))
def print_cluster(i, top_n=10):
    """Print the highest-weighted terms of cluster ``i``, one per line.

    Reads the module-level ``order_centroids`` (per-cluster feature indices
    sorted by descending centroid weight) and ``terms`` (feature index ->
    term), so it must only be called after both have been assigned.

    Args:
        i: Row index of the cluster in ``order_centroids``.
        top_n: How many leading terms to print (defaults to 10).

    Returns:
        None. The terms are written to stdout, each prefixed by one space.
    """
    for ind in order_centroids[i, :top_n]:
        # No trailing comma here: in Python 3 it would only build and
        # discard a tuple (it is a Python 2 "suppress newline" leftover).
        print(f" {terms[ind]}")
def get_cluster_terms(cluster_index, top_n=10):
    """Return the highest-weighted terms of a cluster as a list.

    Reads the module-level ``order_centroids`` (per-cluster feature indices
    sorted by descending centroid weight) and ``terms`` (feature index ->
    term), so it must only be called after both have been assigned.

    Args:
        cluster_index: Row index of the cluster in ``order_centroids``.
        top_n: Maximum number of terms to return (defaults to 10).

    Returns:
        A list of at most ``top_n`` terms, highest centroid weight first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative bound would silently slice from the end instead.
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    return [terms[ind] for ind in order_centroids[cluster_index, :top_n]]
# For every cluster, the feature indices ordered by descending centroid
# weight: argsort() sorts ascending, so each row is reversed.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# Lookup from feature index to the term it represents.
terms = vectorizer.get_feature_names_out()

# Smoke test: show the top terms of the cluster predicted for a few queries.
# print_cluster() writes to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" each time.
for sample_query in ("red dress", "water", "shoes", "cutting tool"):
    print_cluster(show_recommendations(sample_query)[0])

# Persist the fitted model and vectorizer. The context managers guarantee the
# files are flushed and closed even if pickling fails part-way.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)