add all files

Files changed (10) hide show

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/apify.png filter=lfs diff=lfs merge=lfs -text

assets/apify.png ADDED Viewed

assets/demo.png ADDED Viewed

data/clothing_data_preprocessed.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/clothing_similarity_search.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/embeddings.npy ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:c69fef9a8e91493b831b17f9a05acbb171c132bfff4abf60128e29ab0b11de3e
+size 4454528

notebooks/Clothing_Similarity_Search.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

+pandas
+nltk
+transformers
+sentence-transformers
+fastapi
+numpy
+uvicorn
+gunicorn==19.9.0

utils/preprocess.py ADDED Viewed

+import pandas as pd
+import nltk
+import string
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+# Load the raw item table.
+df = pd.read_csv('data/clothing_similarity_search.csv')
+# Build one text field per item from its title and description. Missing values
+# become empty strings, and the two parts are joined with a single space so the
+# last word of the title is not fused with the first word of the description
+# when the text is tokenized later. strip() removes the separator again when
+# either part is empty.
+df["product"] = (df['title'].fillna('') + ' ' + df['description'].fillna('')).str.strip()
+# Keep only the columns used below; copy() makes an independent DataFrame so
+# adding columns later does not write into a slice of df.
+clothing_data = df[['url', 'product']].copy()
+def preprocess_text(text):
+    """Normalize a piece of free text for similarity search.
+
+    The text is tokenized, lower-cased, stripped of ASCII punctuation,
+    filtered against the English stopword list, and lemmatized.
+
+    Args:
+        text: The raw string to clean.
+
+    Returns:
+        The surviving lemmatized tokens joined by single spaces. The result
+        is an empty string when no token survives.
+
+    NOTE(review): word_tokenize, stopwords and WordNetLemmatizer presumably
+    need the NLTK 'punkt', 'stopwords' and 'wordnet' resources to be
+    downloaded beforehand -- confirm for the deployment environment.
+    """
+    # Build the punctuation-removal table once per call, not once per token.
+    strip_punctuation = str.maketrans('', '', string.punctuation)
+    stop_words = set(stopwords.words('english'))
+    lemmatizer = WordNetLemmatizer()
+
+    cleaned_tokens = []
+    for token in word_tokenize(text):
+        token = token.lower().translate(strip_punctuation)
+        # Tokens made only of punctuation collapse to '' here; drop them so
+        # they do not leave stray spaces in the joined output.
+        if not token or token in stop_words:
+            continue
+        cleaned_tokens.append(lemmatizer.lemmatize(token))
+
+    return ' '.join(cleaned_tokens)
+# Clean every combined product text with preprocess_text, keeping row order.
+preprocessed_products = [
+    preprocess_text(product_text) for product_text in clothing_data['product']
+]
+# Attach the cleaned text as a new column next to the original ones.
+clothing_data['preprocessed_product'] = preprocessed_products
+# Write the result to disk (the DataFrame index is written as the first column).
+# NOTE(review): utils/similarity.py reads 'data/clothing_data_preprocessed.csv',
+# not this file name -- confirm which path is intended.
+clothing_data.to_csv(path_or_buf='data/clothing_data_updated.csv')

utils/similarity.py ADDED Viewed

+from sentence_transformers import SentenceTransformer, util
+import pandas as pd
+import numpy as np
+# Load the three artifacts the search needs: the item table, the sentence
+# encoder identified by 'model' (presumably a local directory -- confirm), and
+# the stored embedding array.
+# NOTE(review): the embeddings are presumably row-aligned with the CSV -- confirm.
+clothing_data = pd.read_csv(filepath_or_buffer='data/clothing_data_preprocessed.csv')
+model = SentenceTransformer(model_name_or_path='model')
+embeddings = np.load(file='data/embeddings.npy')
+def get_similar_items(query, embeddings, clothing_data, top_k):
+    """Return the URLs of the items most similar to a text query.
+
+    The query is encoded with the module-level ``model``, compared with every
+    row of ``embeddings`` by cosine similarity, and the best-scoring rows are
+    looked up in ``clothing_data``.
+
+    Args:
+        query: The search text.
+        embeddings: Item embeddings to score the query against.
+            NOTE(review): assumed to hold one row per row of
+            ``clothing_data``, in the same order -- confirm.
+        clothing_data: DataFrame with a ``'url'`` column.
+        top_k: Maximum number of URLs to return. ``None`` returns every item.
+
+    Returns:
+        A list of URLs ordered from most to least similar. The list is empty
+        when ``top_k`` is zero or negative.
+    """
+    # A negative top_k would otherwise slice from the end of the ranking and
+    # return almost every item, which is never what a caller wants.
+    if top_k is not None and top_k <= 0:
+        return []
+    # Encode the query text
+    # NOTE(review): if the model runs on a GPU the query tensor may live on a
+    # different device than `embeddings` -- confirm this cannot happen here.
+    query_embedding = model.encode([query], convert_to_tensor=True)
+    # Compute similarity scores; [0] selects the scores for the single query.
+    similarity_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
+    # Sort indices based on similarity scores, best first
+    sorted_indices = similarity_scores.argsort(descending=True)
+    # Get the top-k most similar indices
+    similar_indices = sorted_indices[:top_k].cpu().numpy()
+    # The indices are row positions, so select by position (iloc). Label-based
+    # lookup would only be correct for a default 0..n-1 index.
+    similar_urls = clothing_data['url'].iloc[similar_indices].tolist()
+    return similar_urls
+# Example search: fetch the five best matches for a sample query using the
+# embeddings and item table loaded above, then print their URLs.
+query = "Men's jeans black color"
+similar_urls = get_similar_items(
+    query=query,
+    embeddings=embeddings,
+    clothing_data=clothing_data,
+    top_k=5,
+)
+print(similar_urls)