files necessary for running k means inference on a user prompt

Browse files

Files changed (3) hide show

GPT_128k_means_model.joblib +3 -0
cluster_inference.py +185 -0
mbid_GPT_128_clusters.pickle +3 -0

GPT_128k_means_model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:377a377d10735695ba3f923def032b65ad62e0c5eaf960cccb55b310aa123fe1
+size 465849

cluster_inference.py ADDED Viewed

	@@ -0,0 +1,185 @@

+"""
+we want to be able to assign a small user text entry to one of our clusters.
+"""
+import joblib
+import pickle
+from transformers import GPT2Tokenizer, GPT2Model
+import torch
+### inference demo
+# load the GPT model
+GPT_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding=True)
+GPT_tokenizer.pad_token = '[PAD]'
+GPT_model = GPT2Model.from_pretrained('gpt2')
+# set some user example
+user_example = "we are looking to make some music! please point us to a lovely cluster where we can hear lovely sounds. I like the cranberries."
+# tokenize the input
+encoded_input = GPT_tokenizer(user_example, return_tensors="pt", padding=True, truncation=True)
+# generate the embeddings
+with torch.no_grad():
+    # get outputs from GPT model
+    outputs = GPT_model(**encoded_input)
+    # get the [CLS] (classification) token for sequence representation
+    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
+# load the kmeans model
+kmeans_model = joblib.load('GPT_128k_means_model.joblib')
+# do inference
+example_cluster = kmeans_model.predict(cls_embedding)
+print(example_cluster)
+from collections import Counter
+with open('mbid_GPT_128_clusters.pickle', 'rb') as f:
+    mbid_clusters = pickle.load(f)
+print(type(mbid_clusters))
+# print(mbid_clusters)
+sample_mbid = 'bd57a71ece2912664f5e267166a2a1fb'
+cluster_assignment = mbid_clusters.get(sample_mbid)
+print(cluster_assignment)
+# cluster_distribution = Counter(mbid_clusters.values())
+# print(cluster_distribution)
+# # Load the KMeans model
+# kmeans_model = joblib.load('GPT_512k_means_model.joblib')
+# # Load the cluster assignments from the pickle file
+# with open('mbid_GPT_512_clusters.pickle', 'rb') as f:
+#     mbid_clusters = pickle.load(f)
+# # Now you can access the KMeans model and cluster assignments
+# # For example, to get the cluster assignments for a specific mbid:
+# sample_mbid = '2a0a712b4b00f3df2d4fa50fe21f43cb'
+# cluster_assignment = mbid_clusters.get(sample_mbid)
+# # To get the distribution of clusters
+# from collections import Counter
+# cluster_distribution = Counter(mbid_clusters.values())
+# # print(cluster_distribution)
+# # To check if each article is assigned a cluster
+# total_articles = len(mbid_clusters)
+# articles_with_cluster = sum(1 for cluster in mbid_clusters.values() if cluster is not None)
+# print(f"Total articles: {total_articles}")
+# print(f"Articles with assigned clusters: {articles_with_cluster}")
+# # To check different clusters
+# # Replace 'cluster_number' with the cluster number you want to inspect
+# cluster_number = 0
+# articles_in_cluster = [mbid for mbid, cluster in mbid_clusters.items() if cluster == cluster_number]
+# #print(f"Articles in cluster {cluster_number}: {articles_in_cluster}")
+# # for cluster in mbid_clusters:
+# import joblib
+# import numpy as np
+# # vectorizer
+# from sklearn.feature_extraction.text import HashingVectorizer
+# # load cluster data pickle file, kmeans model, and vectorizer model
+# clusters = joblib.load("clusters_data.pickle")
+# vectorizer = joblib.load("vectorizer.joblib")
+# kmeans = joblib.load("best_kmeans_model.joblib")
+# # an example to try
+# user_example = ["make me and my friends a cool song!"]
+# # vectorize user example
+# vectorized_example = vectorizer.transform(user_example)
+# print(vectorized_example)
+# # assign a cluster: result is cluster 497
+# example_cluster = kmeans.predict(vectorized_example)
+# print(example_cluster)
+# # print(type(clusters[497]))
+# # print(len(clusters[497]))
+# # print(clusters[497][1])
+# # Get the number of data points assigned to each cluster
+# num_assigned = [len(cluster_data) for cluster_data in clusters.values()]
+# # Compute mean and standard deviation of the number of data points per cluster
+# mean_assigned = np.mean(num_assigned)
+# std_assigned = np.std(num_assigned)
+# print(f"Mean number of data points per cluster: {mean_assigned}")
+# print(f"Standard deviation of number of data points per cluster: {std_assigned}")
+# # Mean number of data points per cluster: 9.694656488549619
+# # Standard deviation of number of data points per cluster: 21.820754225240147
+# # get a view of some of the clusters
+# num_samples = 3
+# # # Print a short version of some clusters
+# # for cluster_label, cluster_data in clusters.items():
+# #     print(f"Cluster {cluster_label}:")
+# #     for i, (mbid, text) in enumerate(cluster_data[:num_samples], 1):
+# #         print(f"Sample {i}: {text[:100]}...")  # Print only the first 100 characters of each text
+# #     print()  # Add a blank line between clusters

mbid_GPT_128_clusters.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b86a2b4e34a33765fa38f6b840c565909d1c71884c358974de60afd7cc2225f9
+size 3287814