"""
we want to be able to assign a small user text entry to one of our clusters.
"""
import joblib
import pickle
from collections import Counter

import torch
from transformers import GPT2Tokenizer, GPT2Model
### inference demo
# load the GPT-2 tokenizer and model
GPT_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no pad token by default; reuse the end-of-text token for padding
GPT_tokenizer.pad_token = GPT_tokenizer.eos_token
GPT_model = GPT2Model.from_pretrained('gpt2')
# set some user example
user_example = "we are looking to make some music! please point us to a lovely cluster where we can hear lovely sounds. I like the cranberries."
# tokenize the input
encoded_input = GPT_tokenizer(user_example, return_tensors="pt", padding=True, truncation=True)
# generate the embeddings
with torch.no_grad():
    # run the GPT-2 model
    outputs = GPT_model(**encoded_input)
    # GPT-2 has no [CLS] token; take the first token's hidden state
    # as the sequence representation
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
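# Aside: mean pooling over non-padding tokens is a common alternative
# sequence representation for GPT-2. Sketched here for comparison only;
# the saved k-means model below is assumed to have been fit on the
# first-token embeddings, so this vector is not fed into it.
mask = encoded_input['attention_mask'].unsqueeze(-1).float()
mean_embedding = ((outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)).numpy()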
# load the fitted k-means model
kmeans_model = joblib.load('GPT_128k_means_model.joblib')
# assign the user example to a cluster
example_cluster = kmeans_model.predict(cls_embedding)
print(f"user example assigned to cluster: {example_cluster[0]}")
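# A reusable sketch of the steps above (the helper name is illustrative,
# not part of the original script):
def assign_cluster(text):
    """Return the k-means cluster id for a short text entry."""
    enc = GPT_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = GPT_model(**enc).last_hidden_state
    # first-token representation, matching the demo above
    return int(kmeans_model.predict(hidden[:, 0, :].numpy())[0])

# e.g. assign_cluster("another short user request")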
### inspect saved cluster assignments
with open('mbid_GPT_128_clusters.pickle', 'rb') as f:
    mbid_clusters = pickle.load(f)
print(type(mbid_clusters))
# print(mbid_clusters)
# look up the cluster for a known mbid
sample_mbid = 'bd57a71ece2912664f5e267166a2a1fb'
cluster_assignment = mbid_clusters.get(sample_mbid)
print(f"cluster for {sample_mbid}: {cluster_assignment}")
# cluster_distribution = Counter(mbid_clusters.values())
# print(cluster_distribution)
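# A small sketch tying the two halves together (assumes mbid_clusters maps
# mbid -> cluster label): list a few mbids in the user example's cluster.
target = int(example_cluster[0])
matching_mbids = [mbid for mbid, cluster in mbid_clusters.items() if cluster == target]
print(f"{len(matching_mbids)} entries in cluster {target}; first 5: {matching_mbids[:5]}")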
### legacy: 512-cluster variant, kept commented out for reference
# kmeans_model = joblib.load('GPT_512k_means_model.joblib')
# # load the cluster assignments from the pickle file
# with open('mbid_GPT_512_clusters.pickle', 'rb') as f:
#     mbid_clusters = pickle.load(f)
# # get the cluster assignment for a specific mbid
# sample_mbid = '2a0a712b4b00f3df2d4fa50fe21f43cb'
# cluster_assignment = mbid_clusters.get(sample_mbid)
# # distribution of clusters
# cluster_distribution = Counter(mbid_clusters.values())
# # print(cluster_distribution)
# # check that each article is assigned a cluster
# total_articles = len(mbid_clusters)
# articles_with_cluster = sum(1 for cluster in mbid_clusters.values() if cluster is not None)
# print(f"Total articles: {total_articles}")
# print(f"Articles with assigned clusters: {articles_with_cluster}")
# # inspect a single cluster; set cluster_number to the cluster of interest
# cluster_number = 0
# articles_in_cluster = [mbid for mbid, cluster in mbid_clusters.items() if cluster == cluster_number]
# # print(f"Articles in cluster {cluster_number}: {articles_in_cluster}")
### legacy: HashingVectorizer + k-means pipeline, kept commented out for reference
# import numpy as np
# from sklearn.feature_extraction.text import HashingVectorizer
# # load cluster data, vectorizer, and k-means model
# clusters = joblib.load("clusters_data.pickle")
# vectorizer = joblib.load("vectorizer.joblib")
# kmeans = joblib.load("best_kmeans_model.joblib")
# # an example to try
# user_example = ["make me and my friends a cool song!"]
# # vectorize the user example
# vectorized_example = vectorizer.transform(user_example)
# print(vectorized_example)
# # assign a cluster: result is cluster 497
# example_cluster = kmeans.predict(vectorized_example)
# print(example_cluster)
# # print(type(clusters[497]))
# # print(len(clusters[497]))
# # print(clusters[497][1])
# # number of data points assigned to each cluster
# num_assigned = [len(cluster_data) for cluster_data in clusters.values()]
# # mean and standard deviation of cluster sizes
# mean_assigned = np.mean(num_assigned)
# std_assigned = np.std(num_assigned)
# print(f"Mean number of data points per cluster: {mean_assigned}")
# print(f"Standard deviation of number of data points per cluster: {std_assigned}")
# # observed: mean 9.694656488549619, std 21.820754225240147
# # print a short version of some clusters
# num_samples = 3
# # for cluster_label, cluster_data in clusters.items():
# #     print(f"Cluster {cluster_label}:")
# #     for i, (mbid, text) in enumerate(cluster_data[:num_samples], 1):
# #         print(f"Sample {i}: {text[:100]}...")  # only the first 100 characters of each text
# #     print()  # blank line between clusters
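# A live counterpart to the legacy cluster-size statistics above, for the
# current 128-cluster assignments (sketch; assumes hashable cluster labels):
from statistics import mean, pstdev
cluster_distribution = Counter(mbid_clusters.values())
sizes = list(cluster_distribution.values())
print(f"{len(cluster_distribution)} clusters; mean size {mean(sizes):.2f}, std {pstdev(sizes):.2f}")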