"""
Assign a short user text entry to one of our precomputed GPT-2 + k-means clusters.
"""

import joblib
import pickle
from transformers import GPT2Tokenizer, GPT2Model
import torch

### inference demo

# load the GPT-2 tokenizer and model
GPT_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 has no pad token; reuse the end-of-text token so padded batches work
# (assigning a new '[PAD]' string would add a token the model has no embedding for)
GPT_tokenizer.pad_token = GPT_tokenizer.eos_token
GPT_model = GPT2Model.from_pretrained('gpt2')

# set some user example
user_example = "we are looking to make some music! please point us to a lovely cluster where we can hear lovely sounds. I like the cranberries."

# tokenize the input
encoded_input = GPT_tokenizer(user_example, return_tensors="pt", padding=True, truncation=True)

# generate the embeddings
with torch.no_grad():

    # get outputs from GPT model
    outputs = GPT_model(**encoded_input)

    # GPT-2 has no [CLS] token; this takes the first token's hidden state as
    # the sequence representation (it must match however the embeddings used
    # to train the k-means model were produced)
    cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
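
    # Hedged alternative (an assumption, not what this script documents): a
    # common sequence embedding for a decoder-only model like GPT-2 is the
    # mean of the hidden states over the non-padding tokens
    mask = encoded_input['attention_mask'].unsqueeze(-1).float()  # (1, seq_len, 1)
    mean_embedding = ((outputs.last_hidden_state * mask).sum(dim=1)
                      / mask.sum(dim=1)).numpy()                  # (1, hidden_size)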

# load the kmeans model
kmeans_model = joblib.load('GPT_128k_means_model.joblib')

# do inference
example_cluster = kmeans_model.predict(cls_embedding)
print(example_cluster)
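
# hedged batch variant (the example strings are illustrative): the tokenizer
# pads the shorter entry, which is why a pad token was set above, and k-means
# assigns the whole batch in one predict() call
more_examples = ["find me some upbeat electronic tracks",
                 "sad acoustic guitar songs please"]
batch_encoded = GPT_tokenizer(more_examples, return_tensors="pt",
                              padding=True, truncation=True)
with torch.no_grad():
    batch_embeddings = GPT_model(**batch_encoded).last_hidden_state[:, 0, :].numpy()
print(kmeans_model.predict(batch_embeddings))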



# load the saved mbid -> cluster assignments
from collections import Counter
with open('mbid_GPT_128_clusters.pickle', 'rb') as f:
    mbid_clusters = pickle.load(f)


# quick sanity checks on the pickled mapping
print(type(mbid_clusters))
# print(mbid_clusters)
sample_mbid = 'bd57a71ece2912664f5e267166a2a1fb'
cluster_assignment = mbid_clusters.get(sample_mbid)
print(cluster_assignment)
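
# hedged sketch (assumes the pickle maps mbid -> integer cluster label, as the
# .get() lookup above suggests): collect every mbid assigned to the cluster
# predicted for the user example, mirroring the commented-out 512-cluster code below
predicted_label = int(example_cluster[0])
mbids_in_cluster = [mbid for mbid, label in mbid_clusters.items()
                    if label == predicted_label]
print(f"{len(mbids_in_cluster)} mbids in predicted cluster {predicted_label}")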

# cluster_distribution = Counter(mbid_clusters.values())
# print(cluster_distribution)

# # Load the KMeans model
# kmeans_model = joblib.load('GPT_512k_means_model.joblib')

# # Load the cluster assignments from the pickle file
# with open('mbid_GPT_512_clusters.pickle', 'rb') as f:
#     mbid_clusters = pickle.load(f)

# # Now you can access the KMeans model and cluster assignments
# # For example, to get the cluster assignments for a specific mbid:
# sample_mbid = '2a0a712b4b00f3df2d4fa50fe21f43cb'
# cluster_assignment = mbid_clusters.get(sample_mbid)

# # To get the distribution of clusters
# from collections import Counter
# cluster_distribution = Counter(mbid_clusters.values())
# # print(cluster_distribution)

# # To check if each article is assigned a cluster
# total_articles = len(mbid_clusters)
# articles_with_cluster = sum(1 for cluster in mbid_clusters.values() if cluster is not None)

# print(f"Total articles: {total_articles}")
# print(f"Articles with assigned clusters: {articles_with_cluster}")

# # To check different clusters
# # Replace 'cluster_number' with the cluster number you want to inspect
# cluster_number = 0
# articles_in_cluster = [mbid for mbid, cluster in mbid_clusters.items() if cluster == cluster_number]
# #print(f"Articles in cluster {cluster_number}: {articles_in_cluster}")
# # for cluster in mbid_clusters:

# import joblib
# import numpy as np

# # vectorizer
# from sklearn.feature_extraction.text import HashingVectorizer

# # load cluster data pickle file, kmeans model, and vectorizer model
# clusters = joblib.load("clusters_data.pickle")
# vectorizer = joblib.load("vectorizer.joblib")
# kmeans = joblib.load("best_kmeans_model.joblib")

# # an example to try
# user_example = ["make me and my friends a cool song!"] 

# # vectorize user example 
# vectorized_example = vectorizer.transform(user_example)
# print(vectorized_example)

# # assign a cluster: result is cluster 497
# example_cluster = kmeans.predict(vectorized_example)
# print(example_cluster)

# # print(type(clusters[497]))
# # print(len(clusters[497]))
# # print(clusters[497][1])


# # Get the number of data points assigned to each cluster
# num_assigned = [len(cluster_data) for cluster_data in clusters.values()]

# # Compute mean and standard deviation of the number of data points per cluster
# mean_assigned = np.mean(num_assigned)
# std_assigned = np.std(num_assigned)

# print(f"Mean number of data points per cluster: {mean_assigned}")
# print(f"Standard deviation of number of data points per cluster: {std_assigned}")

# # Mean number of data points per cluster: 9.694656488549619
# # Standard deviation of number of data points per cluster: 21.820754225240147

# # get a view of some of the clusters
# num_samples = 3

# # # Print a short version of some clusters
# # for cluster_label, cluster_data in clusters.items():
# #     print(f"Cluster {cluster_label}:")
# #     for i, (mbid, text) in enumerate(cluster_data[:num_samples], 1):
# #         print(f"Sample {i}: {text[:100]}...")  # Print only the first 100 characters of each text
# #     print()  # Add a blank line between clusters