wilmerags committed on
Commit
4b206d5
·
1 Parent(s): 1ef5823

test: Experiment with reusing initial embeddings to avoid recalculation and speed up the process

Browse files
Files changed (1) hide show
  1. app.py +4 -1
app.py CHANGED
@@ -150,12 +150,15 @@ def generate_plot(
150
  continue
151
  cluster_keyword[label] = []
152
  cluster_tws = []
 
153
  for ix, obs in enumerate(encoded_labels):
154
  if obs == label:
155
  cluster_tws.append(tws_cleaned[ix])
 
156
  cluster_words = [tw.split(' ') for tw in cluster_tws]
157
  cluster_words = list(set(itertools.chain.from_iterable(cluster_words)))
158
- cluster_embeddings = embed_text(cluster_tws, model)
 
159
  cluster_embeddings_avg = np.mean(cluster_embeddings, axis=0)
160
  cluster_words_embeddings = embed_text(cluster_words, model)
161
  cluster_to_words_similarities = util.dot_score(cluster_embeddings_avg, cluster_words_embeddings)
 
150
  continue
151
  cluster_keyword[label] = []
152
  cluster_tws = []
153
+ cluster_ixs = []
154
  for ix, obs in enumerate(encoded_labels):
155
  if obs == label:
156
  cluster_tws.append(tws_cleaned[ix])
157
+ cluster_ixs.append(ix)
158
  cluster_words = [tw.split(' ') for tw in cluster_tws]
159
  cluster_words = list(set(itertools.chain.from_iterable(cluster_words)))
160
+ # cluster_embeddings = embed_text(cluster_tws, model)
161
+ cluster_embeddings = [embeddings[i] for i in ixs]
162
  cluster_embeddings_avg = np.mean(cluster_embeddings, axis=0)
163
  cluster_words_embeddings = embed_text(cluster_words, model)
164
  cluster_to_words_similarities = util.dot_score(cluster_embeddings_avg, cluster_words_embeddings)