jharrison27 committed
Commit: 66cf393
1 Parent(s): e2775c7

fix looping

Files changed (1)
  1. app.py +18 -33
app.py CHANGED
@@ -1,15 +1,10 @@
  import streamlit as st
- import logging
  from transformers import pipeline
  from sklearn.metrics.pairwise import cosine_similarity
  from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
  from sklearn.cluster import KMeans
  import numpy as np

- # Setting up logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
  # Mock data
  mock_words = [
      "apple", "banana", "cherry", "date",  # Fruits
@@ -27,7 +22,6 @@ models = {

  @st.cache_resource
  def load_models():
-     logger.info("Loading models...")
      pipelines = {}
      for name, model_name in models.items():
          pipelines[name] = pipeline('feature-extraction', model=model_name)
@@ -39,37 +33,28 @@ def embed_words(words, model_name):
      """
      Embed the given words using the specified model and return the averaged embeddings.
      """
-     logger.info(f"Embedding words using model {model_name}...")
      embedder = pipelines[model_name]
      embeddings = embedder(words)
      return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])

- def iterative_clustering(words, model_name, method):
-     logger.info(f"Starting iterative clustering using {method}...")
-     remaining_words = words[:]
-     grouped_words = []
-     while len(remaining_words) >= 4:
-         embeddings = embed_words(remaining_words, model_name)
-         if method == 'Cosine Similarity':
-             logger.info("Clustering using Cosine Similarity...")
-             sim_matrix = cosine_similarity(embeddings)
-             Z = linkage(sim_matrix, 'average', metric='cosine')
-             labels = fcluster(Z, t=4, criterion='maxclust')
-         elif method == 'K-means':
-             logger.info("Clustering using K-means...")
-             kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
-             labels = kmeans.labels_ + 1
-         for i in range(1, 5):
-             cluster = [word for idx, word in enumerate(remaining_words) if labels[idx] == i]
-             if len(cluster) == 4:
-                 grouped_words.append(cluster)
-                 remaining_words = [word for word in remaining_words if word not in cluster]
-                 break
-     return grouped_words
+ def cluster_words(words, model_name, method):
+     embeddings = embed_words(words, model_name)
+     if method == 'Cosine Similarity':
+         # Use cosine similarity and hierarchical clustering
+         sim_matrix = cosine_similarity(embeddings)
+         Z = linkage(sim_matrix, 'average', metric='cosine')
+         labels = fcluster(Z, t=4, criterion='maxclust')
+     elif method == 'K-means':
+         # Use K-means clustering
+         kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
+         labels = kmeans.labels_ + 1
+     clusters = {i: [] for i in range(1, 5)}
+     for word, label in zip(words, labels):
+         clusters[label].append(word)
+     return clusters

  def display_clusters(clusters):
-     logger.info("Displaying clusters...")
-     for i, words in enumerate(clusters):
+     for i, words in clusters.items():
          st.markdown(f"### Group {i+1}")
          st.write(", ".join(words))

@@ -83,8 +68,8 @@ def main():

      if st.button("Generate Clusters"):
          with st.spinner("Generating clusters..."):
-             clusters = iterative_clustering(mock_words, model_name, clustering_method)
-             display_clusters(clusters)
+             clusters = cluster_words(mock_words, model_name, clustering_method)
+             display_clusters(clusters)

  if __name__ == "__main__":
      main()
 
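The change the commit message ("fix looping") targets: the removed iterative_clustering re-embedded and re-clustered the leftover words inside a while loop and only shrank the list when a cluster of exactly four words appeared, so the loop could fail to terminate; the new cluster_words embeds once, clusters once, and buckets every word by its label. A minimal sketch of that single-pass grouping, using random vectors in place of the transformer embeddings and a shortened placeholder word list (both are illustrative assumptions, not part of the commit):

import numpy as np
from sklearn.cluster import KMeans

# Placeholder words and random 16-dim vectors standing in for embed_words() output.
words = ["apple", "banana", "cherry", "date",
         "w5", "w6", "w7", "w8"]          # second group is hypothetical filler
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(len(words), 16))

# Single clustering pass, mirroring the K-means branch of the new cluster_words().
kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(embeddings)
labels = kmeans.labels_ + 1               # shift to 1-based labels, as in the commit
clusters = {i: [] for i in range(1, 3)}
for word, label in zip(words, labels):    # bucket every word; no re-clustering loop
    clusters[label].append(word)

for group, members in clusters.items():
    print(f"Group {group}: {', '.join(members)}")

With random vectors the group contents are arbitrary; in the app the same bucketing runs over four clusters of averaged transformer feature embeddings.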