Spaces:

jharrison27
/

connections-solver

Runtime error

App Files Files Community

jharrison27 committed on May 20, 2024

Commit

8fd248e

1 Parent(s): 66cf393

revert changes

Browse files

Files changed (1) hide show

app.py +39 -21

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import streamlit as st
 from transformers import pipeline
-from sklearn.metrics.pairwise import cosine_similarity
-from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
 from sklearn.cluster import KMeans
 import numpy as np
@@ -37,38 +35,58 @@ def embed_words(words, model_name):
     embeddings = embedder(words)
     return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
-def cluster_words(words, model_name, method):
-    embeddings = embed_words(words, model_name)
-    if method == 'Cosine Similarity':
-        # Use cosine similarity and hierarchical clustering
-        sim_matrix = cosine_similarity(embeddings)
-        Z = linkage(sim_matrix, 'average', metric='cosine')
-        labels = fcluster(Z, t=4, criterion='maxclust')
-    elif method == 'K-means':
-        # Use K-means clustering
-        kmeans = KMeans(n_clusters=4, random_state=0).fit(embeddings)
-        labels = kmeans.labels_ + 1
-    clusters = {i: [] for i in range(1, 5)}
-    for word, label in zip(words, labels):
-        clusters[label].append(word)
-    return clusters
 def display_clusters(clusters):
-    for i, words in clusters.items():
         st.markdown(f"### Group {i+1}")
         st.write(", ".join(words))
 def main():
     st.title("NYT Connections Solver")
     st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.")
-    st.write("Select an embedding model and a clustering method from the dropdown menus, then click 'Generate Clusters' to see the grouped words.")
     model_name = st.selectbox("Select Embedding Model", list(models.keys()))
-    clustering_method = st.selectbox("Select Clustering Method", ['K-means', 'Cosine Similarity'])
     if st.button("Generate Clusters"):
         with st.spinner("Generating clusters..."):
-            clusters = cluster_words(mock_words, model_name, clustering_method)
         display_clusters(clusters)
 if __name__ == "__main__":

 import streamlit as st
 from transformers import pipeline
 from sklearn.cluster import KMeans
 import numpy as np
     embeddings = embedder(words)
     return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings])
def iterative_clustering(words, model_name):
    """Peel groups of four words off ``words``, one K-means round at a time.

    Each round embeds the words that are still ungrouped, clusters them with
    K-means, caps every cluster at its first four members (in input order)
    and keeps the one cluster that ``select_most_cohesive_cluster`` ranks
    best. That cluster's words are removed and the round repeats until fewer
    than four words remain.

    Args:
        words: Words to group. The input is copied, never mutated.
        model_name: Embedding model identifier forwarded to ``embed_words``.

    Returns:
        The selected word groups, in the order they were chosen. Words left
        over once fewer than four remain are not included in the result.
    """
    from collections import Counter

    remaining_words = list(words)
    grouped_words = []
    while len(remaining_words) >= 4:
        embeddings = embed_words(remaining_words, model_name)
        # Never ask for more clusters than there are complete groups left.
        n_clusters = min(4, len(remaining_words) // 4)
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)

        # Cap each cluster at four words; extra members wait for a later round.
        clusters = {i: [] for i in range(kmeans.n_clusters)}
        for word, label in zip(remaining_words, kmeans.labels_):
            if len(clusters[label]) < 4:
                clusters[label].append(word)

        best_cluster, _ = select_most_cohesive_cluster(clusters, kmeans, embeddings)
        if best_cluster is None:
            # No complete group could be formed; stop instead of failing on
            # the ``None`` below or looping without making progress.
            break

        grouped_words.append(best_cluster)

        # Drop exactly one occurrence per selected word so that a duplicated
        # input word is not silently removed along with its twin.
        pending = Counter(best_cluster)
        still_ungrouped = []
        for word in remaining_words:
            if pending[word] > 0:
                pending[word] -= 1
            else:
                still_ungrouped.append(word)
        remaining_words = still_ungrouped
    return grouped_words
def select_most_cohesive_cluster(clusters, kmeans_model, embeddings):
    """Pick the complete four-word cluster whose words lie closest to its centroid.

    Args:
        clusters: Mapping of cluster index -> list of words kept for that
            cluster. Each list is expected to hold the first members carrying
            that label, in the same order as ``kmeans_model.labels_`` (this
            is how ``iterative_clustering`` builds it).
        kmeans_model: Fitted model exposing ``labels_`` and
            ``cluster_centers_``.
        embeddings: 2-D array-like whose rows line up with
            ``kmeans_model.labels_``.

    Returns:
        ``(best_cluster, best_idx)``: the word list and index of the cluster
        with the smallest mean Euclidean distance to its centroid, considering
        only clusters of exactly four words. ``(None, -1)`` when no cluster
        has four words.
    """
    labels = np.asarray(kmeans_model.labels_)
    embeddings = np.asarray(embeddings)
    centers = np.asarray(kmeans_model.cluster_centers_)

    min_distance = float('inf')
    best_cluster = None
    best_idx = -1
    for idx, cluster in clusters.items():
        if len(cluster) != 4:
            continue
        # Score only the rows of the words that were actually kept. A label
        # may cover more than four rows, and counting the dropped ones would
        # rate the group on words that are not part of it.
        member_rows = np.flatnonzero(labels == idx)[:len(cluster)]
        distance = np.mean(
            np.linalg.norm(embeddings[member_rows] - centers[idx], axis=1)
        )
        if distance < min_distance:
            min_distance = distance
            best_cluster = cluster
            best_idx = idx
    return best_cluster, best_idx
 def display_clusters(clusters):
     """Render each word group as a numbered heading followed by its words.

     ``clusters`` is iterated in order; the n-th group (counting from one) is
     shown under "Group n" with its words joined by ", ".
     """
     for i, group in enumerate(clusters):
         heading = f"### Group {i+1}"
         st.markdown(heading)
         st.write(", ".join(group))
 def main():
     """Build the Streamlit page: intro text, model picker and cluster output.

     Groups are computed and displayed only on a run in which the
     "Generate Clusters" button was pressed.
     """
     st.title("NYT Connections Solver")
     intro_lines = (
         "This app demonstrates solving the NYT Connections game using word embeddings and clustering.",
         "Select an embedding model from the dropdown menu and click 'Generate Clusters' to see the grouped words.",
     )
     for intro_line in intro_lines:
         st.write(intro_line)

     # Offer every key of the module-level ``models`` mapping in a dropdown.
     available_models = list(models.keys())
     chosen_model = st.selectbox("Select Embedding Model", available_models)

     # Nothing more to do until the button is pressed.
     if not st.button("Generate Clusters"):
         return

     # The spinner covers only the computation; results render after it closes.
     with st.spinner("Generating clusters..."):
         groups = iterative_clustering(mock_words, chosen_model)
     display_clusters(groups)
 if __name__ == "__main__":