taskswithcode committed on
Commit
5fe6115
1 Parent(s): e4cf805
Files changed (3) hide show
  1. app.py +21 -14
  2. clus_app_clustypes.json +4 -0
  3. twc_clustering.py +82 -22
app.py CHANGED
@@ -103,16 +103,16 @@ def load_model(model_name,model_class,load_model_name):
103
 
104
 
105
  @st.experimental_memo
106
- def cached_compute_similarity(sentences,_model,model_name,threshold,_cluster):
107
  texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
108
- results = _cluster.cluster(None,texts,embeddings,threshold)
109
  return results
110
 
111
 
112
- def uncached_compute_similarity(sentences,_model,model_name,threshold,cluster):
113
  with st.spinner('Computing vectors for sentences'):
114
  texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
115
- results = cluster.cluster(None,texts,embeddings,threshold)
116
  #st.success("Similarity computation complete")
117
  return results
118
 
@@ -124,7 +124,7 @@ def get_model_info(model_names,model_name):
124
  return get_model_info(model_names,DEFAULT_HF_MODEL)
125
 
126
 
127
- def run_test(model_names,model_name,sentences,display_area,threshold,user_uploaded,custom_model):
128
  display_area.text("Loading model:" + model_name)
129
  #Note. model_name may get mapped to new name in the call below for custom models
130
  orig_model_name = model_name
@@ -140,10 +140,10 @@ def run_test(model_names,model_name,sentences,display_area,threshold,user_upload
140
  display_area.text("Model " + model_name + " load complete")
141
  try:
142
  if (user_uploaded):
143
- results = uncached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"])
144
  else:
145
  display_area.text("Computing vectors for sentences")
146
- results = cached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"])
147
  display_area.text("Similarity computation complete")
148
  return results
149
 
@@ -193,16 +193,19 @@ def init_session():
193
  st.session_state["model_name"] = "ss_test"
194
  st.session_state["threshold"] = 1.5
195
  st.session_state["file_name"] = "default"
 
196
  st.session_state["cluster"] = TWCClustering()
197
  else:
198
  print("Skipping init session")
199
 
200
- def app_main(app_mode,example_files,model_name_files):
201
  init_session()
202
  with open(example_files) as fp:
203
  example_file_names = json.load(fp)
204
  with open(model_name_files) as fp:
205
  model_names = json.load(fp)
 
 
206
  curr_use_case = use_case[app_mode].split(".")[0]
207
  st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
208
  st.markdown(f"<p style='font-size:14px; color: #4f4f4f; text-align: center'><i>Or compare your own model with state-of-the-art/popular models</p>", unsafe_allow_html=True)
@@ -215,7 +218,7 @@ def app_main(app_mode,example_files,model_name_files):
215
 
216
  with st.form('twc_form'):
217
 
218
- step1_line = "Step 1. Upload text file(one sentence in a line) or choose an example text file below"
219
  if (app_mode == DOC_RETRIEVAL):
220
  step1_line += ". The first line is treated as the query"
221
  uploaded_file = st.file_uploader(step1_line, type=".txt")
@@ -224,14 +227,17 @@ def app_main(app_mode,example_files,model_name_files):
224
  options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
225
  st.write("")
226
  options_arr,markdown_str = construct_model_info_for_display(model_names)
227
- selection_label = 'Step 2. Select Model'
228
  selected_model = st.selectbox(label=selection_label,
229
  options = options_arr, index=0, key = "twc_model")
230
  st.write("")
231
  custom_model_selection = st.text_input("Model not listed above? Type any Huggingface sentence embedding model name ", "",key="custom_model")
232
  hf_link_str = "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><a href='https://huggingface.co/models?pipeline_tag=sentence-similarity' target = '_blank'>List of Huggingface sentence embedding models</a><br/><br/><br/></div>"
233
  st.markdown(hf_link_str, unsafe_allow_html=True)
234
- threshold = st.number_input('Step 3. Choose a zscore threshold (number of std devs from mean)',value=st.session_state["threshold"],min_value = 0.0,step=.01)
 
 
 
235
  st.write("")
236
  submit_button = st.form_submit_button('Run')
237
 
@@ -256,7 +262,8 @@ def app_main(app_mode,example_files,model_name_files):
256
  run_model = selected_model
257
  st.session_state["model_name"] = selected_model
258
  st.session_state["threshold"] = threshold
259
- results = run_test(model_names,run_model,sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0))
 
260
  display_area.empty()
261
  with display_area.container():
262
  device = 'GPU' if torch.cuda.is_available() else 'CPU'
@@ -269,7 +276,7 @@ def app_main(app_mode,example_files,model_name_files):
269
  label="Download results as json",
270
  data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
271
  disabled = False if st.session_state["download_ready"] != None else True,
272
- file_name= (st.session_state["model_name"] + "_" + str(st.session_state["threshold"]) + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
273
  mime='text/json',
274
  key ="download"
275
  )
@@ -288,5 +295,5 @@ if __name__ == "__main__":
288
  #print("comand line input:",len(sys.argv),str(sys.argv))
289
  #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
290
  #app_main("1","sim_app_examples.json","sim_app_models.json")
291
- app_main("3","clus_app_examples.json","clus_app_models.json")
292
 
 
103
 
104
 
105
@st.experimental_memo
def cached_compute_similarity(sentences,_model,model_name,threshold,_cluster,clustering_type):
    """Embed the sentences and cluster them, memoized by Streamlit.

    Parameters prefixed with an underscore (_model, _cluster) are skipped by
    st.experimental_memo's hashing; model_name keeps the cache keyed per model.
    Returns the cluster dict produced by the clustering backend.
    """
    texts, embeddings = _model.compute_embeddings(sentences, is_file=False)
    return _cluster.cluster(None, texts, embeddings, threshold, clustering_type)
110
 
111
 
112
def uncached_compute_similarity(sentences,_model,model_name,threshold,cluster,clustering_type):
    """Embed the sentences and cluster them with no caching (fresh computation per call).

    Used for user-uploaded files, where memoizing on file content is not wanted.
    Returns the cluster dict produced by the clustering backend.
    """
    with st.spinner('Computing vectors for sentences'):
        texts, embeddings = _model.compute_embeddings(sentences, is_file=False)
        results = cluster.cluster(None, texts, embeddings, threshold, clustering_type)
        #st.success("Similarity computation complete")
    return results
118
 
 
124
  return get_model_info(model_names,DEFAULT_HF_MODEL)
125
 
126
 
127
+ def run_test(model_names,model_name,sentences,display_area,threshold,user_uploaded,custom_model,clustering_type):
128
  display_area.text("Loading model:" + model_name)
129
  #Note. model_name may get mapped to new name in the call below for custom models
130
  orig_model_name = model_name
 
140
  display_area.text("Model " + model_name + " load complete")
141
  try:
142
  if (user_uploaded):
143
+ results = uncached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
144
  else:
145
  display_area.text("Computing vectors for sentences")
146
+ results = cached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"],clustering_type)
147
  display_area.text("Similarity computation complete")
148
  return results
149
 
 
193
  st.session_state["model_name"] = "ss_test"
194
  st.session_state["threshold"] = 1.5
195
  st.session_state["file_name"] = "default"
196
+ st.session_state["overlapped"] = "overlapped"
197
  st.session_state["cluster"] = TWCClustering()
198
  else:
199
  print("Skipping init session")
200
 
201
+ def app_main(app_mode,example_files,model_name_files,clus_types):
202
  init_session()
203
  with open(example_files) as fp:
204
  example_file_names = json.load(fp)
205
  with open(model_name_files) as fp:
206
  model_names = json.load(fp)
207
+ with open(clus_types) as fp:
208
+ cluster_types = json.load(fp)
209
  curr_use_case = use_case[app_mode].split(".")[0]
210
  st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
211
  st.markdown(f"<p style='font-size:14px; color: #4f4f4f; text-align: center'><i>Or compare your own model with state-of-the-art/popular models</p>", unsafe_allow_html=True)
 
218
 
219
  with st.form('twc_form'):
220
 
221
+ step1_line = "Upload text file(one sentence in a line) or choose an example text file below"
222
  if (app_mode == DOC_RETRIEVAL):
223
  step1_line += ". The first line is treated as the query"
224
  uploaded_file = st.file_uploader(step1_line, type=".txt")
 
227
  options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
228
  st.write("")
229
  options_arr,markdown_str = construct_model_info_for_display(model_names)
230
+ selection_label = 'Select Model'
231
  selected_model = st.selectbox(label=selection_label,
232
  options = options_arr, index=0, key = "twc_model")
233
  st.write("")
234
  custom_model_selection = st.text_input("Model not listed above? Type any Huggingface sentence embedding model name ", "",key="custom_model")
235
  hf_link_str = "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><a href='https://huggingface.co/models?pipeline_tag=sentence-similarity' target = '_blank'>List of Huggingface sentence embedding models</a><br/><br/><br/></div>"
236
  st.markdown(hf_link_str, unsafe_allow_html=True)
237
+ threshold = st.number_input('Choose a zscore threshold (number of std devs from mean)',value=st.session_state["threshold"],min_value = 0.0,step=.01)
238
+ st.write("")
239
+ clustering_type = st.selectbox(label=f'Select type of clustering',
240
+ options = list(dict.keys(cluster_types)), index=0, key = "twc_cluster_types")
241
  st.write("")
242
  submit_button = st.form_submit_button('Run')
243
 
 
262
  run_model = selected_model
263
  st.session_state["model_name"] = selected_model
264
  st.session_state["threshold"] = threshold
265
+ st.session_state["overlapped"] = cluster_types[clustering_type]["type"]
266
+ results = run_test(model_names,run_model,sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0),cluster_types[clustering_type]["type"])
267
  display_area.empty()
268
  with display_area.container():
269
  device = 'GPU' if torch.cuda.is_available() else 'CPU'
 
276
  label="Download results as json",
277
  data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
278
  disabled = False if st.session_state["download_ready"] != None else True,
279
+ file_name= (st.session_state["model_name"] + "_" + str(st.session_state["threshold"]) + "_" + st.session_state["overlapped"] + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
280
  mime='text/json',
281
  key ="download"
282
  )
 
295
  #print("comand line input:",len(sys.argv),str(sys.argv))
296
  #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
297
  #app_main("1","sim_app_examples.json","sim_app_models.json")
298
+ app_main("3","clus_app_examples.json","clus_app_models.json","clus_app_clustypes.json")
299
 
clus_app_clustypes.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "Overlapped clustering (cluster size determined by zscore)": {"type":"overlapped"},
3
+ "Non-overlapped clustering (overlapped clusters aggregated)":{"type":"non-overlapped"}
4
+ }
twc_clustering.py CHANGED
@@ -31,27 +31,30 @@ class TWCClustering:
31
  picked_arr = []
32
  while (run_index < len(embeddings)):
33
  if (matrix[pivot_index][run_index] >= threshold):
34
- #picked_arr.append({"index":run_index,"val":matrix[pivot_index][run_index]})
35
- picked_arr.append({"index":run_index})
36
  run_index += 1
37
  return picked_arr
38
 
 
 
 
 
39
  def update_picked_dict(self,picked_dict,in_dict):
40
  for key in in_dict:
41
  picked_dict[key] = 1
42
 
43
- def find_pivot_subgraph(self,pivot_index,arr,matrix,threshold):
44
  center_index = pivot_index
45
  center_score = 0
46
  center_dict = {}
47
  for i in range(len(arr)):
48
- node_i_index = arr[i]["index"]
49
  running_score = 0
50
  temp_dict = {}
51
  for j in range(len(arr)):
52
- node_j_index = arr[j]["index"]
53
  cosine_dist = matrix[node_i_index][node_j_index]
54
- if (cosine_dist < threshold):
55
  continue
56
  running_score += cosine_dist
57
  temp_dict[node_j_index] = cosine_dist
@@ -80,8 +83,76 @@ class TWCClustering:
80
  bucket_dict[overlap_dict[key]] += 1
81
  sorted_d = OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1], reverse=False))
82
  return sorted_d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- def cluster(self,output_file,texts,embeddings,threshold = 1.5):
 
85
  matrix = self.compute_matrix(embeddings)
86
  mean = np.mean(matrix)
87
  std = np.std(matrix)
@@ -95,22 +166,11 @@ class TWCClustering:
95
  #print("In clustering:",round(std,2),zscores)
96
  cluster_dict = {}
97
  cluster_dict["clusters"] = []
98
- picked_dict = {}
99
- overlap_dict = {}
100
-
101
- for i in range(len(embeddings)):
102
- if (i in picked_dict):
103
- continue
104
- zscore = mean + threshold*std
105
- arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
106
- cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
107
- self.update_picked_dict(picked_dict,cluster_info["neighs"])
108
- self.update_overlap_stats(overlap_dict,cluster_info)
109
- cluster_dict["clusters"].append(cluster_info)
110
  curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
111
- sorted_d = OrderedDict(sorted(overlap_dict.items(), key=lambda kv: kv[1], reverse=True))
112
- #print(sorted_d)
113
- sorted_d = self.bucket_overlap(overlap_dict)
114
  cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
115
  return cluster_dict
116
 
 
31
  picked_arr = []
32
  while (run_index < len(embeddings)):
33
  if (matrix[pivot_index][run_index] >= threshold):
34
+ picked_arr.append(run_index)
 
35
  run_index += 1
36
  return picked_arr
37
 
38
def update_picked_dict_arr(self, picked_dict, arr):
    """Mark every index in arr as already assigned to a candidate cluster (value 1)."""
    picked_dict.update((member, 1) for member in arr)
41
+
42
def update_picked_dict(self, picked_dict, in_dict):
    """Mark every key of in_dict (a cluster's neighbor map) as already clustered (value 1)."""
    picked_dict.update(dict.fromkeys(in_dict, 1))
45
 
46
+ def find_pivot_subgraph(self,pivot_index,arr,matrix,threshold,strict_cluster = True):
47
  center_index = pivot_index
48
  center_score = 0
49
  center_dict = {}
50
  for i in range(len(arr)):
51
+ node_i_index = arr[i]
52
  running_score = 0
53
  temp_dict = {}
54
  for j in range(len(arr)):
55
+ node_j_index = arr[j]
56
  cosine_dist = matrix[node_i_index][node_j_index]
57
+ if ((cosine_dist < threshold) and strict_cluster):
58
  continue
59
  running_score += cosine_dist
60
  temp_dict[node_j_index] = cosine_dist
 
83
  bucket_dict[overlap_dict[key]] += 1
84
  sorted_d = OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1], reverse=False))
85
  return sorted_d
86
+
87
def merge_clusters(self, ref_cluster, curr_cluster):
    """In-place union: append to ref_cluster every curr_cluster member that was
    absent from ref_cluster *before* the merge started.

    Membership is tested against a snapshot taken up front, so elements
    duplicated within curr_cluster are each appended (matching the original
    snapshot semantics). Members are sentence indices (ints), hence hashable.
    """
    snapshot = set(ref_cluster)
    ref_cluster.extend(m for m in curr_cluster if m not in snapshot)
92
+
93
+
94
def non_overlapped_clustering(self,matrix,embeddings,threshold,mean,std,cluster_dict):
    """Build disjoint (non-overlapping) clusters and append them to cluster_dict["clusters"].

    Phase 1 gathers a candidate neighbor list for every sentence index not yet
    absorbed into an earlier candidate. Phase 2 repeatedly merges any two
    candidate lists that share a member until all lists are pairwise disjoint.
    Returns an empty dict: overlap statistics do not apply in this mode
    (the caller stores the return value as the "overlap" info).
    """
    picked_dict = {}   # indices already absorbed into some candidate list
    overlap_dict = {}  # NOTE(review): never populated here; kept for symmetry with overlapped_clustering
    candidates = []

    for i in range(len(embeddings)):
        if (i in picked_dict):
            continue
        # cosine cutoff derived from the zscore threshold (loop-invariant)
        zscore = mean + threshold*std
        arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
        candidates.append(arr)
        self.update_picked_dict_arr(picked_dict,arr)

    # Merge arrays to create non-overlapping sets
    run_index_i = 0
    while (run_index_i < len(candidates)):
        ref_cluster = candidates[run_index_i]
        run_index_j = run_index_i + 1
        found = False
        while (run_index_j < len(candidates)):
            curr_cluster = candidates[run_index_j]
            for k in range(len(curr_cluster)):
                if (curr_cluster[k] in ref_cluster):
                    # Shared member: fold curr_cluster into ref_cluster, drop it,
                    # and restart the outer scan from 0 — the merged list may now
                    # intersect candidates that were already checked.
                    self.merge_clusters(ref_cluster,curr_cluster)
                    candidates.pop(run_index_j)
                    found = True
                    run_index_i = 0
                    break
            if (found):
                break
            else:
                run_index_j += 1
        if (not found):
            # ref_cluster is disjoint from everything after it; advance.
            run_index_i += 1


    zscore = mean + threshold*std
    for i in range(len(candidates)):
        arr = candidates[i]
        # strict_cluster=False: keep every merged member even if its cosine
        # similarity to the chosen pivot falls below the cutoff.
        cluster_info = self.find_pivot_subgraph(arr[0],arr,matrix,zscore,strict_cluster = False)
        cluster_dict["clusters"].append(cluster_info)
    return {}
136
+
137
def overlapped_clustering(self,matrix,embeddings,threshold,mean,std,cluster_dict):
    """Greedy overlapped clustering: each not-yet-assigned sentence seeds a
    cluster of all sentences whose cosine similarity to it clears the cutoff.
    Clusters may share members. Each cluster is appended to
    cluster_dict["clusters"]; returns bucketed overlap statistics
    (OrderedDict of membership-count -> frequency) from bucket_overlap.
    """
    assigned = {}
    membership_counts = {}
    cutoff = mean + threshold * std  # zscore threshold mapped onto the cosine scale
    for pivot in range(len(embeddings)):
        if pivot in assigned:
            continue
        neighbors = self.get_terms_above_threshold(matrix, embeddings, pivot, cutoff)
        info = self.find_pivot_subgraph(pivot, neighbors, matrix, cutoff, strict_cluster=True)
        self.update_picked_dict(assigned, info["neighs"])
        self.update_overlap_stats(membership_counts, info)
        cluster_dict["clusters"].append(info)
    return self.bucket_overlap(membership_counts)
152
+
153
 
154
+ def cluster(self,output_file,texts,embeddings,threshold,clustering_type):
155
+ is_overlapped = True if clustering_type == "overlapped" else False
156
  matrix = self.compute_matrix(embeddings)
157
  mean = np.mean(matrix)
158
  std = np.std(matrix)
 
166
  #print("In clustering:",round(std,2),zscores)
167
  cluster_dict = {}
168
  cluster_dict["clusters"] = []
169
+ if (is_overlapped):
170
+ sorted_d = self.overlapped_clustering(matrix,embeddings,threshold,mean,std,cluster_dict)
171
+ else:
172
+ sorted_d = self.non_overlapped_clustering(matrix,embeddings,threshold,mean,std,cluster_dict)
 
 
 
 
 
 
 
 
173
  curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
 
 
 
174
  cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
175
  return cluster_dict
176