File size: 6,903 Bytes
cebe313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from scipy.spatial.distance import cosine
import argparse
import json
import pdb
import torch
import torch.nn.functional as F
import numpy as np
import time
from collections import OrderedDict


class TWCClustering:
    def __init__(self):
        print("In Zscore  Clustering")

    def compute_matrix(self,embeddings):
        #print("Computing similarity matrix ...)")
        embeddings= np.array(embeddings)
        start = time.time()
        vec_a = embeddings.T #vec_a shape (1024,)
        vec_a = vec_a/np.linalg.norm(vec_a,axis=0) #Norm is along axis 0 - rows
        vec_a = vec_a.T #vec_a shape becomes (,1024)
        similarity_matrix = np.inner(vec_a,vec_a)
        end = time.time()
        time_val = (end-start)*1000
        #print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f}  minutes")
        return similarity_matrix
        
    def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
        run_index = pivot_index
        picked_arr = []
        while (run_index < len(embeddings)):
            if (matrix[pivot_index][run_index] >= threshold):
                picked_arr.append(run_index)
            run_index += 1
        return picked_arr

    def update_picked_dict_arr(self,picked_dict,arr):
        for i in range(len(arr)):
            picked_dict[arr[i]] = 1

    def update_picked_dict(self,picked_dict,in_dict):
        for key in in_dict:
            picked_dict[key] = 1

    def find_pivot_subgraph(self,pivot_index,arr,matrix,threshold,strict_cluster = True):
        center_index = pivot_index
        center_score = 0
        center_dict = {}
        for i in range(len(arr)):
            node_i_index = arr[i]
            running_score = 0
            temp_dict = {}
            for j in range(len(arr)):
                node_j_index = arr[j]
                cosine_dist = matrix[node_i_index][node_j_index]
                if ((cosine_dist < threshold) and strict_cluster):
                    continue
                running_score += cosine_dist
                temp_dict[node_j_index] = cosine_dist
            if (running_score > center_score):
                center_index = node_i_index
                center_dict = temp_dict
                center_score = running_score
        sorted_d = OrderedDict(sorted(center_dict.items(), key=lambda kv: kv[1], reverse=True))
        return  {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
         

    def update_overlap_stats(self,overlap_dict,cluster_info):
        arr = list(cluster_info["neighs"].keys())
        for val in arr:
            if (val not in overlap_dict):
                overlap_dict[val] = 1
            else:
                overlap_dict[val] += 1

    def bucket_overlap(self,overlap_dict):
        bucket_dict = {}
        for key in overlap_dict:
            if (overlap_dict[key] not in bucket_dict):
                bucket_dict[overlap_dict[key]] = 1
            else:
                bucket_dict[overlap_dict[key]] += 1
        sorted_d = OrderedDict(sorted(bucket_dict.items(), key=lambda kv: kv[1], reverse=False))
        return sorted_d

    def merge_clusters(self,ref_cluster,curr_cluster):
        dup_arr = ref_cluster.copy()
        for j in range(len(curr_cluster)):
            if (curr_cluster[j] not in dup_arr):
                ref_cluster.append(curr_cluster[j]) 
                

    def non_overlapped_clustering(self,matrix,embeddings,threshold,mean,std,cluster_dict):
        picked_dict = {}
        overlap_dict = {}
        candidates = []
    
        for i in range(len(embeddings)):
            if (i in picked_dict):
                continue
            zscore = mean + threshold*std
            arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
            candidates.append(arr)
            self.update_picked_dict_arr(picked_dict,arr)
    
        # Merge arrays to create non-overlapping sets
        run_index_i = 0
        while (run_index_i < len(candidates)):
            ref_cluster = candidates[run_index_i]
            run_index_j = run_index_i + 1
            found = False
            while (run_index_j < len(candidates)): 
                curr_cluster = candidates[run_index_j]
                for k in range(len(curr_cluster)):
                    if (curr_cluster[k] in ref_cluster):
                        self.merge_clusters(ref_cluster,curr_cluster)
                        candidates.pop(run_index_j)
                        found = True
                        run_index_i = 0
                        break
                if (found):
                    break
                else:
                    run_index_j += 1
            if (not found):
                run_index_i += 1 
            
                
        zscore = mean + threshold*std
        for i in range(len(candidates)):
            arr = candidates[i]
            cluster_info = self.find_pivot_subgraph(arr[0],arr,matrix,zscore,strict_cluster = False)
            cluster_dict["clusters"].append(cluster_info)
        return  {}

    def overlapped_clustering(self,matrix,embeddings,threshold,mean,std,cluster_dict):
        picked_dict = {}
        overlap_dict = {}
    
        zscore = mean + threshold*std
        for i in range(len(embeddings)):
            if (i in picked_dict):
                continue
            arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
            cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore,strict_cluster = True)
            self.update_picked_dict(picked_dict,cluster_info["neighs"])
            self.update_overlap_stats(overlap_dict,cluster_info)
            cluster_dict["clusters"].append(cluster_info)
        sorted_d = self.bucket_overlap(overlap_dict)
        return  sorted_d
        
        
    def cluster(self,output_file,texts,embeddings,threshold,clustering_type):
        is_overlapped = True if clustering_type == "overlapped" else False
        matrix = self.compute_matrix(embeddings)
        mean = np.mean(matrix)
        std = np.std(matrix)
        zscores = []
        inc = 0
        value = mean
        while (value < 1):
            zscores.append({"threshold":inc,"cosine":round(value,2)})
            inc += 1
            value = mean + inc*std
        #print("In clustering:",round(std,2),zscores)
        cluster_dict = {}
        cluster_dict["clusters"] = []
        if (is_overlapped):
            sorted_d = self.overlapped_clustering(matrix,embeddings,threshold,mean,std,cluster_dict) 
        else:
            sorted_d = self.non_overlapped_clustering(matrix,embeddings,threshold,mean,std,cluster_dict) 
        curr_threshold = f"{threshold} (cosine:{mean+threshold*std:.2f})"
        cluster_dict["info"] ={"mean":mean,"std":std,"current_threshold":curr_threshold,"zscores":zscores,"overlap":list(sorted_d.items())}
        return cluster_dict