# Description: Creates clusters from the uploaded transcripts and returns the UUIDs of the documents that are similar to a query.
# Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py

'''
Paper Citation for def normalize_adj(): 
Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif. 
"More Discriminative Sentence Embeddings via Semantic Graph Smoothing." 
arXiv preprint arXiv:2402.12890 (2024).
'''

import json
import os
import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
import src.clustering.resonate_semantic_search as SemanticSearch
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from scipy.io import loadmat, savemat
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.neighbors import kneighbors_graph

def normalize_adj(adj, lmbda=1):
    '''
    Normalize the adjacency matrix of the semantic graph.
    '''
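    # Two-pass normalization: first add lmbda-weighted self-loops and row-normalize (D^-1 * A),
    # then add self-loops again and apply symmetric normalization (D^-1/2 * A * D^-1/2).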
    adj = adj + lmbda * sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.0
    r_mat_inv = sp.diags(r_inv)
    adj = r_mat_inv.dot(adj)

    adj = adj + lmbda * sp.eye(adj.shape[0])
    adj = sp.coo_matrix(adj)
    row_sum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()


def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """
    Perform graph filtering with one of four polynomial filters: sgc, s2gc, appnp, or dgc.
    We keep nn=10, as per the paper; it sets the number of neighbors used to build the
    graph (adjacency matrix) over the feature vectors.

    **That is why we have 10 pre-existing transcripts placed in Pinecone (through the ont_time_script).
    **If you want to change the number of transcripts, you will have to change the number of neighbors.
    """
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2

    S = normalize_adj(adj, lmbda)
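    # S is the normalized adjacency of the symmetrized cosine kNN graph; it acts as the propagation operator.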
    xx = features
    yy = features.copy()
    if method in ["sgc", "s2gc"]:
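        # SGC: propagate the features through S `degree` times;
        # S2GC additionally accumulates the sum of the original features and every propagation step.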
        for _ in range(degree):
            xx = S @ xx
            yy += xx
        if method == "sgc":
            return xx
        elif method == "s2gc":
            return yy
    elif method == "appnp":
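        # APPNP-style propagation: at each step, blend the propagated features with the
        # original features using the teleport weight alpha.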
        for _ in range(degree):
            xx = (1 - alpha) * S @ xx + alpha * features
        return xx
    elif method == "dgc":
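        # DGC: diffusion-style propagation with step size t / (degree + 1).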
        k = degree + 1
        for _ in range(1, degree + 1):
            xx = (1 - t / k) * xx + (t / k) * (S @ xx)
        return xx
    else:
        raise ValueError("unrecognized filter")


def load_json_config(json_file_path="./config/config.json"):
    with open(json_file_path, "r") as file:
        data = json.load(file)
    return data


class Clustering:
    def __init__(self):
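        # OPENAI_API_KEY must already be set in the environment (e.g. via load_dotenv)
        # before this class is instantiated; otherwise the embedding calls will fail.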
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.method = "dgc"

        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
            self.create_Cluster()

        self.index = self.initialize_FAISS()

    def create_embedding(self):
        '''This function performs two tasks:
        1. embed the entire dataset, abstract_summary_data.csv
        2. save the embeddings in cluster-embedding.mat under the keys "uuid" and "text"
        '''
        data = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        json_config = load_json_config()

        text, ids = data["text"], data["uuid"]
        # embedding model
        embed = OpenAIEmbeddings(
            model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
        )
        embeddings = embed.embed_documents(text)
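        # Persist the embeddings together with their UUIDs so create_Cluster can reuse them.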
        savemat(
            "./data/embeddingFiles/cluster-embedding.mat",
            {"uuid": ids, "text": embeddings},
        )


    def create_Cluster(self):
        '''
        This function performs the following tasks:
        1. call the embedding function
        2. form clusters using the cluster-embedding.mat file
        3. save the predicted labels in cluster_data.csv
        '''
        self.create_embedding()

        data = loadmat("./data/embeddingFiles/cluster-embedding.mat")
        features1 = data["text"]

        features = graph_filtering(features1, method=self.method)
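        # Estimate the MeanShift bandwidth from the smoothed embeddings, then fit and persist the model.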
        ibandwidth = estimate_bandwidth(features, quantile=0.30, random_state=42)
        msclustering = MeanShift(bandwidth=ibandwidth, max_iter=900)
        msclustering.fit(features)
        model_path = f"./data/clusteringFiles/{self.method}_model.joblib"
        joblib.dump(msclustering, model_path)

        print("Model saved")

        df = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        df["cluster"] = msclustering.predict(features)
        df.to_csv("./data/clusteringFiles/cluster_data.csv")
        print("Cluster data saved")
        self.index = self.initialize_FAISS()



    def uuid_for_query(self, query):
        '''
        Return the UUIDs of the documents that are similar to the query, based on the clustering.
        '''
        query_cluster_label = self.index.search_query(query)
        print(f"Predicted Label : {query_cluster_label[0]}")
        df = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        filtered_uuids = df[df["cluster"] == query_cluster_label[0]]["uuid"].tolist()
        return filtered_uuids

    def initialize_FAISS(self):
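        # Build a FAISS index over the clustered summaries so a query can be mapped to the
        # cluster label of its most similar summary.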
        model = SemanticSearch.SemanticEmbedding()
        index = SemanticSearch.FaissForQuerySearch(model)
        data = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        features1 = data["text"]
        uuids = data["uuid"]
        labels = data["cluster"]
        for text, uuid, label in zip(features1, uuids, labels):
            index.add_summary(text, uuid, label)
        return index


if __name__ == "__main__":
    #load_dotenv("./config/.env")
    Clustering_obj = Clustering()
    print(
        Clustering_obj.uuid_for_query(
            "What is the goal of defining maintainability for the new diffs architecture?"
        )
    )
    print(
        Clustering_obj.uuid_for_query(
            "What was the design component for remote control?"
        )
    )