Spaces:

Henry65
/

RepoSnipy

App Files Files Community

Henry65 committed on Apr 9

Commit

e0d476d

•

1 Parent(s): 5e997c4

Update similaritycal model

Browse files

Files changed (9) hide show

.gitignore +6 -0
app.py +24 -18
assets/Repository-Code Cluster Assignments.png +0 -0
assets/Repository-Topic Cluster Assignments.png +0 -0
common/pair_classifier.py +4 -3
similarityCal/__init__.py +0 -0
data/SimilarityCal_model_NO1.pt → similarityCal/code.pt +2 -2
similarityCal/topic.pt +3 -0
similarityCal/utils.py +169 -0

.gitignore CHANGED Viewed

@@ -161,3 +161,9 @@ cython_debug/
 # Streamlit configs
 .streamlit/

 # Streamlit configs
 .streamlit/
+# IDE files
+.idea/
+# Mac os files
+*.DS_Store

app.py CHANGED Viewed

@@ -7,21 +7,22 @@ import pandas as pd
 import numpy as np
 import streamlit as st
 from pathlib import Path
-from torch import nn
 from docarray import DocList
 from docarray.index import InMemoryExactNNIndex
 from transformers import pipeline
 from transformers import AutoTokenizer, AutoModel
 from common.repo_doc import RepoDoc
-from common.pair_classifier import PairClassifier
 from nltk.stem import WordNetLemmatizer
 nltk.download("wordnet")
 KMEANS_TOPIC_MODEL_PATH = Path(__file__).parent.joinpath("data/kmeans_model_topic_scibert.pkl")
 KMEANS_CODE_MODEL_PATH = Path(__file__).parent.joinpath("data/kmeans_model_code_unixcoder.pkl")
-SIMILARITY_CAL_MODEL_PATH = Path(__file__).parent.joinpath("data/SimilarityCal_model_NO1.pt")
 SCIBERT_MODEL_PATH = "allenai/scibert_scivocab_uncased"
 # SCIBERT_MODEL_PATH = Path(__file__).parent.joinpath("data/scibert_scivocab_uncased")  # Download locally
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -136,16 +137,20 @@ def load_code_kmeans_model():
 @st.cache_resource(show_spinner="Loading SimilarityCal model...")
-def load_similaritycal_model():
     """
     The function to load SimilarityCal model
     :return: the SimilarityCal model
     """
-    sim_cal_model = PairClassifier()
-    sim_cal_model.load_state_dict(torch.load(SIMILARITY_CAL_MODEL_PATH, map_location=device))
-    sim_cal_model = sim_cal_model.to(device)
-    sim_cal_model = sim_cal_model.eval()
     return sim_cal_model
@@ -247,27 +252,27 @@ def run_similaritycal_search(index, repo_clusters, model, query_doc, query_clust
     :return: result dataframe
     """
     docs = index._docs
-    input_embeddings_list = []
     result_dl = DocList[RepoDoc]()
     for doc in docs:
         if query_cluster_number != repo_clusters[doc.name]:
             continue
         if doc.name != query_doc.name:
             e1, e2 = (torch.Tensor(query_doc.repository_embedding),
                       torch.Tensor(doc.repository_embedding))
-            input_embeddings = torch.cat([e1, e2])
-            input_embeddings_list.append(input_embeddings)
             result_dl.append(doc)
-    input_embeddings_list = torch.stack(input_embeddings_list).to(device)
-    softmax = nn.Softmax(dim=1).to(device)
-    model_output = model(input_embeddings_list)
-    similarity_scores = softmax(model_output)[:, 1].cpu().detach().numpy()
     df = result_dl.to_dataframe()
     df["scores"] = similarity_scores
     sorted_df = df.sort_values(by='scores', ascending=False).reset_index(drop=True).head(limit)
-    sorted_df["rankings"] = sorted_df["scores"].rank(ascending=False).astype(int)
     sorted_df.drop(columns="scores", inplace=True)
     return sorted_df
@@ -283,7 +288,6 @@ if __name__ == "__main__":
     tokenizer, scibert_model = load_scibert_model()
     topic_kmeans = load_topic_kmeans_model()
     code_kmeans = load_code_kmeans_model()
-    sim_cal_model = load_similaritycal_model()
     # Setting the sidebar
     with st.sidebar:
@@ -507,6 +511,7 @@ if __name__ == "__main__":
         with code_cluster_tab:
             if query_doc.repository_embedding is not None:
                 cluster_df = run_similaritycal_search(index, repo_code_clusters, sim_cal_model,
                                                       query_doc, code_cluster_number, limit)
                 code_cluster_numbers = run_code_cluster_search(repo_code_clusters, cluster_df["name"])
@@ -519,6 +524,7 @@ if __name__ == "__main__":
         with topic_cluster_tab:
             if query_doc.repository_embedding is not None:
                 cluster_df = run_similaritycal_search(index, repo_topic_clusters, sim_cal_model,
                                                       query_doc, topic_cluster_number, limit)
                 topic_cluster_numbers = run_topic_cluster_search(repo_topic_clusters, cluster_df["name"])

 import numpy as np
 import streamlit as st
 from pathlib import Path
 from docarray import DocList
 from docarray.index import InMemoryExactNNIndex
 from transformers import pipeline
 from transformers import AutoTokenizer, AutoModel
 from common.repo_doc import RepoDoc
 from nltk.stem import WordNetLemmatizer
+from similarityCal.utils import calculate_similarity
 nltk.download("wordnet")
 KMEANS_TOPIC_MODEL_PATH = Path(__file__).parent.joinpath("data/kmeans_model_topic_scibert.pkl")
 KMEANS_CODE_MODEL_PATH = Path(__file__).parent.joinpath("data/kmeans_model_code_unixcoder.pkl")
 SCIBERT_MODEL_PATH = "allenai/scibert_scivocab_uncased"
 # SCIBERT_MODEL_PATH = Path(__file__).parent.joinpath("data/scibert_scivocab_uncased")  # Download locally
 device = (
     "cuda"
     if torch.cuda.is_available()
 @st.cache_resource(show_spinner="Loading SimilarityCal model...")  # Streamlit caches the returned model, keyed by the `mode` argument
+def load_similaritycal_model(mode: str):  # Load the saved SimilarityCal model for `mode`, move it to `device` and put it in eval mode
     """
     The function to load SimilarityCal model
+    mode: 'code' or 'topic'
     :return: the SimilarityCal model
     """
+    if mode == 'topic':
+        sim_cal_model = torch.load('similarityCal/topic.pt')  # NOTE(review): path is relative to the working directory, unlike the Path(__file__)-based KMEANS_* constants; no map_location is passed, so a checkpoint saved on CUDA may fail to load on a CPU-only host — confirm
+    elif mode == 'code':
+        sim_cal_model = torch.load('similarityCal/code.pt')  # NOTE(review): same path / map_location caveats as the 'topic' branch
+    else:
+        raise ValueError("parameter 'mode' must be 'code' or 'topic'")  # any other mode is rejected before a model is loaded
+    sim_cal_model.to(device)  # `device` is the module-level "cuda"/fallback string chosen at import time
+    sim_cal_model.eval()  # inference mode: the app only scores pairs, it never trains here
     return sim_cal_model
     :return: result dataframe
     """
     docs = index._docs
     result_dl = DocList[RepoDoc]()
+    e1_list, e2_list = [], []
     for doc in docs:
         if query_cluster_number != repo_clusters[doc.name]:
             continue
         if doc.name != query_doc.name:
             e1, e2 = (torch.Tensor(query_doc.repository_embedding),
                       torch.Tensor(doc.repository_embedding))
+            e1_list.append(e1)
+            e2_list.append(e2)
             result_dl.append(doc)
+    e1_list = torch.stack(e1_list).to(device)
+    e2_list = torch.stack(e2_list).to(device)
+    model.eval()
+    similarity_scores = calculate_similarity(model, e1_list, e2_list)[:, 1].cpu().detach().numpy()
     df = result_dl.to_dataframe()
     df["scores"] = similarity_scores
     sorted_df = df.sort_values(by='scores', ascending=False).reset_index(drop=True).head(limit)
+    sorted_df["rankings"] = sorted_df["scores"].rank(ascending=False, method='first').astype(int)
     sorted_df.drop(columns="scores", inplace=True)
     return sorted_df
     tokenizer, scibert_model = load_scibert_model()
     topic_kmeans = load_topic_kmeans_model()
     code_kmeans = load_code_kmeans_model()
     # Setting the sidebar
     with st.sidebar:
         with code_cluster_tab:
             if query_doc.repository_embedding is not None:
+                sim_cal_model = load_similaritycal_model("code")
                 cluster_df = run_similaritycal_search(index, repo_code_clusters, sim_cal_model,
                                                       query_doc, code_cluster_number, limit)
                 code_cluster_numbers = run_code_cluster_search(repo_code_clusters, cluster_df["name"])
         with topic_cluster_tab:
             if query_doc.repository_embedding is not None:
+                sim_cal_model = load_similaritycal_model("topic")
                 cluster_df = run_similaritycal_search(index, repo_topic_clusters, sim_cal_model,
                                                       query_doc, topic_cluster_number, limit)
                 topic_cluster_numbers = run_topic_cluster_search(repo_topic_clusters, cluster_df["name"])

assets/Repository-Code Cluster Assignments.png CHANGED Viewed

assets/Repository-Topic Cluster Assignments.png CHANGED Viewed

common/pair_classifier.py CHANGED Viewed

@@ -29,9 +29,10 @@ class PairClassifier(nn.Module):
             nn.Linear(1000, 2),
         )
-    def forward(self, data):
-        e1 = self.encoder(data[:, :768 * 4])
-        e2 = self.encoder(data[:, 768 * 4:])
         twins = torch.cat([e1, e2], dim=1)
         res = self.net(twins)
         return res

             nn.Linear(1000, 2),
         )
+    def forward(self, data1, data2):  # Score a pair of inputs: encode each with the shared encoder, concatenate, then classify
+        # the two inputs arrive as separate tensors (the previous version sliced one concatenated tensor at 768 * 4)
+        e1 = self.encoder(data1)  # same encoder weights are used for both sides of the pair
+        e2 = self.encoder(data2)
         twins = torch.cat([e1, e2], dim=1)  # join the two encodings along dim 1, so inputs are expected to be batched
         res = self.net(twins)  # output of self.net is returned as-is; no softmax is applied here
         return res

similarityCal/__init__.py ADDED Viewed

File without changes

data/SimilarityCal_model_NO1.pt → similarityCal/code.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9146d0736261db38bb6fe6d4d6dd17797c01980be23b114af4b86a18589af632
-size 102423158

 version https://git-lfs.github.com/spec/v1
+oid sha256:4fca98b665ac3a35db1fa333b21f97d71cda5f2af27229d9e7d93b2fa8696a03
+size 102424453

similarityCal/topic.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b5481fc8c348f1784c29374cde09ad9374ad7c201e33b4748e6153c1ab4c832
+size 102424470

similarityCal/utils.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import json
+import os
+from pathlib import Path
+import torch
+from docarray.index import InMemoryExactNNIndex
+from common.repo_doc import RepoDoc
+import random
+from torchmetrics.classification import Accuracy, Precision, Recall, F1Score, AUROC
+from tqdm import tqdm
+INDEX_PATH = Path(__file__).parent.joinpath("..\\data\\")  # directory scanned for index files by read_repo_embedding; NOTE(review): "\\" is a path separator only on Windows — on POSIX it is a literal filename character, so "../data" would be the portable spelling
+TOPIC_CLUSTER_PATH = Path(__file__).parent.joinpath("..\\data\\repo_topic_clusters.json")  # JSON read by build_cluster_repo_embedding when mode == "topic"; same separator caveat
+CODE_CLUSTER_PATH = Path(__file__).parent.joinpath("..\\data\\repo_code_clusters.json")  # JSON read by build_cluster_repo_embedding when mode == "code"; same separator caveat
+def read_repo_cluster(filename):  # Parse the UTF-8 JSON file at `filename` and return the decoded object
+    # presumably a mapping of repo name -> cluster id (build_cluster_repo_embedding indexes the result by repo name) — verify against the data files
+    with open(filename, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+def find_files_in_directory(directory):  # Return `directory`-joined paths of the entries whose names start with "index"
+    # collect every directory entry that looks like an index file
+    files = []
+    for file in os.listdir(directory):
+        if file[:5] == "index" and file[5] != ".":  # names where "index" is immediately followed by "." are skipped; NOTE(review): an entry named exactly "index" raises IndexError at file[5]
+            files.append(os.path.join(directory, file))
+    return files
+def read_repo_embedding():  # Build a dict of doc.name -> doc.repository_embedding from every index file found under INDEX_PATH
+    # result maps repo name to its repository embedding
+    map = {}  # NOTE(review): shadows the builtin `map` within this function
+    for filename in find_files_in_directory(INDEX_PATH):
+        data = InMemoryExactNNIndex[RepoDoc](index_file_path=Path(__file__).parent.joinpath(filename))  # open one index file as an in-memory index of RepoDoc
+        docs_tmp = data._docs  # reads the index's private `_docs` attribute to reach the stored documents
+        for doc in docs_tmp:
+            map[doc.name] = doc.repository_embedding  # a name that appears again in a later file overwrites the earlier entry
+    return map
+def build_cluster_repo_embedding(mode: str):  # Return a list of {'name', 'embedding', 'id'} records, one per repo found in the index files
+    """
+    build the dataset according to code cluster
+    where mode is "code" or "topic"
+    """
+    embedding = read_repo_embedding()  # repo name -> embedding
+    if mode == "code":
+        cluster_id = read_repo_cluster(CODE_CLUSTER_PATH)
+    elif mode == "topic":
+        cluster_id = read_repo_cluster(TOPIC_CLUSTER_PATH)
+    else:
+        raise ValueError("parameter 'mode' must be 'code' or 'topic'")  # raised only after the embeddings have already been read
+    data = []
+    for name in embedding:
+        data.append({'name': name, 'embedding': embedding[name], 'id': cluster_id[name]})  # NOTE(review): assuming cluster_id is a dict, a repo missing from the cluster file raises KeyError here
+    return data
+def build_dataset(data, ratio=0.7):  # Build balanced, shuffled train/test lists of (index_a, index_b, one-hot label) tuples from cluster records
+    """
+    return the train set and test set which are like (index1, index2) : (same, not same)
+    """
+    positive_repo = []
+    negative_repo = []
+    n = len(data)
+    # enumerate index pairs and label each by whether the two records share a cluster id
+    for i in range(n):
+        for j in range(i, n):  # j starts at i, so every record is also paired with itself (added twice as a positive)
+            if data[i]['id'] == data[j]['id']:
+                positive_repo.append((i, j, (1.0, 0.0)))  # same cluster -> label column 0 is set; NOTE(review): app.py ranks by column 1 of calculate_similarity — confirm which column means "similar"
+                positive_repo.append((j, i, (1.0, 0.0)))  # mirrored order is added as a separate sample
+            else:
+                negative_repo.append((i, j, (0.0, 1.0)))
+                negative_repo.append((j, i, (0.0, 1.0)))
+    # balance the classes: keep as many negatives as there are positives
+    positive_length = len(positive_repo)
+    negative_repo = random.choices(negative_repo, k=positive_length)  # sampled WITH replacement, so negatives can repeat; raises IndexError if there are no negatives but at least one positive
+    # shuffle each class, then cut both at the same index so train and test stay balanced
+    random.shuffle(positive_repo)
+    random.shuffle(negative_repo)
+    split_index = int(positive_length * ratio)  # `ratio` is the fraction of each class that goes to the train set
+    train_set = positive_repo[:split_index] + negative_repo[:split_index]
+    random.shuffle(train_set)
+    test_set = positive_repo[split_index:] + negative_repo[split_index:]  # NOTE(review): (i, j) and (j, i), or a repeated negative, can land on opposite sides of the split — possible train/test leakage
+    random.shuffle(test_set)
+    print("Positive data:", len(positive_repo))
+    print("Negative data:", len(negative_repo))  # printed after balancing, so this always equals the positive count
+    return train_set, test_set
+def train_epoch(epoch, model, loader, device, criterion, optimizer):  # Run one optimisation pass over `loader` and print loss plus binary metrics; returns None
+    model.train()
+    accuracy = Accuracy(task='binary')  # fresh torchmetrics objects are created for every call, so metrics cover this epoch only
+    precision = Precision(task='binary')
+    recall = Recall(task='binary')
+    f1 = F1Score(task='binary')
+    auroc = AUROC(task='binary')
+    accuracy.to(device)  # metric state must live on the same device as the predictions passed to it
+    precision.to(device)
+    recall.to(device)
+    f1.to(device)
+    auroc.to(device)
+    total_loss = 0
+    count = 0  # number of samples seen, accumulated from len(label) per batch
+    for repo1, repo2, label in tqdm(loader):  # each batch yields the two inputs of a pair and its label
+        count += len(label)
+        optimizer.zero_grad()
+        repo1 = repo1.to(device)
+        repo2 = repo2.to(device)
+        label = label.to(device)
+        pred = model(repo1, repo2)
+        loss = criterion(pred, label)
+        loss.backward()
+        total_loss += loss.item()  # adds one scalar per batch
+        optimizer.step()
+        accuracy(pred, label)  # calling a metric updates its running state with this batch
+        precision(pred, label)
+        recall(pred, label)
+        f1(pred, label)
+        auroc(pred, label)
+    print("Epoch", epoch, "Train loss:", total_loss / count, "Acc", accuracy.compute().item(), "Precision:",  # NOTE(review): per-batch losses are divided by the sample count — if `criterion` already averages over the batch this under-reports the loss; an empty loader gives ZeroDivisionError
+          precision.compute().item(), "Recall:", recall.compute().item(), "F1:", f1.compute().item(),
+          "AUROC:", auroc.compute().item())
+def evaluate(model, loader, device, criterion):  # Score `model` on `loader` without gradients; prints and returns (accuracy, loss, precision, recall, f1, auroc)
+    model.eval()
+    with torch.no_grad():  # no gradient tracking is needed because nothing is optimised here
+        test_accuracy = Accuracy(task='binary')
+        test_precision = Precision(task='binary')
+        test_recall = Recall(task='binary')
+        test_f1 = F1Score(task='binary')
+        test_auroc = AUROC(task='binary')
+        test_accuracy.to(device)  # metric state must live on the same device as the predictions passed to it
+        test_precision.to(device)
+        test_recall.to(device)
+        test_f1.to(device)
+        test_auroc.to(device)
+        total_loss = 0
+        count = 0  # number of samples seen, accumulated from len(label) per batch
+        for repo1, repo2, label in tqdm(loader):
+            count += len(label)
+            repo1 = repo1.to(device)
+            repo2 = repo2.to(device)
+            label = label.to(device)
+            pred = model(repo1, repo2)
+            loss = criterion(pred, label)
+            total_loss += loss.item()  # adds one scalar per batch
+            test_accuracy(pred, label)  # calling a metric updates its running state with this batch
+            test_precision(pred, label)
+            test_recall(pred, label)
+            test_f1(pred, label)
+            test_auroc(pred, label)
+        print("Test loss:", total_loss / count, "Acc", test_accuracy.compute().item(), "Precision:",  # NOTE(review): per-batch losses divided by sample count, same caveat as train_epoch; an empty loader gives ZeroDivisionError
+              test_precision.compute().item(), "Recall:", test_recall.compute().item(), "F1:", test_f1.compute().item(),
+              "AUROC:", test_auroc.compute().item())
+    return test_accuracy.compute().item(), total_loss / count, test_precision.compute().item(), test_recall.compute().item(), \
+           test_f1.compute().item(), test_auroc.compute().item()  # tuple order: accuracy, loss, precision, recall, f1, auroc
+def calculate_similarity(model, repo_emb1, repo_emb2):  # Return softmax scores for a batch of pairs, made independent of argument order
+    return torch.nn.functional.softmax(model(repo_emb1, repo_emb2) + model(repo_emb2, repo_emb1), dim=1)  # model outputs for both orderings are summed before the softmax over dim 1, so swapping the two inputs gives the same result