File size: 5,884 Bytes
e0d476d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import json
import os
from pathlib import Path
import torch
from docarray.index import InMemoryExactNNIndex
from common.repo_doc import RepoDoc
import random
from torchmetrics.classification import Accuracy, Precision, Recall, F1Score, AUROC
from tqdm import tqdm
# Data locations, resolved relative to this file.
# NOTE: the original used Windows-only literals ("..\\data\\") which produce a
# single bogus filename component on POSIX; joinpath with separate components
# is portable and equivalent on Windows.
INDEX_PATH = Path(__file__).parent.joinpath("..", "data")
TOPIC_CLUSTER_PATH = Path(__file__).parent.joinpath("..", "data", "repo_topic_clusters.json")
CODE_CLUSTER_PATH = Path(__file__).parent.joinpath("..", "data", "repo_code_clusters.json")
def read_repo_cluster(filename):
    """Load and return the repo-name -> cluster-id mapping stored as JSON in *filename*."""
    with open(filename, encoding='utf-8') as fp:
        return json.load(fp)
def find_files_in_directory(directory):
    """Return paths of all index shard files in *directory*.

    A shard file is named "index" followed by at least one character that is
    not a dot (e.g. "index1.bin"); plain "index.<ext>" files are skipped.

    Bug fix: the original tested ``file[5] != "."`` unguarded, which raised
    IndexError for a file named exactly "index" (length 5).
    """
    files = []
    for file in os.listdir(directory):
        if len(file) > 5 and file.startswith("index") and file[5] != ".":
            files.append(os.path.join(directory, file))
    return files
def read_repo_embedding():
    """Return a repo-name -> repository_embedding dict gathered from every
    index shard under INDEX_PATH.

    Fix: the original stored results in a variable named ``map``, shadowing
    the builtin; renamed to ``embeddings``.
    """
    embeddings = {}
    for filename in find_files_in_directory(INDEX_PATH):
        # NOTE(review): filename already contains INDEX_PATH (itself anchored at
        # this file's parent), so the extra joinpath is a no-op for absolute
        # paths — kept to preserve the original behavior exactly.
        data = InMemoryExactNNIndex[RepoDoc](index_file_path=Path(__file__).parent.joinpath(filename))
        # ``_docs`` is a private docarray attribute — presumably stable for the
        # pinned version, but verify on upgrade.
        for doc in data._docs:
            embeddings[doc.name] = doc.repository_embedding
    return embeddings
def build_cluster_repo_embedding(mode: str):
    """
    Assemble the dataset records {name, embedding, id} for every repo,
    where the cluster id comes from the "code" or "topic" clustering
    selected by *mode*.
    """
    # Embeddings are loaded first (matching the original call order), then the
    # requested clustering is resolved.
    embedding = read_repo_embedding()
    if mode == "code":
        cluster_id = read_repo_cluster(CODE_CLUSTER_PATH)
    elif mode == "topic":
        cluster_id = read_repo_cluster(TOPIC_CLUSTER_PATH)
    else:
        raise ValueError("parameter 'mode' must be 'code' or 'topic'")
    return [
        {'name': name, 'embedding': emb, 'id': cluster_id[name]}
        for name, emb in embedding.items()
    ]
def build_dataset(data, ratio=0.7):
    """
    Build balanced train/test sets of (index1, index2, label) triples, where
    the label is (1.0, 0.0) for "same cluster" and (0.0, 1.0) otherwise.

    Bug fix: the original iterated ``for j in range(i, n)`` and appended both
    (i, j) and (j, i), so every self-pair (i, i) was added TWICE — duplicated
    identical positive examples. Self-pairs are now added exactly once and
    distinct pairs keep both orderings.

    :param data: list of dicts with at least an 'id' key (cluster id)
    :param ratio: fraction of each class placed in the train set
    :return: (train_set, test_set), each shuffled
    """
    positive_pairs = []
    negative_pairs = []
    n = len(data)
    for i in range(n):
        # A repo is trivially in the same cluster as itself: add once.
        positive_pairs.append((i, i, (1.0, 0.0)))
        for j in range(i + 1, n):
            if data[i]['id'] == data[j]['id']:
                positive_pairs.append((i, j, (1.0, 0.0)))
                positive_pairs.append((j, i, (1.0, 0.0)))
            else:
                negative_pairs.append((i, j, (0.0, 1.0)))
                negative_pairs.append((j, i, (0.0, 1.0)))
    # Balance classes: resample negatives (with replacement) to match positives.
    positive_length = len(positive_pairs)
    negative_pairs = random.choices(negative_pairs, k=positive_length)
    # Shuffle within each class, then split by ratio.
    random.shuffle(positive_pairs)
    random.shuffle(negative_pairs)
    split_index = int(positive_length * ratio)
    train_set = positive_pairs[:split_index] + negative_pairs[:split_index]
    random.shuffle(train_set)
    test_set = positive_pairs[split_index:] + negative_pairs[split_index:]
    random.shuffle(test_set)
    print("Positive data:", len(positive_pairs))
    print("Negative data:", len(negative_pairs))
    return train_set, test_set
def train_epoch(epoch, model, loader, device, criterion, optimizer):
    """Run one optimization pass over *loader*, updating *model* in place and
    printing the per-sample train loss plus binary classification metrics."""
    model.train()
    # torchmetrics accumulators; .to(device) returns the metric itself.
    meters = [
        Accuracy(task='binary'),
        Precision(task='binary'),
        Recall(task='binary'),
        F1Score(task='binary'),
        AUROC(task='binary'),
    ]
    meters = [m.to(device) for m in meters]
    running_loss = 0
    seen = 0
    for left, right, target in tqdm(loader):
        seen += len(target)
        optimizer.zero_grad()
        left = left.to(device)
        right = right.to(device)
        target = target.to(device)
        output = model(left, right)
        batch_loss = criterion(output, target)
        batch_loss.backward()
        running_loss += batch_loss.item()
        optimizer.step()
        # Accumulate every metric on this batch's raw predictions.
        for meter in meters:
            meter(output, target)
    acc, prec, rec, f1_score, auc = (m.compute().item() for m in meters)
    print("Epoch", epoch, "Train loss:", running_loss / seen, "Acc", acc, "Precision:",
          prec, "Recall:", rec, "F1:", f1_score,
          "AUROC:", auc)
def evaluate(model, loader, device, criterion):
    """Evaluate *model* on *loader* without gradient tracking.

    Prints the per-sample test loss and binary metrics, and returns the tuple
    (accuracy, avg_loss, precision, recall, f1, auroc).
    """
    model.eval()
    with torch.no_grad():
        # torchmetrics accumulators; .to(device) returns the metric itself.
        meters = [
            Accuracy(task='binary'),
            Precision(task='binary'),
            Recall(task='binary'),
            F1Score(task='binary'),
            AUROC(task='binary'),
        ]
        meters = [m.to(device) for m in meters]
        running_loss = 0
        seen = 0
        for left, right, target in tqdm(loader):
            seen += len(target)
            left = left.to(device)
            right = right.to(device)
            target = target.to(device)
            output = model(left, right)
            running_loss += criterion(output, target).item()
            for meter in meters:
                meter(output, target)
        acc, prec, rec, f1_score, auc = (m.compute().item() for m in meters)
        avg_loss = running_loss / seen
        print("Test loss:", avg_loss, "Acc", acc, "Precision:",
              prec, "Recall:", rec, "F1:", f1_score,
              "AUROC:", auc)
        return acc, avg_loss, prec, rec, f1_score, auc
def calculate_similarity(model, repo_emb1, repo_emb2):
    """Return a symmetrized similarity distribution for two repo embeddings:
    the model is applied in both argument orders, the logits summed, and the
    result softmax-normalized along dim 1."""
    forward_logits = model(repo_emb1, repo_emb2)
    backward_logits = model(repo_emb2, repo_emb1)
    return torch.softmax(forward_logits + backward_logits, dim=1)
|