File size: 3,338 Bytes
9b744c5
 
ac33554
9b744c5
 
ac33554
 
 
9b744c5
 
 
 
 
 
 
 
 
 
 
 
b42fea9
 
9b744c5
 
 
b42fea9
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1fc690
 
9b744c5
c1fc690
9b744c5
b42fea9
9b744c5
 
b42fea9
9b744c5
 
 
b42fea9
9b744c5
 
c1fc690
 
 
b42fea9
c1fc690
 
 
 
 
 
 
 
 
 
 
 
9b744c5
c1fc690
9b744c5
 
 
 
 
 
 
 
 
 
b42fea9
9b744c5
 
 
 
 
 
 
 
 
 
 
6b0b6fd
c1fc690
 
9b744c5
 
 
 
 
c1fc690
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import argparse

import requests
import numpy as np
from sentence_transformers import SentenceTransformer

from .defaults import OWNER, REPO, TOKEN

# Name of the sentence-transformers checkpoint used to embed query text.
model_id = "all-mpnet-base-v2"
# Built once when the module is imported and reused by get_similar_issues.
model = SentenceTransformer(model_id)


def load_embeddings():
    """
    Read the stored issue embeddings from ``issue_embeddings.npy`` in the
    current working directory and return the loaded array.
    """
    return np.load("issue_embeddings.npy")


def load_issue_information(issue_type="issue"):
    """
    Read the two JSON lookup files from the current working directory.

    Returns a tuple ``(index_lookup, issue_records)`` where the first element
    is the parsed content of ``embedding_index_to_<issue_type>.json`` and the
    second is the parsed content of ``issues_dict.json``.
    """
    index_path = f"embedding_index_to_{issue_type}.json"
    with open(index_path, "r") as index_file:
        index_lookup = json.load(index_file)

    with open("issues_dict.json", "r") as issues_file:
        issue_records = json.load(issues_file)

    return index_lookup, issue_records


def cosine_similarity(a, b):
    """
    Compute the pairwise cosine similarity between the rows of ``a`` and ``b``.

    A 1-D input is treated as a single row vector.

    Args:
        a: array of shape ``(m, d)`` or ``(d,)``.
        b: array of shape ``(n, d)`` or ``(d,)``.

    Returns:
        Array of shape ``(m, n)`` whose entry ``[i, j]`` is the cosine
        similarity between ``a[i]`` and ``b[j]``. Division by zero is not
        guarded, so a zero-norm row yields non-finite values.
    """
    if a.ndim == 1:
        a = a.reshape(1, -1)

    if b.ndim == 1:
        b = b.reshape(1, -1)

    a_norms = np.linalg.norm(a, axis=1)
    b_norms = np.linalg.norm(b, axis=1)

    # The (m, n) dot-product matrix must be scaled by |a_i| * |b_j| per entry,
    # i.e. the outer product of the norms. Multiplying the two norm vectors
    # element-wise only happens to work when ``a`` has a single row.
    return np.dot(a, b.T) / np.outer(a_norms, b_norms)



def get_issue(issue_no, token=TOKEN, owner=OWNER, repo=REPO, timeout=30):
    """
    Fetch a single issue from the GitHub REST API.

    Args:
        issue_no: number of the issue to fetch.
        token: value sent verbatim as the ``Authorization`` header.
        owner: owner of the repository.
        repo: name of the repository.
        timeout: seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: if the response status code is not 200.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_no}"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}")
    return response.json()


def get_similar_issues(issue_no, query, top_k=5, token=TOKEN, owner=OWNER, repo=REPO, issue_type="issue"):
    """
    Find the stored issues most similar to a GitHub issue or a free-text query.

    Exactly one of ``issue_no`` and ``query`` must be given. When ``issue_no``
    is given, the issue is fetched from GitHub and its title and body are used
    as the query text.

    Args:
        issue_no: number of the issue to use as the query, or ``None``.
        query: free-text query, or ``None``.
        top_k: maximum number of similar issues to return.
        token: forwarded to ``get_issue``.
        owner: forwarded to ``get_issue``.
        repo: forwarded to ``get_issue``.
        issue_type: forwarded to ``load_issue_information`` to select the
            index-to-issue mapping file.

    Returns:
        A list of at most ``top_k`` entries from the loaded issues mapping,
        ordered from most to least similar.

    Raises:
        ValueError: if both or neither of ``issue_no`` and ``query`` are given.
    """
    if issue_no is not None and query is not None:
        raise ValueError("Only one of issue_no or query can be provided")

    if issue_no is None and query is None:
        raise ValueError("One of issue_no or query must be provided")

    if issue_no is not None:
        issue = get_issue(issue_no, token=token, owner=owner, repo=repo)
        # GitHub reports an empty issue description as a null body, which
        # cannot be concatenated to a string; treat it as empty text.
        query = issue["title"] + "\n" + (issue["body"] or "")

    query_embedding = model.encode(query).reshape(1, -1)
    embeddings = load_embeddings()

    # Similarity of the single query row against every stored embedding
    similarities = cosine_similarity(query_embedding, embeddings)

    # argsort is ascending, so reverse the only row to rank best matches first
    ranked_indices = np.argsort(similarities)[0][::-1]

    embedding_index_to_issue, issues = load_issue_information(issue_type=issue_type)

    # The index mapping is loaded from JSON, so its keys are strings
    return [issues[embedding_index_to_issue[str(i)]] for i in ranked_indices[:top_k]]


if __name__ == "__main__":
    # Command-line entry point: look up similar issues for either an issue
    # number or a free-text query and print them as JSON.
    # The former empty-named positional argument was removed: argparse raises
    # on it while the parser is being built, so the script could never run.
    parser = argparse.ArgumentParser()
    parser.add_argument("--issue_no", type=int, default=None)
    parser.add_argument("--query", type=str, default=None)
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    args = parser.parse_args()
    similar_issues = get_similar_issues(**vars(args))
    # The result was previously discarded, leaving the command with no output.
    print(json.dumps(similar_issues, indent=2))