Amy Roberts committed · Commit c1fc690
1 Parent(s): 9b744c5

Updates
- app.py +18 -53
- find_similar_issues.py +23 -6
- get_issues.py +9 -1
- get_topic.py +18 -4
- update_embeddings.py +0 -1
- update_stored_issues.py +14 -2
app.py
CHANGED
@@ -1,16 +1,8 @@
-import os
-
-import gradio as gr
-
 import gradio as gr
 from find_similar_issues import get_similar_issues
 import requests
-from html2image import Html2Image
-import io
 
-
-
-from defaults import OWNER, REPO, TOKEN
+from defaults import OWNER, REPO
 
 
 def get_query_issue_information(issue_no, token):
@@ -30,35 +22,17 @@ def get_query_issue_information(issue_no, token):
     return request.json()
 
 
-def
-
-
-
-
-
-def render_issue_as_image(issue, filename="image.png"):
-    url = issue["html_url"]
-    print(url)
-    hti.screenshot(url=url, save_as=filename)
-    return filename
-
-
-def run_find_similar_issues(issue, token, n_issues):
-    issue_information = get_query_issue_information(issue, token=token)
-    # issue_information_summary = f"#{issue_information['number']} - {issue_information['title']}\n\n{issue_information['body']}"
-    similar_issues = get_similar_issues(issue, token=token, top_k=n_issues)
-    # similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
-
-    issue_image = render_issue_as_image(issue_information, filename="query_issue.png")
-
-    image_names = []
-    for i, issue in enumerate(similar_issues):
-        image_names.append(render_issue_as_image(issue, filename=f"image{i}.png"))
+def run_find_similar_issues(token, n_issues, issue_no, query):
+    if issue_no == "":
+        issue_no = None
+    if query == "":
+        query = None
 
-
-    page_html = requests.get(issue_information["html_url"]).text
+    similar_issues = get_similar_issues(issue_no=issue_no, query=query, token=token, top_k=n_issues)
 
-
+    issues_html = [f"<a href='{issue['html_url']}' target='_blank'>#{issue['number']} - {issue['title']}</a>" for issue in similar_issues]
+    issues_html = "<br>".join(issues_html)
+    return issues_html
 
 
 with gr.Blocks(title="Github Bot") as demo:
@@ -66,29 +40,20 @@ with gr.Blocks(title="Github Bot") as demo:
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-
-
+                    issue_no = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
+                    query = gr.Textbox(label="Query", placeholder="Search for issues")
                 with gr.Row():
-
+                    token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
+                    n_issues = gr.Slider(1, 50, value=5, step=1, label="Number of similar issues", info="Choose between 1 and 50")
 
         with gr.Row():
             submit_button = gr.Button(value="Submit")
 
         with gr.Row():
-            with gr.
-
-            with gr.
-
-                issue_text = gr.HTML(label="Issue text", elem_id="issue_text")
-    submit_button.click(run_find_similar_issues, outputs=[issue_image, issue_text, similar_issues_screenshots], inputs=[issue, token, n_issues])
-
-    with gr.Tab("Search issues"):
-        with gr.Row():
-            query = gr.Textbox(label="Query", placeholder="Search for issues")
-        with gr.Row():
-            token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
-        with gr.Row():
-            pass
+            with gr.Row():
+                issues_html = gr.HTML(label="Issue text", elem_id="issue_html")
+            with gr.Row():
+                submit_button.click(run_find_similar_issues, outputs=[issues_html], inputs=[token, n_issues, issue_no, query])
 
     with gr.Tab("Find maintainers to ping"):
         with gr.Row():
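The reworked `run_find_similar_issues` drops the html2image screenshots and returns a single HTML string for the `gr.HTML` output. A minimal sketch of that formatting step, using made-up issue dicts shaped like the GitHub API response:

```python
# Hypothetical fixtures shaped like the GitHub issues API response.
similar_issues = [
    {"html_url": "https://github.com/octocat/Hello-World/issues/1", "number": 1, "title": "First issue"},
    {"html_url": "https://github.com/octocat/Hello-World/issues/2", "number": 2, "title": "Second issue"},
]

# Same formatting as the new code path: one anchor tag per issue, joined with <br>.
issues_html = [
    f"<a href='{issue['html_url']}' target='_blank'>#{issue['number']} - {issue['title']}</a>"
    for issue in similar_issues
]
print("<br>".join(issues_html))
```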
find_similar_issues.py
CHANGED
@@ -40,9 +40,10 @@ def cosine_similarity(a, b):
     return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
 
 
-def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
+
+def get_issue(issue_no, token=TOKEN, owner=OWNER, repo=REPO):
     """
-    Function to
+    Function to get issue from GitHub
     """
     url = f"https://api.github.com/repos/{owner}/{repo}/issues"
     headers = {
@@ -55,11 +56,26 @@ def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
         f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
         headers=headers,
     )
-
     if request.status_code != 200:
         raise ValueError(f"Request failed with status code {request.status_code}")
+    return request.json()
+
+
+def get_similar_issues(issue_no, query, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
+    """
+    Function to find similar issues
+    """
+    if issue_no is not None and query is not None:
+        raise ValueError("Only one of issue_no or query can be provided")
+
+    if issue_no is None and query is None:
+        raise ValueError("One of issue_no or query must be provided")
+
+    if issue_no is not None:
+        issue = get_issue(issue_no, token=token, owner=owner, repo=repo)
+        query = issue["title"] + "\n" + issue["body"]
 
-    query_embedding = model.encode(
+    query_embedding = model.encode(query)
     query_embedding = query_embedding.reshape(1, -1)
     embeddings = load_embeddings()
 
@@ -82,10 +98,11 @@ def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("issue_no", type=int)
+    parser.add_argument("--issue_no", type=int, default=None)
+    parser.add_argument("--query", type=str, default=None)
     parser.add_argument("--top_k", type=int, default=5)
     parser.add_argument("--token", type=str, default=TOKEN)
     parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
     args = parser.parse_args()
-    get_similar_issues(args
+    get_similar_issues(**vars(args))
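The new `get_similar_issues` signature accepts either an issue number or a free-text query; the ranking step itself sits outside these hunks. A sketch of how the `cosine_similarity` helper above can drive a top-k lookup, assuming the stored embeddings load as a 2-D numpy array (random vectors stand in for real embeddings here):

```python
import numpy as np

def cosine_similarity(a, b):
    # Same formula as in the module: row-wise dot products over norms.
    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(100, 8))     # stand-in for load_embeddings()
query_embedding = rng.normal(size=(1, 8))  # stand-in for model.encode(query)

scores = cosine_similarity(query_embedding, embeddings)[0]
top_k = 5
top_indices = np.argsort(scores)[::-1][:top_k]  # highest similarity first
print(top_indices, scores[top_indices])
```

With the argparse change, the script can be driven either way, e.g. `python find_similar_issues.py --issue_no 123` or `python find_similar_issues.py --query "tokenizer crash"`.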
get_issues.py
CHANGED
@@ -67,7 +67,15 @@ def get_issues(
     }
     last_issue_number = get_last_issue_number(file_path=output_filename)
     per_page = 100
-
+
+    if os.path.exists(output_filename):
+        with open(output_filename, "r") as f:
+            num_lines = sum(1 for line in f)
+    else:
+        num_lines = 0
+
+    # Get the number of pages
+    page = num_lines // per_page + 1
     query_params = {
         "state": "all",
         "per_page": per_page,
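The new block resumes fetching from where the stored file left off. A quick check of the arithmetic, assuming one issue per line in the output file:

```python
# With per_page = 100: 0 stored lines resume at page 1, 100 at page 2,
# 250 at page 3 (a partially fetched page is simply requested again).
per_page = 100
for num_lines in (0, 99, 100, 250):
    page = num_lines // per_page + 1
    print(f"{num_lines} lines -> start at page {page}")
```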
get_topic.py
CHANGED
@@ -1,4 +1,8 @@
-
+
+import json
+
+with open("issues_dict.json", "r") as f:
+    issues = json.load(f)
 
 topic_maintainers_map ={
     "text models": ["@ArthurZucker", "@younesbelkada"],
@@ -30,14 +34,24 @@ topic_maintainers_map ={
 }
 
 
+issue_no = 2781
+issue = issues[str(issue_no)]
+
+
 from transformers import AutoTokenizer, LlamaForCausalLM
 
 model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
 
-prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
+# prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
+prompt = f"What is the provided issue about? Pick up to 3 topics from the following list: {list(topic_maintainers_map.keys())} \nIssue:\n{issue['body']}"
 inputs = tokenizer(prompt, return_tensors="pt")
 
+prefix_len = inputs.input_ids.shape[1]
+
 # Generate
-generate_ids = model.generate(inputs.input_ids, max_length=30)
-tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+generate_ids = model.generate(inputs.input_ids, max_length=30 + prefix_len)
+outputs = tokenizer.batch_decode(generate_ids[:, prefix_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+print(outputs)
+
+print("TITLE", str(issue["number"]) + " " + issue["title"])
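The `prefix_len` additions give the model a generation budget beyond the prompt and let the script print only the continuation. A self-contained sketch of the same trick, with gpt2 standing in for Llama-2 to keep it small; slicing the generated ids (rather than the decoded string) avoids mixing token and character offsets:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "What is the provided issue about?"
inputs = tokenizer(prompt, return_tensors="pt")
prefix_len = inputs.input_ids.shape[1]  # prompt length in tokens

# Allow 30 new tokens on top of the prompt, then decode only the new ids.
generate_ids = model.generate(inputs.input_ids, max_length=30 + prefix_len)
completion = tokenizer.batch_decode(generate_ids[:, prefix_len:], skip_special_tokens=True)[0]
print(completion)
```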
update_embeddings.py
CHANGED
@@ -94,7 +94,6 @@ def embed_issues(
         embeddings[index] = embedding
     else:
         index = len(embeddings)
-        # embeddings = np.concatenate([embeddings, embedding.reshape(1, -1)])
         embeddings.append(embedding)
         issue_to_embedding_index[issue_id] = index
         embedding_to_issue_index[index] = issue_id
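For context, the surrounding `embed_issues` logic is an update-or-append pattern over a list plus two index maps. A minimal standalone sketch (variable names mirror the function; the data is made up):

```python
embeddings = [[0.1, 0.2], [0.3, 0.4]]
issue_to_embedding_index = {"1": 0, "2": 1}
embedding_to_issue_index = {0: "1", 1: "2"}

def upsert(issue_id, embedding):
    if issue_id in issue_to_embedding_index:
        # Known issue: overwrite its existing slot.
        index = issue_to_embedding_index[issue_id]
        embeddings[index] = embedding
    else:
        # New issue: append and extend both index maps.
        index = len(embeddings)
        embeddings.append(embedding)
        issue_to_embedding_index[issue_id] = index
        embedding_to_issue_index[index] = issue_id

upsert("2", [0.5, 0.6])  # existing -> updated in place
upsert("3", [0.7, 0.8])  # new -> appended at index 2
print(issue_to_embedding_index)  # {'1': 0, '2': 1, '3': 2}
```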
update_stored_issues.py
CHANGED
@@ -70,8 +70,7 @@ def get_issues(
     page = 1
     query_params = {
         "state": "all",
-        "since":
-        # "since": most_recent,
+        "since": most_recent,
         "sort": "created",
         "direction": "asc",
         "page": page,
@@ -110,6 +109,9 @@ def get_issues(
 
     issue_lines_map = {issue["number"]: issue for issue in new_lines}
 
+    updated_issues = []
+
+    # Update any issues that already exist
     with open(input_filename, "r") as f:
         with open("tmp_" + output_filename, "a") as g:
             for line in f:
@@ -118,11 +120,21 @@ def get_issues(
                 if number in issue_lines_map:
                     g.write(json.dumps(issue_lines_map[number]))
                     g.write("\n")
+                    updated_issues.append(number)
                 else:
                     g.write(line)
 
+    # Append any new issues
+    new_issues = [issue for issue in new_lines if issue["number"] not in updated_issues]
+    with open("tmp_" + output_filename, "a") as g:
+        for issue in new_issues:
+            g.write(json.dumps(issue))
+            g.write("\n")
+
+    # Overwrite the old file with the new file
     os.rename("tmp_" + output_filename, output_filename)
 
+    # Save a record of the updated issues for the embedding update
     with open("updated_issues.json", "w") as f:
         json.dump(issue_lines_map, f, indent=4, sort_keys=True)
 
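The change rewrites updated issues in place and then appends genuinely new ones. A self-contained sketch of that merge with in-memory lines standing in for the JSONL files (`existing_lines` and `new_lines` are made-up fixtures):

```python
import json

existing_lines = ['{"number": 1, "title": "old"}', '{"number": 2, "title": "keep"}']
new_lines = [{"number": 1, "title": "updated"}, {"number": 3, "title": "brand new"}]

issue_lines_map = {issue["number"]: issue for issue in new_lines}
updated_issues = []
merged = []

# Update any issues that already exist.
for line in existing_lines:
    number = json.loads(line)["number"]
    if number in issue_lines_map:
        merged.append(json.dumps(issue_lines_map[number]))
        updated_issues.append(number)
    else:
        merged.append(line)

# Append any new issues that did not replace an existing line.
merged.extend(json.dumps(i) for i in new_lines if i["number"] not in updated_issues)
print("\n".join(merged))
```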