Amy Roberts committed on
Commit
c1fc690
1 Parent(s): 9b744c5
app.py CHANGED
@@ -1,16 +1,8 @@
1
- import os
2
-
3
- import gradio as gr
4
-
5
  import gradio as gr
6
  from find_similar_issues import get_similar_issues
7
  import requests
8
- from html2image import Html2Image
9
- import io
10
 
11
- hti = Html2Image(size=(1920, 1080 * 3))
12
-
13
- from defaults import OWNER, REPO, TOKEN
14
 
15
 
16
  def get_query_issue_information(issue_no, token):
@@ -30,35 +22,17 @@ def get_query_issue_information(issue_no, token):
30
  return request.json()
31
 
32
 
33
- def find_similar_issues(issue, token):
34
- similar_issues = get_similar_issues(issue, token=token)
35
- similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
36
- return similar_issues_summary
37
-
38
-
39
- def render_issue_as_image(issue, filename="image.png"):
40
- url = issue["html_url"]
41
- print(url)
42
- hti.screenshot(url=url, save_as=filename)
43
- return filename
44
-
45
-
46
- def run_find_similar_issues(issue, token, n_issues):
47
- issue_information = get_query_issue_information(issue, token=token)
48
- # issue_information_summary = f"#{issue_information['number']} - {issue_information['title']}\n\n{issue_information['body']}"
49
- similar_issues = get_similar_issues(issue, token=token, top_k=n_issues)
50
- # similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
51
-
52
- issue_image = render_issue_as_image(issue_information, filename="query_issue.png")
53
-
54
- image_names = []
55
- for i, issue in enumerate(similar_issues):
56
- image_names.append(render_issue_as_image(issue, filename=f"image{i}.png"))
57
 
58
- # return issue_information_summary, image_names
59
- page_html = requests.get(issue_information["html_url"]).text
60
 
61
- return issue_image, page_html, image_names
 
 
62
 
63
 
64
  with gr.Blocks(title="Github Bot") as demo:
@@ -66,29 +40,20 @@ with gr.Blocks(title="Github Bot") as demo:
66
  with gr.Row():
67
  with gr.Column():
68
  with gr.Row():
69
- issue = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
70
- token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
71
  with gr.Row():
72
- n_issues = gr.Slider(1, 50, value=5, label="Number of similar issues", info="Choose between 1 and 50")
 
73
 
74
  with gr.Row():
75
  submit_button = gr.Button(value="Submit")
76
 
77
  with gr.Row():
78
- with gr.Column():
79
- issue_image = gr.Image(type="filepath", label="Your issue")
80
- with gr.Column():
81
- similar_issues_screenshots = gr.Gallery(label="Similar Issues")
82
- issue_text = gr.HTML(label="Issue text", elem_id="issue_text")
83
- submit_button.click(run_find_similar_issues, outputs=[issue_image, issue_text, similar_issues_screenshots], inputs=[issue, token, n_issues])
84
-
85
- with gr.Tab("Search issues"):
86
- with gr.Row():
87
- query = gr.Textbox(label="Query", placeholder="Search for issues")
88
- with gr.Row():
89
- token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
90
- with gr.Row():
91
- pass
92
 
93
  with gr.Tab("Find maintainers to ping"):
94
  with gr.Row():
 
 
 
 
 
1
  import gradio as gr
2
  from find_similar_issues import get_similar_issues
3
  import requests
 
 
4
 
5
+ from defaults import OWNER, REPO
 
 
6
 
7
 
8
  def get_query_issue_information(issue_no, token):
 
22
  return request.json()
23
 
24
 
25
+ def run_find_similar_issues(token, n_issues, issue_no, query):
26
+ if issue_no == "":
27
+ issue_no = None
28
+ if query == "":
29
+ query = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
+ similar_issues = get_similar_issues(issue_no=issue_no, query=query, token=token, top_k=n_issues)
 
32
 
33
+ issues_html = [f"<a href='{issue['html_url']}' target='_blank'>#{issue['number']} - {issue['title']}</a>" for issue in similar_issues]
34
+ issues_html = "<br>".join(issues_html)
35
+ return issues_html
36
 
37
 
38
  with gr.Blocks(title="Github Bot") as demo:
 
40
  with gr.Row():
41
  with gr.Column():
42
  with gr.Row():
43
+ issue_no = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
44
+ query = gr.Textbox(label="Query", placeholder="Search for issues")
45
  with gr.Row():
46
+ token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
47
+ n_issues = gr.Slider(1, 50, value=5, step=1, label="Number of similar issues", info="Choose between 1 and 50")
48
 
49
  with gr.Row():
50
  submit_button = gr.Button(value="Submit")
51
 
52
  with gr.Row():
53
+ with gr.Row():
54
+ issues_html = gr.HTML(label="Issue text", elem_id="issue_html")
55
+ with gr.Row():
56
+ submit_button.click(run_find_similar_issues, outputs=[issues_html], inputs=[token, n_issues, issue_no, query])
 
 
 
 
 
 
 
 
 
 
57
 
58
  with gr.Tab("Find maintainers to ping"):
59
  with gr.Row():
find_similar_issues.py CHANGED
@@ -40,9 +40,10 @@ def cosine_similarity(a, b):
40
  return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
41
 
42
 
43
- def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
 
44
  """
45
- Function to find similar issues
46
  """
47
  url = f"https://api.github.com/repos/{owner}/{repo}/issues"
48
  headers = {
@@ -55,11 +56,26 @@ def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
55
  f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
56
  headers=headers,
57
  )
58
-
59
  if request.status_code != 200:
60
  raise ValueError(f"Request failed with status code {request.status_code}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- query_embedding = model.encode(request.json()["body"])
63
  query_embedding = query_embedding.reshape(1, -1)
64
  embeddings = load_embeddings()
65
 
@@ -82,10 +98,11 @@ def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
82
 
83
  if __name__ == "__main__":
84
  parser = argparse.ArgumentParser()
85
- parser.add_argument("issue_no", type=int)
 
86
  parser.add_argument("--top_k", type=int, default=5)
87
  parser.add_argument("--token", type=str, default=TOKEN)
88
  parser.add_argument("--owner", type=str, default=OWNER)
89
  parser.add_argument("--repo", type=str, default=REPO)
90
  args = parser.parse_args()
91
- get_similar_issues(args.issue_no, args.top_k, args.token, args.owner, args.repo)
 
40
  return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
41
 
42
 
43
+
44
+ def get_issue(issue_no, token=TOKEN, owner=OWNER, repo=REPO):
45
  """
46
+ Function to get issue from GitHub
47
  """
48
  url = f"https://api.github.com/repos/{owner}/{repo}/issues"
49
  headers = {
 
56
  f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
57
  headers=headers,
58
  )
 
59
  if request.status_code != 200:
60
  raise ValueError(f"Request failed with status code {request.status_code}")
61
+ return request.json()
62
+
63
+
64
+ def get_similar_issues(issue_no, query, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
65
+ """
66
+ Function to find similar issues
67
+ """
68
+ if issue_no is not None and query is not None:
69
+ raise ValueError("Only one of issue_no or query can be provided")
70
+
71
+ if issue_no is not None and query is not None:
72
+ raise ValueError("Only one of issue_no or query can be provided")
73
+
74
+ if issue_no is not None:
75
+ issue = get_issue(issue_no, token=token, owner=owner, repo=repo)
76
+ query = issue["title"] + "\n" +issue["body"]
77
 
78
+ query_embedding = model.encode(query)
79
  query_embedding = query_embedding.reshape(1, -1)
80
  embeddings = load_embeddings()
81
 
 
98
 
99
  if __name__ == "__main__":
100
  parser = argparse.ArgumentParser()
101
+ parser.add_argument("--issue_no", type=int, default=None)
102
+ parser.add_argument("--query", type=str, default=None)
103
  parser.add_argument("--top_k", type=int, default=5)
104
  parser.add_argument("--token", type=str, default=TOKEN)
105
  parser.add_argument("--owner", type=str, default=OWNER)
106
  parser.add_argument("--repo", type=str, default=REPO)
107
  args = parser.parse_args()
108
+ get_similar_issues(**vars(args))
get_issues.py CHANGED
@@ -67,7 +67,15 @@ def get_issues(
67
  }
68
  last_issue_number = get_last_issue_number(file_path=output_filename)
69
  per_page = 100
70
- page = last_issue_number // per_page + 1
 
 
 
 
 
 
 
 
71
  query_params = {
72
  "state": "all",
73
  "per_page": per_page,
 
67
  }
68
  last_issue_number = get_last_issue_number(file_path=output_filename)
69
  per_page = 100
70
+
71
+ if os.path.exists(output_filename):
72
+ with open(output_filename, "r") as f:
73
+ num_lines = sum(1 for line in f)
74
+ else:
75
+ num_lines = 0
76
+
77
+ # Get the number of pages
78
+ page = num_lines // per_page + 1
79
  query_params = {
80
  "state": "all",
81
  "per_page": per_page,
get_topic.py CHANGED
@@ -1,4 +1,8 @@
1
- "text": {}
 
 
 
 
2
 
3
  topic_maintainers_map ={
4
  "text models": ["@ArthurZucker", "@younesbelkada"],
@@ -30,14 +34,24 @@ topic_maintainers_map ={
30
  }
31
 
32
 
 
 
 
 
33
  from transformers import AutoTokenizer, LlamaForCausalLM
34
 
35
  model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
36
  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
37
 
38
- prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
 
39
  inputs = tokenizer(prompt, return_tensors="pt")
40
 
 
 
41
  # Generate
42
- generate_ids = model.generate(inputs.input_ids, max_length=30)
43
- tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
 
 
 
1
+
2
+ import json
3
+
4
+ with open("issues_dict.json", "r") as f:
5
+ issues = json.load(f)
6
 
7
  topic_maintainers_map ={
8
  "text models": ["@ArthurZucker", "@younesbelkada"],
 
34
  }
35
 
36
 
37
+ issue_no = 2781
38
+ issue = issues[str(issue_no)]
39
+
40
+
41
  from transformers import AutoTokenizer, LlamaForCausalLM
42
 
43
  model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
44
  tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
45
 
46
+ # prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
47
+ prompt = f"What is the provided issue about? Pick up to 3 topics from the following list: {list(topic_maintainers_map.keys())} \nIssue:\n{issue['body']}"
48
  inputs = tokenizer(prompt, return_tensors="pt")
49
 
50
+ prefix_len = inputs.input_ids.shape[1]
51
+
52
  # Generate
53
+ generate_ids = model.generate(inputs.input_ids, max_length=30 + prefix_len)
54
+ outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
55
+ print(outputs[prefix_len:])
56
+
57
+ print("TITLE", issue["number"] + " " + issue["title"])
update_embeddings.py CHANGED
@@ -94,7 +94,6 @@ def embed_issues(
94
  embeddings[index] = embedding
95
  else:
96
  index = len(embeddings)
97
- # embeddings = np.concatenate([embeddings, embedding.reshape(1, -1)])
98
  embeddings.append(embedding)
99
  issue_to_embedding_index[issue_id] = index
100
  embedding_to_issue_index[index] = issue_id
 
94
  embeddings[index] = embedding
95
  else:
96
  index = len(embeddings)
 
97
  embeddings.append(embedding)
98
  issue_to_embedding_index[issue_id] = index
99
  embedding_to_issue_index[index] = issue_id
update_stored_issues.py CHANGED
@@ -70,8 +70,7 @@ def get_issues(
70
  page = 1
71
  query_params = {
72
  "state": "all",
73
- "since": "2024-02-01T11:33:35Z",
74
- # "since": most_recent,
75
  "sort": "created",
76
  "direction": "asc",
77
  "page": page,
@@ -110,6 +109,9 @@ def get_issues(
110
 
111
  issue_lines_map = {issue["number"]: issue for issue in new_lines}
112
 
 
 
 
113
  with open(input_filename, "r") as f:
114
  with open("tmp_" + output_filename, "a") as g:
115
  for line in f:
@@ -118,11 +120,21 @@ def get_issues(
118
  if number in issue_lines_map:
119
  g.write(json.dumps(issue_lines_map[number]))
120
  g.write("\n")
 
121
  else:
122
  g.write(line)
123
 
 
 
 
 
 
 
 
 
124
  os.rename("tmp_" + output_filename, output_filename)
125
 
 
126
  with open("updated_issues.json", "w") as f:
127
  json.dump(issue_lines_map, f, indent=4, sort_keys=True)
128
 
 
70
  page = 1
71
  query_params = {
72
  "state": "all",
73
+ "since": most_recent,
 
74
  "sort": "created",
75
  "direction": "asc",
76
  "page": page,
 
109
 
110
  issue_lines_map = {issue["number"]: issue for issue in new_lines}
111
 
112
+ updated_issues = []
113
+
114
+ # Update any issues that already exist
115
  with open(input_filename, "r") as f:
116
  with open("tmp_" + output_filename, "a") as g:
117
  for line in f:
 
120
  if number in issue_lines_map:
121
  g.write(json.dumps(issue_lines_map[number]))
122
  g.write("\n")
123
+ updated_issues.append(number)
124
  else:
125
  g.write(line)
126
 
127
+ # Append any new issues
128
+ new_issues = [issue for issue in new_lines if issue["number"] not in updated_issues]
129
+ with open("tmp_" + output_filename, "a") as g:
130
+ for issue in new_issues:
131
+ g.write(json.dumps(issue))
132
+ g.write("\n")
133
+
134
+ # Overwrite the old file with the new file
135
  os.rename("tmp_" + output_filename, output_filename)
136
 
137
+ # Save a record of the updated issues for the embedding update
138
  with open("updated_issues.json", "w") as f:
139
  json.dump(issue_lines_map, f, indent=4, sort_keys=True)
140