Amy Roberts committed on
Commit
17b91ab
·
1 Parent(s): 7d5704e

Small readme

Browse files
Files changed (2) hide show
  1. README.md +48 -0
  2. utils/update_embeddings.py +0 -108
README.md CHANGED
@@ -11,3 +11,51 @@ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ # Issue bot 🤖
16
+
17
+ This bot is designed to help you manage your GitHub issues. At the moment, it can:
18
+
19
+ - Find similar issues to another issue
20
+ - Find issues similar to a query
21
+
22
+ ## Usage
23
+
24
+ ### Update stored issues and embeddings
25
+
26
+ To manually update the stored issues and embeddings from the command line, run:
27
+
28
+ ```bash
29
+ # Fetches and saves new issues to issues.json
30
+ python utils/fetch.py --update
31
+
32
+ # Updates any issues which have had a status change and saves them to issues.json
33
+ python utils/update_stored_issues.py
34
+
35
+ # Update index of issues
36
+ python utils/build_issue_dict.py
37
+
38
+ # Update embeddings for issues
39
+ python utils/build_embeddings.py issue --update
40
+
41
+ # Update embeddings for pull requests
42
+ python utils/build_embeddings.py pull --update
43
+ ```
44
+
45
+ ### Find similar issues to another issue
46
+
47
+ By issue number:
48
+
49
+ ```bash
50
+ python utils/find_similar_issues.py --issue_no <issue_number>
51
+ ```
52
+
53
+ By query:
54
+
55
+ ```bash
56
+ python utils/find_similar_issues.py --query <query>
57
+ ```
58
+
59
+ ## TODO
60
+
61
+ - [ ] Find who to assign an issue to
utils/update_embeddings.py DELETED
@@ -1,108 +0,0 @@
1
- import argparse
2
- import json
3
- import logging
4
- import os
5
-
6
- import numpy as np
7
- from sentence_transformers import SentenceTransformer
8
-
9
- logging.basicConfig(level=logging.INFO)
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- def load_model(model_id: str):
15
- return SentenceTransformer(model_id)
16
-
17
-
18
- class EmbeddingWriter:
19
- def __init__(
20
- self,
21
- output_embedding_filename,
22
- output_index_filename,
23
- update,
24
- embedding_to_issue_index,
25
- embeddings=None
26
- ) -> None:
27
- self.output_embedding_filename = output_embedding_filename
28
- self.output_index_filename = output_index_filename
29
- self.embeddings = [] if embeddings is None else list(embeddings)
30
- self.embedding_to_issue_index = embedding_to_issue_index
31
- self.update = update
32
-
33
- def __enter__(self):
34
- return self.embeddings
35
-
36
- def __exit__(self, exc_type, exc_val, exc_tb):
37
- embeddings = np.array(self.embeddings)
38
-
39
- if self.update and os.path.exists(self.output_embedding_filename):
40
- embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])
41
-
42
- logger.info(f"Saving embeddings to {self.output_embedding_filename}")
43
- np.save(self.output_embedding_filename, embeddings)
44
-
45
- logger.info(f"Saving embedding index to {self.output_index_filename}")
46
- with open(self.output_index_filename, "w") as f:
47
- json.dump(self.embedding_to_issue_index, f, indent=4)
48
-
49
-
50
- def embed_issues(
51
- input_filename: str,
52
- model_id: str,
53
- issue_type: str,
54
- ):
55
- output_embedding_filename = f"{issue_type}_embeddings.npy"
56
- output_index_filename = f"embedding_index_to_{issue_type}.json"
57
- model = load_model(model_id)
58
-
59
- with open(input_filename, "r") as f:
60
- updated_issues = json.load(f)
61
-
62
- with open(output_index_filename, "r") as f:
63
- embedding_to_issue_index = json.load(f)
64
-
65
- embeddings = np.load(output_embedding_filename)
66
-
67
- issue_to_embedding_index = {v: k for k, v in embedding_to_issue_index.items()}
68
-
69
- with EmbeddingWriter(
70
- output_embedding_filename=output_embedding_filename,
71
- output_index_filename=output_index_filename,
72
- update=False,
73
- embedding_to_issue_index=embedding_to_issue_index,
74
- embeddings=embeddings
75
- ) as embeddings:
76
- for issue_id, issue in updated_issues.items():
77
- if "body" not in issue:
78
- logger.info(f"Skipping issue {issue_id} as it has no body")
79
- continue
80
-
81
- if issue_type == "pull_request" and "pull_request" not in issue:
82
- logger.info(f"Skipping issue {issue_id} as it is not a pull request")
83
- continue
84
-
85
- elif issue_type == "issue" and "pull_request" in issue:
86
- logger.info(f"Skipping issue {issue_id} as it is a pull request")
87
- continue
88
-
89
- logger.info(f"Embedding issue {issue_id}")
90
- embedding = model.encode(issue["body"])
91
-
92
- if issue_id in issue_to_embedding_index:
93
- index = issue_to_embedding_index[issue_id]
94
- embeddings[index] = embedding
95
- else:
96
- index = len(embeddings)
97
- embeddings.append(embedding)
98
- issue_to_embedding_index[issue_id] = index
99
- embedding_to_issue_index[index] = issue_id
100
-
101
-
102
- if __name__ == "__main__":
103
- parser = argparse.ArgumentParser()
104
- parser.add_argument('issue_type', choices=['issue', 'pull'], default='issue')
105
- parser.add_argument("--input_filename", type=str, default="updated_issues.json")
106
- parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
107
- args = parser.parse_args()
108
- embed_issues(**vars(args))