Amy Roberts committed
Commit 9b744c5
1 Parent(s): c9976b6
.gitignore ADDED
@@ -0,0 +1,167 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Data
+ *.json
+ *.png
+ *.npy
+ *.jpg
+ *.pdf
app.py ADDED
@@ -0,0 +1,92 @@
+ import gradio as gr
+ import requests
+ from html2image import Html2Image
+
+ from defaults import OWNER, REPO
+ from find_similar_issues import get_similar_issues
+
+ hti = Html2Image(size=(1920, 1080 * 3))
+
+
+ def get_query_issue_information(issue_no, token):
+     headers = {
+         "Accept": "application/vnd.github+json",
+         "Authorization": f"{token}",
+         "X-GitHub-Api-Version": "2022-11-28",
+         "User-Agent": "amyeroberts",
+     }
+     request = requests.get(
+         f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
+         headers=headers,
+     )
+     if request.status_code != 200:
+         raise ValueError(f"Request failed with status code {request.status_code} and message {request.text}")
+
+     return request.json()
+
+
+ def find_similar_issues(issue, token):
+     similar_issues = get_similar_issues(issue, token=token)
+     similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
+     return similar_issues_summary
+
+
+ def render_issue_as_image(issue, filename="image.png"):
+     url = issue["html_url"]
+     hti.screenshot(url=url, save_as=filename)
+     return filename
+
+
+ def run_find_similar_issues(issue, token, n_issues):
+     issue_information = get_query_issue_information(issue, token=token)
+     similar_issues = get_similar_issues(issue, token=token, top_k=n_issues)
+
+     issue_image = render_issue_as_image(issue_information, filename="query_issue.png")
+
+     image_names = []
+     for i, issue in enumerate(similar_issues):
+         image_names.append(render_issue_as_image(issue, filename=f"image{i}.png"))
+
+     page_html = requests.get(issue_information["html_url"]).text
+
+     return issue_image, page_html, image_names
+
+
+ with gr.Blocks(title="Github Bot") as demo:
+     with gr.Tab("Find similar issues"):
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     issue = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
+                     token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
+                 with gr.Row():
+                     n_issues = gr.Slider(1, 50, value=5, label="Number of similar issues", info="Choose between 1 and 50")
+
+                 with gr.Row():
+                     submit_button = gr.Button(value="Submit")
+
+         with gr.Row():
+             with gr.Column():
+                 issue_image = gr.Image(type="filepath", label="Your issue")
+             with gr.Column():
+                 similar_issues_screenshots = gr.Gallery(label="Similar Issues")
+         issue_text = gr.HTML(label="Issue text", elem_id="issue_text")
+         submit_button.click(run_find_similar_issues, outputs=[issue_image, issue_text, similar_issues_screenshots], inputs=[issue, token, n_issues])
+
+     with gr.Tab("Search issues"):
+         with gr.Row():
+             query = gr.Textbox(label="Query", placeholder="Search for issues")
+         with gr.Row():
+             token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
+         with gr.Row():
+             pass
+
+     with gr.Tab("Find maintainers to ping"):
+         with gr.Row():
+             issue = gr.Textbox(label="Github Issue / PR", placeholder="Issue or PR you want to find maintainers to ping for")
+         with gr.Row():
+             token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
+
+
+ if __name__ == "__main__":
+     demo.launch()
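For local testing, here is a minimal sketch of exercising the main callback outside the Gradio UI. It assumes the embedding artifacts (issue_embeddings.npy, embedding_index_to_issue.json, issues_dict.json) have already been built by the scripts below; the issue number and token are placeholders:

    # Sketch: call the Gradio callback directly (placeholder issue number and token).
    from app import run_find_similar_issues

    issue_image, page_html, gallery_images = run_find_similar_issues(
        issue="28000",            # hypothetical issue number
        token="<github-token>",   # placeholder; never hard-code a real token
        n_issues=3,
    )
    print(issue_image, gallery_images)

Launching the full UI is simply `python app.py`, which serves the Blocks app defined above.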
build_embeddings.py ADDED
@@ -0,0 +1,117 @@
+ import argparse
+ import json
+ import logging
+ import os
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ logging.basicConfig(level=logging.INFO)
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_model(model_id: str):
+     return SentenceTransformer(model_id)
+
+
+ class EmbeddingWriter:
+     def __init__(self, output_embedding_filename, output_index_filename, update, embedding_to_issue_index) -> None:
+         self.output_embedding_filename = output_embedding_filename
+         self.output_index_filename = output_index_filename
+         self.embeddings = []
+         self.embedding_to_issue_index = embedding_to_issue_index
+         self.update = update
+
+     def __enter__(self):
+         return self.embeddings
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         if len(self.embeddings) == 0:
+             return
+
+         embeddings = np.array(self.embeddings)
+
+         if self.update and os.path.exists(self.output_embedding_filename):
+             embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])
+
+         logger.info(f"Saving embeddings to {self.output_embedding_filename}")
+         np.save(self.output_embedding_filename, embeddings)
+
+         logger.info(f"Saving embedding index to {self.output_index_filename}")
+         with open(self.output_index_filename, "w") as f:
+             json.dump(self.embedding_to_issue_index, f, indent=4)
+
+
+ def embed_issues(
+     input_filename: str,
+     model_id: str,
+     issue_type: str,
+     n_issues: int = -1,
+     update: bool = False
+ ):
+     model = load_model(model_id)
+
+     output_embedding_filename = f"{issue_type}_embeddings.npy"
+     output_index_filename = f"embedding_index_to_{issue_type}.json"
+
+     with open(input_filename, "r") as f:
+         issues = json.load(f)
+
+     if update and os.path.exists(output_index_filename):
+         with open(output_index_filename, "r") as f:
+             embedding_to_issue_index = json.load(f)
+         embedding_index = len(embedding_to_issue_index)
+     else:
+         embedding_to_issue_index = {}
+         embedding_index = 0
+
+     max_issues = n_issues if n_issues > 0 else len(issues)
+     n_embedded = 0
+
+     with EmbeddingWriter(
+         output_embedding_filename=output_embedding_filename,
+         output_index_filename=output_index_filename,
+         update=update,
+         embedding_to_issue_index=embedding_to_issue_index
+     ) as embeddings:
+         for issue_id, issue in issues.items():
+             if n_embedded >= max_issues:
+                 break
+
+             if update and issue_id in embedding_to_issue_index.values():
+                 logger.info(f"Skipping issue {issue_id} as it is already embedded")
+                 continue
+
+             if "body" not in issue:
+                 logger.info(f"Skipping issue {issue_id} as it has no body")
+                 continue
+
+             if issue_type == "pull_request" and "pull_request" not in issue:
+                 logger.info(f"Skipping issue {issue_id} as it is not a pull request")
+                 continue
+
+             elif issue_type == "issue" and "pull_request" in issue:
+                 logger.info(f"Skipping issue {issue_id} as it is a pull request")
+                 continue
+
+             title = issue["title"] if issue["title"] is not None else ""
+             body = issue["body"] if issue["body"] is not None else ""
+
+             logger.info(f"Embedding issue {issue_id}")
+             embedding = model.encode(title + "\n" + body)
+             embedding_to_issue_index[embedding_index] = issue_id
+             embeddings.append(embedding)
+             embedding_index += 1
+             n_embedded += 1
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("issue_type", choices=["issue", "pull_request"], nargs="?", default="issue")
+     parser.add_argument("--input_filename", type=str, default="issues_dict.json")
+     parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
+     parser.add_argument("--n_issues", type=int, default=-1)
+     parser.add_argument("--update", action="store_true")
+     args = parser.parse_args()
+     embed_issues(**vars(args))
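As a usage note, a small sketch of driving embed_issues programmatically, assuming issues_dict.json was produced by build_issue_dict.py; the n_issues cap is just for a smoke test:

    # Sketch: embed the first 100 issues as a smoke test.
    from build_embeddings import embed_issues

    embed_issues(
        input_filename="issues_dict.json",
        model_id="all-mpnet-base-v2",
        issue_type="issue",
        n_issues=100,
        update=False,
    )

This writes issue_embeddings.npy and embedding_index_to_issue.json, the files the retrieval scripts below read.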
build_issue_dict.py ADDED
@@ -0,0 +1,19 @@
+ import argparse
+ import json
+
+ def build_json_file(input_filename, output_filename):
+     with open(input_filename, "r") as f:
+         json_lines = f.readlines()
+
+     issues = [json.loads(line) for line in json_lines]
+     json_dict = {issue["number"]: issue for issue in issues}
+
+     with open(output_filename, "w") as f:
+         json.dump(json_dict, f, indent=4)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--input_filename", type=str, default="issues.json")
+     parser.add_argument("--output_filename", type=str, default="issues_dict.json")
+     args = parser.parse_args()
+     build_json_file(**vars(args))
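The transformation is JSON Lines in, a dict keyed by issue number out; a tiny illustration with made-up records:

    # Sketch: the same JSONL -> dict conversion on two made-up records.
    import json

    lines = ['{"number": 1, "title": "bug"}', '{"number": 2, "title": "feature"}']
    issues = [json.loads(line) for line in lines]
    print({issue["number"]: issue for issue in issues})
    # {1: {'number': 1, 'title': 'bug'}, 2: {'number': 2, 'title': 'feature'}}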
defaults.py ADDED
@@ -0,0 +1,5 @@
+ import os
+
+ OWNER = "huggingface"
+ REPO = "transformers"
+ TOKEN = os.environ.get("GITHUB_TOKEN")
find_similar_issues.py ADDED
@@ -0,0 +1,91 @@
+ import argparse
+ import json
+ import pprint
+
+ import numpy as np
+ import requests
+ from sentence_transformers import SentenceTransformer
+
+ from defaults import OWNER, REPO, TOKEN
+
+ model_id = "all-mpnet-base-v2"
+ model = SentenceTransformer(model_id)
+
+
+ def load_embeddings():
+     """
+     Function to load embeddings from file
+     """
+     embeddings = np.load("issue_embeddings.npy")
+     return embeddings
+
+ def load_issue_information():
+     """
+     Function to load issue information from file
+     """
+     with open("embedding_index_to_issue.json", "r") as f:
+         embedding_index_to_issue = json.load(f)
+
+     with open("issues_dict.json", "r") as f:
+         issues = json.load(f)
+
+     return embedding_index_to_issue, issues
+
+
+ def cosine_similarity(a, b):
+     if a.ndim == 1:
+         a = a.reshape(1, -1)
+
+     if b.ndim == 1:
+         b = b.reshape(1, -1)
+
+     return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
+
+
+ def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
+     """
+     Function to find similar issues
+     """
+     url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_no}"
+     headers = {
+         "Accept": "application/vnd.github+json",
+         "Authorization": f"{token}",
+         "X-GitHub-Api-Version": "2022-11-28",
+         "User-Agent": "amyeroberts",
+     }
+     request = requests.get(url, headers=headers)
+
+     if request.status_code != 200:
+         raise ValueError(f"Request failed with status code {request.status_code}")
+
+     query_embedding = model.encode(request.json()["body"])
+     query_embedding = query_embedding.reshape(1, -1)
+     embeddings = load_embeddings()
+
+     # Calculate the cosine similarity between the query and all the issues
+     cosine_similarities = cosine_similarity(query_embedding, embeddings)
+
+     # Get the indices of the issues sorted by similarity, most similar first
+     most_similar_indices = np.argsort(cosine_similarities)
+     most_similar_indices = most_similar_indices[0][::-1]
+
+     embedding_index_to_issue, issues = load_issue_information()
+
+     similar_issues = []
+     for i in most_similar_indices[:top_k]:
+         similar_issue_no = embedding_index_to_issue[str(i)]
+         similar_issues.append(issues[similar_issue_no])
+
+     return similar_issues
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("issue_no", type=int)
+     parser.add_argument("--top_k", type=int, default=5)
+     parser.add_argument("--token", type=str, default=TOKEN)
+     parser.add_argument("--owner", type=str, default=OWNER)
+     parser.add_argument("--repo", type=str, default=REPO)
+     args = parser.parse_args()
+     similar_issues = get_similar_issues(args.issue_no, args.top_k, args.token, args.owner, args.repo)
+     pprint.pprint([f"#{issue['number']} - {issue['title']}" for issue in similar_issues])
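A sketch of programmatic use, assuming the embedding files exist and GITHUB_TOKEN is set in the environment; the issue number is a placeholder:

    # Sketch: the five issues most similar to a placeholder issue number.
    from find_similar_issues import get_similar_issues

    for issue in get_similar_issues(28000, top_k=5):
        print(f"#{issue['number']} - {issue['title']}")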
get_issues.py ADDED
@@ -0,0 +1,127 @@
+ import argparse
+ import datetime
+ import json
+ import logging
+ import os
+
+ import numpy as np
+ import requests
+
+ logging.basicConfig(level=logging.INFO)
+
+ logger = logging.getLogger(__name__)
+
+ today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+
+ OWNER = "huggingface"
+ REPO = "transformers"
+ GITHUB_API_VERSION = "2022-11-28"
+ TOKEN = os.environ.get("GITHUB_TOKEN")
+ JSON_FILE = "issues.json"
+ UPDATE_FILE = False
+ OVERWRITE_FILE = True
+
+
+ def get_last_entry(file_path):
+     with open(file_path, 'r') as file:
+         # Read the last line
+         last_line = file.readlines()[-1]
+     return json.loads(last_line)
+
+
+ def get_last_issue_number(file_path):
+     if os.path.exists(file_path):
+         last_entry = get_last_entry(file_path=file_path)
+         return last_entry['number']
+     return 0
+
+
+ def get_issues(
+     overwrite=OVERWRITE_FILE,
+     update=UPDATE_FILE,
+     output_filename=JSON_FILE,
+     github_api_version=GITHUB_API_VERSION,
+     owner=OWNER,
+     repo=REPO,
+     token=TOKEN,
+     n_pages=-1,
+ ):
+     """
+     Function to get the issues from the transformers repo and save them to a json file
+     """
+
+     # If the file exists and we want to overwrite it, delete it
+     if os.path.exists(output_filename) and overwrite:
+         logger.info(f"Deleting file {output_filename}")
+         os.remove(output_filename)
+
+     # Define the URL and headers
+     url = f"https://api.github.com/repos/{owner}/{repo}/issues"
+     headers = {
+         "Accept": "application/vnd.github+json",
+         "Authorization": f"{token}",
+         "X-GitHub-Api-Version": f"{github_api_version}",
+         "User-Agent": "amyeroberts",
+     }
+     last_issue_number = get_last_issue_number(file_path=output_filename)
+     per_page = 100
+     page = last_issue_number // per_page + 1
+     query_params = {
+         "state": "all",
+         "per_page": per_page,
+         "sort": "created",
+         "direction": "asc",
+         "page": page,
+     }
+
+     if os.path.exists(output_filename) and not update and not overwrite:
+         raise ValueError(f"File {output_filename} already exists")
+
+     page_limit = (n_pages + page) if n_pages > 0 else np.inf
+     while True:
+         if page >= page_limit:
+             break
+
+         # Send the GET request
+         response = requests.get(url, headers=headers, params=query_params)
+
+         if not response.status_code == 200:
+             raise ValueError(
+                 f"Request failed with status code {response.status_code} and message {response.text}"
+             )
+
+         json_response = response.json()
+         logger.info(f"Page: {page}, number of issues: {len(json_response)}")
+
+         # If we get an empty response, we've reached the end of the issues
+         if len(json_response) == 0:
+             break
+
+         with open(output_filename, "a") as f:
+             for value in json_response:
+                 if value["number"] <= last_issue_number:
+                     continue
+                 json.dump(value, f)
+                 f.write("\n")
+
+         if len(json_response) < per_page:
+             break
+
+         page += 1
+         query_params["page"] = page
+
+     return output_filename
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--update", action=argparse.BooleanOptionalAction, default=True)
+     parser.add_argument("--overwrite", action="store_true", default=False)
+     parser.add_argument("--output_filename", type=str, default=JSON_FILE)
+     parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
+     parser.add_argument("--owner", type=str, default=OWNER)
+     parser.add_argument("--repo", type=str, default=REPO)
+     parser.add_argument("--token", type=str, default=TOKEN)
+     parser.add_argument("--n_pages", type=int, default=-1)
+     args = parser.parse_args()
+     get_issues(**vars(args))
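For a bounded first run, a sketch that fetches just two pages (up to 200 issues) into a fresh issues.json, assuming GITHUB_TOKEN is set:

    # Sketch: bounded initial fetch into a fresh issues.json.
    from get_issues import get_issues

    get_issues(overwrite=True, update=False, n_pages=2)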
get_topic.py ADDED
@@ -0,0 +1,43 @@
+ from transformers import AutoTokenizer, LlamaForCausalLM
+
+ topic_maintainers_map = {
+     "text models": ["@ArthurZucker", "@younesbelkada"],
+     "vision models": "@amyeroberts",
+     "speech models": "@sanchit-gandhi",
+     "graph models": "@clefourrier",
+     "flax": "@sanchit-gandhi",
+     "generate": "@gante",
+     "pipelines": "@Narsil",
+     "tensorflow": ["@gante", "@Rocketknight1"],
+     "tokenizers": "@ArthurZucker",
+     "trainer": ["@muellerzr", "@pacman100"],
+     "deepspeed": "@pacman100",
+     "ray/raytune": ["@richardliaw", "@amogkam"],
+     "Big Model Inference": "@SunMarc",
+     "quantization (bitsandbytes, autogpt)": ["@SunMarc", "@younesbelkada"],
+     "Documentation": ["@stevhliu", "@MKhalusova"],
+     "accelerate": "different repo",
+     "datasets": "different repo",
+     "diffusers": "different repo",
+     "rust tokenizers": "different repo",
+     "Flax examples": "@sanchit-gandhi",
+     "PyTorch vision examples": "@amyeroberts",
+     "PyTorch text examples": "@ArthurZucker",
+     "PyTorch speech examples": "@sanchit-gandhi",
+     "PyTorch generate examples": "@gante",
+     "TensorFlow": "@Rocketknight1",
+     "Research projects and examples": "not maintained",
+ }
+
+ model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
+ # Placeholder issue: in the app this would be fetched from the GitHub API
+ issue = {"body": "..."}
+
+ prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
+ inputs = tokenizer(prompt, return_tensors="pt")
+
+ # Generate
+ generate_ids = model.generate(inputs.input_ids, max_new_tokens=30)
+ print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
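Closing the loop, a sketch of mapping a generated topic string back to maintainer handles; the `generated` value below is a stand-in for the decoded model output above, run in the same namespace as the script:

    # Sketch: look up maintainers for the model's (placeholder) topic prediction.
    generated = "vision models"  # stand-in for the decoded output
    print(topic_maintainers_map.get(generated, "no matching topic"))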
retrieval.py ADDED
@@ -0,0 +1,75 @@
+ import argparse
+ import json
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ def cosine_similarity(a, b):
+     if a.ndim == 1:
+         a = a.reshape(1, -1)
+
+     if b.ndim == 1:
+         b = b.reshape(1, -1)
+
+     return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
+
+
+ def retrieve_issue_rankings(
+     query: str,
+     model_id: str,
+     input_embedding_filename: str,
+ ):
+     """
+     Given a query, returns the list of issues sorted by similarity to the query
+     according to their embedding index
+     """
+     model = SentenceTransformer(model_id)
+
+     embeddings = np.load(input_embedding_filename)
+
+     query_embedding = model.encode(query)
+
+     # Calculate the cosine similarity between the query and all the issues
+     cosine_similarities = cosine_similarity(query_embedding, embeddings)
+
+     # Get the indices of the issues sorted by similarity, most similar first
+     most_similar_indices = np.argsort(cosine_similarities)
+     most_similar_indices = most_similar_indices[0][::-1]
+     return most_similar_indices
+
+
+ def print_issue(issues, issue_id):
+     # Look up the issue and print its title and body
+     issue_info = issues[issue_id]
+
+     print(f"#{issue_id}", issue_info["title"])
+     print(issue_info["body"])
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("query", type=str)
+     parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
+     parser.add_argument("--input_embedding_filename", type=str, default="issue_embeddings.npy")
+     parser.add_argument("--input_index_filename", type=str, default="embedding_index_to_issue.json")
+
+     args = parser.parse_args()
+
+     issue_rankings = retrieve_issue_rankings(
+         query=args.query,
+         model_id=args.model_id,
+         input_embedding_filename=args.input_embedding_filename,
+     )
+
+     with open("issues_dict.json", "r") as f:
+         issues = json.load(f)
+
+     with open(args.input_index_filename, "r") as f:
+         embedding_index_to_issue = json.load(f)
+
+     issue_ids = [embedding_index_to_issue[str(i)] for i in issue_rankings]
+
+     for issue_id in issue_ids[:3]:
+         print(issue_id)
+         print_issue(issues, issue_id)
+         print("\n\n\n")
@@ -0,0 +1,111 @@
+ import argparse
+ import json
+ import logging
+ import os
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ logging.basicConfig(level=logging.INFO)
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_model(model_id: str):
+     return SentenceTransformer(model_id)
+
+
+ class EmbeddingWriter:
+     def __init__(
+         self,
+         output_embedding_filename,
+         output_index_filename,
+         update,
+         embedding_to_issue_index,
+         embeddings=None
+     ) -> None:
+         self.output_embedding_filename = output_embedding_filename
+         self.output_index_filename = output_index_filename
+         self.embeddings = [] if embeddings is None else list(embeddings)
+         self.embedding_to_issue_index = embedding_to_issue_index
+         self.update = update
+
+     def __enter__(self):
+         return self.embeddings
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         embeddings = np.array(self.embeddings)
+
+         if self.update and os.path.exists(self.output_embedding_filename):
+             embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])
+
+         logger.info(f"Saving embeddings to {self.output_embedding_filename}")
+         np.save(self.output_embedding_filename, embeddings)
+
+         logger.info(f"Saving embedding index to {self.output_index_filename}")
+         with open(self.output_index_filename, "w") as f:
+             json.dump(self.embedding_to_issue_index, f, indent=4)
+
+
+ def embed_issues(
+     input_filename: str,
+     model_id: str,
+     issue_type: str,
+ ):
+     output_embedding_filename = f"{issue_type}_embeddings.npy"
+     output_index_filename = f"embedding_index_to_{issue_type}.json"
+     model = load_model(model_id)
+
+     with open(input_filename, "r") as f:
+         updated_issues = json.load(f)
+
+     with open(output_index_filename, "r") as f:
+         embedding_to_issue_index = json.load(f)
+
+     embeddings = np.load(output_embedding_filename)
+
+     issue_to_embedding_index = {v: k for k, v in embedding_to_issue_index.items()}
+
+     with EmbeddingWriter(
+         output_embedding_filename=output_embedding_filename,
+         output_index_filename=output_index_filename,
+         update=False,
+         embedding_to_issue_index=embedding_to_issue_index,
+         embeddings=embeddings
+     ) as embeddings:
+         for issue_id, issue in updated_issues.items():
+             if "body" not in issue:
+                 logger.info(f"Skipping issue {issue_id} as it has no body")
+                 continue
+
+             if issue_type == "pull_request" and "pull_request" not in issue:
+                 logger.info(f"Skipping issue {issue_id} as it is not a pull request")
+                 continue
+
+             elif issue_type == "issue" and "pull_request" in issue:
+                 logger.info(f"Skipping issue {issue_id} as it is a pull request")
+                 continue
+
+             title = issue["title"] if issue["title"] is not None else ""
+             body = issue["body"] if issue["body"] is not None else ""
+
+             logger.info(f"Embedding issue {issue_id}")
+             embedding = model.encode(title + "\n" + body)
+
+             if issue_id in issue_to_embedding_index:
+                 index = int(issue_to_embedding_index[issue_id])
+                 embeddings[index] = embedding
+             else:
+                 index = len(embeddings)
+                 embeddings.append(embedding)
+                 issue_to_embedding_index[issue_id] = index
+                 embedding_to_issue_index[str(index)] = issue_id
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("issue_type", choices=["issue", "pull_request"], nargs="?", default="issue")
+     parser.add_argument("--input_filename", type=str, default="updated_issues.json")
+     parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
+     args = parser.parse_args()
+     embed_issues(**vars(args))
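A sketch of a refresh run, assuming updated_issues.json was written by update_stored_issues.py and the original embedding files are present:

    # Sketch: re-embed only the issues that changed since the last sync.
    from update_embeddings import embed_issues

    embed_issues(
        input_filename="updated_issues.json",
        model_id="all-mpnet-base-v2",
        issue_type="issue",
    )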
update_stored_issues.py ADDED
@@ -0,0 +1,125 @@
+ """
+ Module which updates the stored issues to reflect changes in the issue state on GitHub
+ """
+ import argparse
+ import json
+ import logging
+ import os
+
+ import numpy as np
+ import requests
+
+ from defaults import OWNER, REPO, TOKEN
+
+ logging.basicConfig(level=logging.INFO)
+
+ logger = logging.getLogger(__name__)
+
+ GITHUB_API_VERSION = "2022-11-28"
+ JSON_FILE = "issues.json"
+
+
+ def get_issues(
+     input_filename=JSON_FILE,
+     output_filename=JSON_FILE,
+     github_api_version=GITHUB_API_VERSION,
+     owner=OWNER,
+     repo=REPO,
+     token=TOKEN,
+     n_pages=-1,
+ ):
+     """
+     Function to fetch issues updated since the last sync and fold them back into the stored file
+     """
+     with open("issues_dict.json", "r") as f:
+         issues = json.load(f)
+
+     # Get the most recent updated_at timestamp in the stored issues
+     updated_at = [issue["updated_at"] for issue in issues.values()]
+     most_recent = max(updated_at)
+
+     # The stored issues file must already exist for an update to make sense
+     if not os.path.exists(output_filename):
+         raise ValueError(f"File {output_filename} does not exist")
+
+     # Define the URL and headers
+     url = f"https://api.github.com/repos/{owner}/{repo}/issues"
+     headers = {
+         "Accept": "application/vnd.github+json",
+         "Authorization": f"{token}",
+         "X-GitHub-Api-Version": f"{github_api_version}",
+         "User-Agent": "amyeroberts",
+     }
+     per_page = 100
+     page = 1
+     query_params = {
+         "state": "all",
+         "since": most_recent,
+         "sort": "created",
+         "direction": "asc",
+         "page": page,
+     }
+
+     new_lines = []
+
+     page_limit = (n_pages + page) if n_pages > 0 else np.inf
+     while True:
+         if page >= page_limit:
+             break
+
+         # Send the GET request
+         response = requests.get(url, headers=headers, params=query_params)
+
+         if not response.status_code == 200:
+             raise ValueError(
+                 f"Request failed with status code {response.status_code} and message {response.text}"
+             )
+
+         json_response = response.json()
+         logger.info(f"Page: {page}, number of issues: {len(json_response)}")
+
+         # If we get an empty response, we've reached the end of the issues
+         if len(json_response) == 0:
+             break
+
+         new_lines.extend(json_response)
+
+         # If we get fewer than the number of issues per page, we've reached the end of the issues
+         if len(json_response) < per_page:
+             break
+
+         page += 1
+         query_params["page"] = page
+
+     issue_lines_map = {issue["number"]: issue for issue in new_lines}
+
+     with open(input_filename, "r") as f:
+         with open("tmp_" + output_filename, "w") as g:
+             for line in f:
+                 issue = json.loads(line)
+                 number = issue["number"]
+                 if number in issue_lines_map:
+                     g.write(json.dumps(issue_lines_map[number]))
+                     g.write("\n")
+                 else:
+                     g.write(line)
+
+     os.rename("tmp_" + output_filename, output_filename)
+
+     with open("updated_issues.json", "w") as f:
+         json.dump(issue_lines_map, f, indent=4, sort_keys=True)
+
+     return output_filename
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--input_filename", type=str, default=JSON_FILE)
+     parser.add_argument("--output_filename", type=str, default=JSON_FILE)
+     parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
+     parser.add_argument("--owner", type=str, default=OWNER)
+     parser.add_argument("--repo", type=str, default=REPO)
+     parser.add_argument("--token", type=str, default=TOKEN)
+     parser.add_argument("--n_pages", type=int, default=-1)
+     args = parser.parse_args()
+     get_issues(**vars(args))
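Taken together, the scripts form a small sync pipeline. A sketch of one full refresh cycle using the default filenames (the import is aliased because both fetch scripts expose a get_issues function):

    # Sketch: one full refresh cycle with the scripts' default filenames.
    from build_issue_dict import build_json_file
    from update_embeddings import embed_issues
    from update_stored_issues import get_issues as update_stored_issues

    update_stored_issues()                              # pull issues changed since the last sync
    build_json_file("issues.json", "issues_dict.json")  # rebuild the keyed dict
    embed_issues("updated_issues.json", "all-mpnet-base-v2", "issue")  # refresh changed embeddings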