Amy Roberts committed on
Commit
12ae336
1 Parent(s): 2c3812c

Add documentation

Browse files
Files changed (6) hide show
  1. build_embeddings.py +18 -0
  2. build_issue_dict.py +4 -0
  3. defaults.py +1 -0
  4. fetch.py +19 -12
  5. retrieval.py +5 -0
  6. update_stored_issues.py +13 -4
build_embeddings.py CHANGED
@@ -1,3 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import argparse
2
  import json
3
  import logging
 
1
+ """
2
+ Module which builds embeddings for issues and pull requests
3
+
4
+ The module is designed to be run from the command line and takes the following arguments:
5
+
6
+ --input_filename: The name of the file containing the issues and pull requests
7
+ --model_id: The name of the sentence transformer model to use
8
+ --issue_type: The type of issue to embed (either "issue" or "pull")
9
+ --n_issues: The number of issues to embed
10
+ --update: Whether to update the existing embeddings
11
+
12
+ The module saves the embeddings to a file called <issue_type>_embeddings.npy and the index to a file called
13
+ embedding_index_to_<issue_type>.json
14
+
15
+ The index provides a mapping from the index of the embedding to the issue or pull request number.
16
+
17
+ """
18
+
19
  import argparse
20
  import json
21
  import logging
build_issue_dict.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import argparse
2
  import json
3
 
 
1
+ """
2
+ Module which builds a dictionary keyed by issue number from a json file
3
+ """
4
+
5
  import argparse
6
  import json
7
 
defaults.py CHANGED
@@ -4,3 +4,4 @@ OWNER = "huggingface"
4
  REPO = "transformers"
5
  TOKEN = os.environ.get("GITHUB_TOKEN")
6
  GITHUB_API_VERSION = "2022-11-28"
 
 
4
  REPO = "transformers"
5
  TOKEN = os.environ.get("GITHUB_TOKEN")
6
  GITHUB_API_VERSION = "2022-11-28"
7
+ ISSUE_JSON_FILE = "issues.json"
fetch.py CHANGED
@@ -1,9 +1,20 @@
1
  """
2
  Script to fetch issues from the transformers repo and save them to a json file
 
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
 
5
  import argparse
6
- import datetime
7
  import logging
8
  import json
9
  import os
@@ -11,17 +22,13 @@ import os
11
  import requests
12
  import numpy as np
13
 
14
- from defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN
15
 
16
  logging.basicConfig(level=logging.INFO)
17
-
18
  logger = logging.getLogger(__name__)
19
 
20
- today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
21
-
22
- JSON_FILE = "issues.json"
23
- UPDATE_FILE = False
24
- OVERWRITE_FILE = True
25
 
26
 
27
  def get_last_entry(file_path):
@@ -41,7 +48,7 @@ def get_last_issue_number(file_path):
41
  def get_issues(
42
  overwrite=OVERWRITE_FILE,
43
  update=UPDATE_FILE,
44
- output_filename=JSON_FILE,
45
  github_api_version=GITHUB_API_VERSION,
46
  owner=OWNER,
47
  repo=REPO,
@@ -125,9 +132,9 @@ def get_issues(
125
 
126
  if __name__ == "__main__":
127
  parser = argparse.ArgumentParser()
128
- parser.add_argument("--update", action="store_true", default=True)
129
- parser.add_argument("--overwrite", action="store_true", default=False)
130
- parser.add_argument("--output_filename", type=str, default=JSON_FILE)
131
  parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
132
  parser.add_argument("--owner", type=str, default=OWNER)
133
  parser.add_argument("--repo", type=str, default=REPO)
 
1
  """
2
  Script to fetch issues from the transformers repo and save them to a json file
3
+
4
+ The script can be run from the command line with the following arguments:
5
+ --update: Whether to update the existing file. If True, the script will fetch
6
+ the most recent issues and append them to the file
7
+ --overwrite: Whether to overwrite the existing file
8
+ --output_filename: The name of the output file
9
+ --github_api_version: The version of the GitHub API to use
10
+ --owner: The owner of the repo
11
+ --repo: The name of the repo
12
+ --token: The GitHub token to use
13
+ --n_pages: The number of pages to fetch. Useful for testing
14
+
15
  """
16
 
17
  import argparse
 
18
  import logging
19
  import json
20
  import os
 
22
  import requests
23
  import numpy as np
24
 
25
+ from defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE
26
 
27
  logging.basicConfig(level=logging.INFO)
 
28
  logger = logging.getLogger(__name__)
29
 
30
+ UPDATE_FILE = True
31
+ OVERWRITE_FILE = False
 
 
 
32
 
33
 
34
  def get_last_entry(file_path):
 
48
  def get_issues(
49
  overwrite=OVERWRITE_FILE,
50
  update=UPDATE_FILE,
51
+ output_filename=ISSUE_JSON_FILE,
52
  github_api_version=GITHUB_API_VERSION,
53
  owner=OWNER,
54
  repo=REPO,
 
132
 
133
  if __name__ == "__main__":
134
  parser = argparse.ArgumentParser()
135
+ parser.add_argument("--update", action="store_true", default=UPDATE_FILE)
136
+ parser.add_argument("--overwrite", action="store_true", default=OVERWRITE_FILE)
137
+ parser.add_argument("--output_filename", type=str, default=ISSUE_JSON_FILE)
138
  parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
139
  parser.add_argument("--owner", type=str, default=OWNER)
140
  parser.add_argument("--repo", type=str, default=REPO)
retrieval.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import argparse
2
  import json
3
 
 
1
+ """
2
+ Module which contains functionality to retrieve the most similar issues for a given query
3
+ """
4
+
5
+
6
  import argparse
7
  import json
8
 
update_stored_issues.py CHANGED
@@ -1,5 +1,14 @@
1
  """
2
- Module which updates any of the issues to reflect changes in the issue state
 
 
 
 
 
 
 
 
 
3
  """
4
  import argparse
5
  import json
@@ -9,7 +18,7 @@ import os
9
  import numpy as np
10
  import requests
11
 
12
- from defaults import TOKEN, OWNER, REPO, GITHUB_API_VERSION
13
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
@@ -18,8 +27,8 @@ JSON_FILE = "issues.json"
18
 
19
 
20
  def update_issues(
21
- input_filename=JSON_FILE,
22
- output_filename=JSON_FILE,
23
  github_api_version=GITHUB_API_VERSION,
24
  owner=OWNER,
25
  repo=REPO,
 
1
  """
2
+ Module which updates any of the issues to reflect changes in the issue state, e.g. new comments
3
+
4
+ The module can be run from the command line using the following arguments:
5
+ --input_filename: The name of the input file containing the issues
6
+ --output_filename: The name of the output file to save the updated issues
7
+ --github_api_version: The version of the GitHub API to use
8
+ --owner: The owner of the repo
9
+ --repo: The name of the repo
10
+ --token: The GitHub token to use
11
+ --n_pages: The number of pages to fetch. Useful for testing
12
  """
13
  import argparse
14
  import json
 
18
  import numpy as np
19
  import requests
20
 
21
+ from defaults import TOKEN, OWNER, REPO, GITHUB_API_VERSION, ISSUE_JSON_FILE
22
 
23
  logging.basicConfig(level=logging.INFO)
24
  logger = logging.getLogger(__name__)
 
27
 
28
 
29
  def update_issues(
30
+ input_filename=ISSUE_JSON_FILE,
31
+ output_filename=ISSUE_JSON_FILE,
32
  github_api_version=GITHUB_API_VERSION,
33
  owner=OWNER,
34
  repo=REPO,