# File-viewer header captured with this file (not part of the program):
# last commit "Move to utils" (7d5704e) by Amy Roberts; 4.56 kB; no virus detected.
"""
Script to fetch issues from the transformers repo and save them to a JSON Lines
file (one issue per line).

Because this module uses a relative import, it has to be run as a module of its
package (``python -m <package>.<module>``) rather than as a standalone file.
It accepts the following command-line arguments:
    --update: Whether to update the existing file. If True, the script fetches
        the most recent issues and appends them to the file.
    --overwrite: Whether to overwrite the existing file.
    --output_filename: The name of the output file.
    --github_api_version: The version of the GitHub API to use.
    --owner: The owner of the repo.
    --repo: The name of the repo.
    --token: The GitHub token to use.
    --n_pages: The number of pages (of 100 issues each) to fetch; values <= 0
        fetch all pages. Useful for testing.
"""
import argparse
import logging
import json
import os
import requests
import numpy as np
from .defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE
# Default behaviour when the output file already exists: extend it rather
# than replace it.
UPDATE_FILE = True
OVERWRITE_FILE = False

# Configure root logging at INFO level on import and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_last_entry(file_path):
    """
    Return the last JSON record stored in a JSON-lines file.

    The file is streamed line by line so that only the most recent non-blank
    line is kept in memory, instead of loading every line at once.

    Args:
        file_path: Path to a file containing one JSON document per line.

    Returns:
        The decoded JSON value of the last non-blank line.

    Raises:
        IndexError: If the file contains no non-blank line.
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    last_line = None
    with open(file_path, "r") as file:
        for line in file:
            # Skip blank lines (e.g. a stray trailing newline) so they do not
            # hide the last real record.
            if line.strip():
                last_line = line
    if last_line is None:
        raise IndexError(f"No entries found in {file_path}")
    return json.loads(last_line)
def get_last_issue_number(file_path):
    """
    Return the `number` field of the last entry stored in `file_path`.

    Args:
        file_path: Path to the JSON-lines file of previously saved issues.

    Returns:
        The `number` of the last stored entry, or 0 when the file does not
        exist or does not hold any entry yet.
    """
    if not os.path.exists(file_path):
        return 0
    try:
        last_entry = get_last_entry(file_path=file_path)
    except IndexError:
        # An existing but empty file has no last line to read; treat it like
        # a missing file instead of crashing.
        return 0
    return last_entry["number"]
def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=ISSUE_JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Fetch the issues of a GitHub repo and append them to a JSON-lines file.

    Issues are requested from the GitHub REST API in pages of 100, in every
    state and sorted by creation date (ascending). Each issue is written as
    one JSON document per line. When the output file already exists, fetching
    resumes from the page implied by the number of lines already stored, and
    entries whose number is not greater than the last stored one are skipped.

    Args:
        overwrite: If True, delete an existing output file before fetching.
        update: If True, append new issues to an existing output file.
        output_filename: Path of the JSON-lines file to write.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repository.
        repo: Name of the repository.
        token: Value sent verbatim as the `Authorization` header.
            NOTE(review): no scheme prefix is added here, so the value
            presumably already contains one (e.g. "Bearer ...") - confirm
            against the defaults module.
        n_pages: Maximum number of pages to fetch in this call. Any value
            <= 0 fetches until the API returns no more issues.

    Returns:
        `output_filename`.

    Raises:
        ValueError: If the output file exists and neither `update` nor
            `overwrite` is set, or if the API answers with a non-200 status.
        requests.exceptions.Timeout: If the API does not answer in time.
    """
    # Resolve what to do with an existing file before reading anything from it.
    if os.path.exists(output_filename):
        if overwrite:
            logger.info(f"Deleting file {output_filename}")
            os.remove(output_filename)
        elif not update:
            raise ValueError(f"File {output_filename} already exists")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }

    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100

    if os.path.exists(output_filename):
        with open(output_filename, "r") as f:
            num_lines = sum(1 for _ in f)
    else:
        num_lines = 0

    # Resume from the page that contains the first entry not yet stored.
    page = num_lines // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    # A non-positive n_pages means "no limit".
    page_limit = (n_pages + page) if n_pages > 0 else float("inf")

    while page < page_limit:
        # Send the GET request; the timeout (in seconds) keeps a stalled
        # connection from blocking the script forever.
        response = requests.get(
            url, headers=headers, params=query_params, timeout=30
        )
        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        with open(output_filename, "a") as f:
            for value in json_response:
                # The resumed page may start with entries that are already
                # stored; skip them so the file holds no duplicates.
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        # A short page is the last page.
        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    return output_filename
if __name__ == "__main__":
    # Command-line entry point: parse the options and forward them, by name,
    # to get_issues.
    parser = argparse.ArgumentParser()
    # `--update` is kept for backward compatibility. Because its default is
    # UPDATE_FILE, a store_true flag alone can never switch updating off, so
    # `--no-update` writes False to the same destination.
    parser.add_argument(
        "--update",
        action="store_true",
        default=UPDATE_FILE,
        help="Append the most recent issues to the existing file",
    )
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        default=UPDATE_FILE,
        help="Do not append to an existing file",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=OVERWRITE_FILE,
        help="Overwrite the existing file",
    )
    parser.add_argument(
        "--output_filename",
        type=str,
        default=ISSUE_JSON_FILE,
        help="The name of the output file",
    )
    parser.add_argument(
        "--github_api_version",
        type=str,
        default=GITHUB_API_VERSION,
        help="The version of the GitHub API to use",
    )
    parser.add_argument("--owner", type=str, default=OWNER, help="The owner of the repo")
    parser.add_argument("--repo", type=str, default=REPO, help="The name of the repo")
    parser.add_argument("--token", type=str, default=TOKEN, help="The GitHub token to use")
    parser.add_argument(
        "--n_pages",
        type=int,
        default=-1,
        help="The number of pages to fetch. Useful for testing",
    )
    args = parser.parse_args()
    get_issues(**vars(args))