# transformers-github-bot / get_issues.py
# Author: Amy Roberts
# Status: Draft
# Commit: 9b744c5
import json
import argparse
import requests
import os
import numpy as np
import json
import datetime
import logging
# Logging: emit INFO-level messages through a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Timestamp taken once at import time, formatted as YYYY_MM_DD_HH_MM_SS.
today = f"{datetime.datetime.now():%Y_%m_%d_%H_%M_%S}"

# Repository queried by default and the GitHub REST API version requested.
OWNER = "huggingface"
REPO = "transformers"
GITHUB_API_VERSION = "2022-11-28"

# Access token read from the environment; None when GITHUB_TOKEN is not set.
TOKEN = os.getenv("GITHUB_TOKEN")

# Default output file and the default update/overwrite behaviour.
JSON_FILE = "issues.json"
UPDATE_FILE = False
OVERWRITE_FILE = True
def get_last_entry(file_path):
    """
    Return the last JSON record stored in a JSON-lines file.

    The file is streamed line by line so that only a single line is kept in
    memory, and blank lines (for example a stray trailing newline) are skipped.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        The decoded JSON value of the last non-blank line.

    Raises:
        IndexError: If the file has no non-blank lines.
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    last_line = None
    with open(file_path, "r") as file:
        for line in file:
            # Remember only the most recent line that has content
            if line.strip():
                last_line = line
    if last_line is None:
        raise IndexError(f"No entries found in {file_path}")
    return json.loads(last_line)
def get_last_issue_number(file_path):
    """
    Return the issue number of the last record saved in `file_path`.

    Args:
        file_path: Path to the JSON-lines file of previously saved issues.

    Returns:
        The value stored under the "number" key of the last record, or 0 when
        the file does not exist or is empty (i.e. nothing has been saved yet).
    """
    # A missing or zero-byte file both mean there is no previous entry to resume from
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        return 0
    last_entry = get_last_entry(file_path=file_path)
    return last_entry["number"]
def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Function to get the issues from a GitHub repo and save them to a json file

    Issues are requested in ascending creation order, 100 per page, and each one
    is appended to `output_filename` as a single JSON document per line.

    Args:
        overwrite: If True, delete an existing `output_filename` before starting.
        update: If True, an existing `output_filename` is appended to, skipping
            issues whose number is not greater than the last saved one.
        output_filename: Path of the JSON-lines file to write.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repository to query.
        repo: Name of the repository to query.
        token: GitHub access token. When empty or None no `Authorization` header
            is sent. A value that already contains a scheme (e.g. "Bearer abc")
            is sent unchanged; a bare token is sent as "Bearer <token>".
        n_pages: Maximum number of pages to fetch; any value <= 0 means no limit.

    Returns:
        `output_filename`.

    Raises:
        ValueError: If the output file exists and neither `overwrite` nor `update`
            is set, or if a request returns a status code other than 200.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    # Decide what to do with an existing file before touching the network
    if os.path.exists(output_filename):
        if overwrite:
            logger.info(f"Deleting file {output_filename}")
            os.remove(output_filename)
        elif not update:
            raise ValueError(f"File {output_filename} already exists")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    if token:
        stripped_token = token.strip()
        # Keep values that already carry a scheme; otherwise add the Bearer scheme
        has_scheme = " " in stripped_token
        headers["Authorization"] = stripped_token if has_scheme else f"Bearer {stripped_token}"

    # Seconds to wait on the connection before giving up, so a stalled request cannot hang forever
    request_timeout = 30

    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100
    # NOTE(review): the resume page is derived from the last issue number, which assumes
    # numbers are contiguous. If numbering has gaps, this may start past unsaved issues - confirm.
    page = last_issue_number // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    page_limit = (n_pages + page) if n_pages > 0 else np.inf

    while page < page_limit:
        query_params["page"] = page

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params, timeout=request_timeout)
        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        with open(output_filename, "a") as f:
            for value in json_response:
                # Skip issues that were already saved by a previous run
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        # A partially filled page is the last page
        if len(json_response) < per_page:
            break

        page += 1

    return output_filename
if __name__ == "__main__":
    # Command-line entry point: every option maps onto a keyword argument of get_issues
    parser = argparse.ArgumentParser(
        description="Download the issues of a GitHub repository into a JSON-lines file."
    )
    # --update is on by default, so passing it changes nothing; --no_update is the
    # switch that turns it off.
    parser.add_argument(
        "--update",
        action="store_true",
        default=True,
        help="Append to an existing output file (default).",
    )
    parser.add_argument(
        "--no_update",
        dest="update",
        action="store_false",
        help="Do not append to an existing output file.",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help="Delete an existing output file before downloading.",
    )
    parser.add_argument("--output_filename", type=str, default=JSON_FILE, help="File to write the issues to.")
    parser.add_argument(
        "--github_api_version",
        type=str,
        default=GITHUB_API_VERSION,
        help="Value of the X-GitHub-Api-Version header.",
    )
    parser.add_argument("--owner", type=str, default=OWNER, help="Owner of the repository.")
    parser.add_argument("--repo", type=str, default=REPO, help="Name of the repository.")
    parser.add_argument(
        "--token",
        type=str,
        default=TOKEN,
        help="GitHub access token (defaults to the GITHUB_TOKEN environment variable).",
    )
    parser.add_argument(
        "--n_pages",
        type=int,
        default=-1,
        help="Maximum number of pages to fetch; values <= 0 mean no limit.",
    )
    args = parser.parse_args()

    # Both --update and --no_update write to `update`, so vars(args) matches get_issues' keywords
    get_issues(**vars(args))