""" Script to fetch issues from the transformers repo and save them to a json file The script can be run from the command line with the following arguments: --update: Whether to update the existing file. If True the script will fetch the most recent issues and append them to the file --overwrite: Whether to overwrite the existing file --output_filename: The name of the output file --github_api_version: The version of the GitHub API to use --owner: The owner of the repo --repo: The name of the repo --token: The GitHub token to use --n_pages: The number of pages to fetch. Useful for testing """ import argparse import logging import json import os import requests import numpy as np from defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) UPDATE_FILE = True OVERWRITE_FILE = False def get_last_entry(file_path): with open(file_path, 'r') as file: # Read the last line last_line = file.readlines()[-1] return json.loads(last_line) def get_last_issue_number(file_path): if os.path.exists(file_path): last_entry = get_last_entry(file_path=file_path) return last_entry['number'] return 0 def get_issues( overwrite=OVERWRITE_FILE, update=UPDATE_FILE, output_filename=ISSUE_JSON_FILE, github_api_version=GITHUB_API_VERSION, owner=OWNER, repo=REPO, token=TOKEN, n_pages=-1, ): """ Function to get the issues from the transformers repo and save them to a json file """ # If file exists and we want to overwrite it, delete it if os.path.exists(output_filename) and overwrite: logging.info(f"Deleting file {output_filename}") os.remove(output_filename) # Define the URL and headers url = f"https://api.github.com/repos/{owner}/{repo}/issues" headers = { "Accept": "application/vnd.github+json", "Authorization": f"{token}", "X-GitHub-Api-Version": f"{github_api_version}", "User-Agent": "amyeroberts", } last_issue_number = get_last_issue_number(file_path=output_filename) per_page = 100 if os.path.exists(output_filename): with open(output_filename, "r") as f: num_lines = sum(1 for line in f) else: num_lines = 0 # Get the number of pages page = num_lines // per_page + 1 query_params = { "state": "all", "per_page": per_page, "sort": "created", "direction": "asc", "page": page, } if os.path.exists(output_filename) and not update and not overwrite: raise ValueError(f"File {output_filename} already exists") page_limit = (n_pages + page) if n_pages > 0 else np.inf while True: if page >= page_limit: break # Send the GET request response = requests.get(url, headers=headers, params=query_params) if not response.status_code == 200: raise ValueError( f"Request failed with status code {response.status_code} and message {response.text}" ) json_response = response.json() logger.info(f"Page: {page}, number of issues: {len(json_response)}") # If we get an empty response, we've reached the end of the issues if len(json_response) == 0: break with open(output_filename, "a") as f: for value in json_response: if value["number"] <= last_issue_number: continue json.dump(value, f) f.write("\n") if len(json_response) < per_page: break page += 1 query_params["page"] = page return output_filename if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--update", action="store_true", default=UPDATE_FILE) parser.add_argument("--overwrite", action="store_true", default=OVERWRITE_FILE) parser.add_argument("--output_filename", type=str, default=ISSUE_JSON_FILE) parser.add_argument("--github_api_version", type=str, 
default=GITHUB_API_VERSION) parser.add_argument("--owner", type=str, default=OWNER) parser.add_argument("--repo", type=str, default=REPO) parser.add_argument("--token", type=str, default=TOKEN) parser.add_argument("--n_pages", type=int, default=-1) args = parser.parse_args() get_issues(**vars(args))
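
# Usage sketch (illustrative only; the script filename and the values shown for the
# constants imported from `defaults.py` are assumptions, not part of this script).
#
# `defaults.py` is expected to provide something along these lines:
#
#     OWNER = "huggingface"
#     REPO = "transformers"
#     GITHUB_API_VERSION = "2022-11-28"
#     TOKEN = "Bearer <personal access token>"  # passed verbatim in the Authorization header
#     ISSUE_JSON_FILE = "issues.json"
#
# Example invocation, fetching two pages of issues for a quick test:
#
#     python fetch_issues.py --n_pages 2
#
# Re-running the script against an existing output file appends only issues whose
# number is greater than the last one already saved.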