Spaces:
Sleeping
Sleeping
File size: 4,561 Bytes
6b0b6fd 12ae336 6b0b6fd 9b744c5 b42fea9 9b744c5 b42fea9 7d5704e 9b744c5 12ae336 9b744c5 12ae336 9b744c5 b42fea9 9b744c5 c1fc690 9b744c5 12ae336 9b744c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
"""
Script to fetch issues from the transformers repo and save them to a JSON Lines file (one JSON object per line).
The script can be run from the command line with the following arguments:
--update: Whether to update the existing file. If True the script will fetch
the most recent issues and append them to the file
--overwrite: Whether to overwrite the existing file
--output_filename: The name of the output file
--github_api_version: The version of the GitHub API to use
--owner: The owner of the repo
--repo: The name of the repo
--token: The GitHub token to use
--n_pages: The number of pages to fetch. Useful for testing
"""
import argparse
import logging
import json
import os
import requests
import numpy as np
from .defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE
# Default values for the `update` / `overwrite` options, used both as the
# keyword defaults of `get_issues` and as the command line defaults.
OVERWRITE_FILE = False
UPDATE_FILE = True

# Module-level logger; the root logger is configured to emit INFO records.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def get_last_entry(file_path):
    """
    Return the last JSON record stored in a JSON Lines file.

    The file is streamed line by line, so only one line is held in memory at a
    time, and blank lines (such as a stray trailing newline) are ignored.

    Args:
        file_path: Path to a file holding one JSON document per line.

    Returns:
        The decoded JSON value of the last non-blank line.

    Raises:
        IndexError: If the file contains no non-blank line.
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    last_line = None
    with open(file_path, 'r') as file:
        for line in file:
            # Remember only the most recent line that carries content
            if line.strip():
                last_line = line

    if last_line is None:
        # IndexError matches what indexing an empty list of lines would raise,
        # so callers handling "no entries" keep working.
        raise IndexError(f"No entries found in {file_path}")

    return json.loads(last_line)
def get_last_issue_number(file_path):
    """
    Return the issue number of the last record saved in `file_path`.

    Args:
        file_path: Path to the file of saved issues (one JSON object per line).

    Returns:
        The `number` field of the last record, or 0 when the file does not
        exist or does not contain any record yet.
    """
    if not os.path.exists(file_path):
        return 0

    try:
        last_entry = get_last_entry(file_path=file_path)
    except IndexError:
        # The file exists but has no lines (e.g. an earlier run stopped before
        # anything was written): treat it as "nothing fetched yet".
        return 0

    return last_entry['number']
def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=ISSUE_JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
    timeout=30,
):
    """
    Fetch the issues of a GitHub repo and append them to a JSON Lines file.

    Issues are requested in all states, sorted by creation date in ascending
    order, 100 per page. When the output file already holds records, fetching
    resumes from the page derived from the number of stored lines, and any
    issue whose number is not greater than the last stored one is skipped.

    Args:
        overwrite: If True, delete an existing output file and start over.
        update: If True, append new issues to an existing output file.
        output_filename: Path of the file the issues are written to, one JSON
            object per line.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repo.
        repo: Name of the repo.
        token: Value sent in the `Authorization` header. The header is left
            out when the token is empty or None.
        n_pages: Maximum number of pages to fetch; any value <= 0 means no
            limit.
        timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        The path of the output file (`output_filename`).

    Raises:
        ValueError: If the output file exists and neither `update` nor
            `overwrite` is set, or if a request returns a non-200 status code.
    """
    file_exists = os.path.exists(output_filename)

    # Fail fast, before reading the existing file or touching the network
    if file_exists and not update and not overwrite:
        raise ValueError(f"File {output_filename} already exists")

    # If file exists and we want to overwrite it, delete it
    if file_exists and overwrite:
        logger.info(f"Deleting file {output_filename}")
        os.remove(output_filename)
        file_exists = False

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    if token:
        # NOTE(review): the token is sent verbatim; GitHub expects a scheme
        # prefix such as "Bearer <token>" - confirm the configured value
        # already includes it.
        headers["Authorization"] = f"{token}"

    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100

    if file_exists:
        with open(output_filename, "r") as f:
            num_lines = sum(1 for _ in f)
    else:
        num_lines = 0

    # Resume from the first page that is not completely stored yet
    page = num_lines // per_page + 1

    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    # None means "no limit"; otherwise fetch exactly `n_pages` pages
    page_limit = page + n_pages if n_pages > 0 else None

    while page_limit is None or page < page_limit:
        query_params["page"] = page

        # Send the GET request; the timeout keeps a stalled connection from
        # blocking the script forever
        response = requests.get(
            url, headers=headers, params=query_params, timeout=timeout
        )

        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if not json_response:
            break

        with open(output_filename, "a") as f:
            for value in json_response:
                # The resumed page may overlap with issues that are already stored
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        # A page that is not full is the last one
        if len(json_response) < per_page:
            break

        page += 1

    return output_filename
if __name__ == "__main__":
    # Command line entry point: every option maps onto the keyword argument of
    # `get_issues` with the same name.
    parser = argparse.ArgumentParser(
        description="Fetch the issues of a GitHub repo and save them to a file"
    )
    # `--update` alone could never switch updating off when the default is
    # True, so `--no-update` writes False to the same destination.
    parser.add_argument(
        "--update",
        dest="update",
        action="store_true",
        default=UPDATE_FILE,
        help="Append the most recent issues to the existing file",
    )
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        default=UPDATE_FILE,
        help="Do not update an existing file",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=OVERWRITE_FILE,
        help="Overwrite the existing file",
    )
    parser.add_argument(
        "--output_filename",
        type=str,
        default=ISSUE_JSON_FILE,
        help="The name of the output file",
    )
    parser.add_argument(
        "--github_api_version",
        type=str,
        default=GITHUB_API_VERSION,
        help="The version of the GitHub API to use",
    )
    parser.add_argument("--owner", type=str, default=OWNER, help="The owner of the repo")
    parser.add_argument("--repo", type=str, default=REPO, help="The name of the repo")
    parser.add_argument("--token", type=str, default=TOKEN, help="The GitHub token to use")
    parser.add_argument(
        "--n_pages",
        type=int,
        default=-1,
        help="The number of pages to fetch. Useful for testing",
    )
    args = parser.parse_args()
    get_issues(**vars(args))
|