File size: 4,561 Bytes
6b0b6fd
 
12ae336
 
 
 
 
 
 
 
 
 
 
 
6b0b6fd
 
9b744c5
b42fea9
 
 
9b744c5
 
 
b42fea9
7d5704e
9b744c5
 
 
 
12ae336
 
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12ae336
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b42fea9
9b744c5
 
 
 
 
c1fc690
 
 
 
 
 
 
 
 
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12ae336
 
 
9b744c5
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Script to fetch issues from the transformers repo and save them to a JSON Lines file (one issue JSON object per line)

The script can be run from the command line with the following arguments:
            --update: Whether to update the existing file. If True the script will fetch
                the most recent issues and append them to the file
            --overwrite: Whether to overwrite the existing file
            --output_filename: The name of the output file
            --github_api_version: The version of the GitHub API to use
            --owner: The owner of the repo
            --repo: The name of the repo
            --token: The GitHub token to use
            --n_pages: The number of pages to fetch. Useful for testing

"""

import argparse
import logging
import json
import os

import requests
import numpy as np

from .defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE

# Configure the root logger at import time so INFO-level progress messages are emitted
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default flags, used both as `get_issues` keyword defaults and as the CLI defaults
UPDATE_FILE = True
OVERWRITE_FILE = False


def get_last_entry(file_path):
    """
    Return the last record of a JSON Lines file as a decoded Python object.

    The file is streamed line by line, so only one line is held in memory at a
    time. Blank lines (e.g. a stray trailing newline) are ignored.

    Args:
        file_path: Path of the file to read; each non-blank line must be a JSON document.

    Returns:
        The decoded JSON value of the last non-blank line.

    Raises:
        IndexError: If the file contains no non-blank line.
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    last_line = None
    with open(file_path, 'r') as file:
        for line in file:
            # Remember only lines with content so trailing blank lines are skipped
            if line.strip():
                last_line = line

    if last_line is None:
        # Same exception type as indexing an empty list of lines, with a clearer message
        raise IndexError(f"No entries found in {file_path}")
    return json.loads(last_line)


def get_last_issue_number(file_path):
    """
    Return the `number` field of the last issue stored in `file_path`.

    Args:
        file_path: Path of the JSON Lines file holding previously fetched issues.

    Returns:
        The value of the `number` key of the last entry, or 0 when the file does
        not exist or is empty (i.e. no issue has been saved yet).
    """
    # A missing or zero-byte file both mean that nothing has been fetched so far
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        return 0
    last_entry = get_last_entry(file_path=file_path)
    return last_entry['number']


def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=ISSUE_JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
    request_timeout=30,
):
    """
    Fetch the issues of a GitHub repo and append them to a JSON Lines file.

    Issues are requested in creation order (oldest first), 100 per page. When
    the output file already holds entries, fetching resumes from the page
    derived from the number of lines in the file, and every issue whose
    `number` is not greater than the last stored one is skipped.
    NOTE(review): resuming assumes the file holds one line per issue, in the
    same order the API returns them - confirm if the file is edited by hand.

    Args:
        overwrite: If True, delete an existing output file and start from scratch.
        update: If True, append new issues to an existing output file.
        output_filename: Path of the JSON Lines file to write to.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repo.
        repo: Name of the repo.
        token: Value sent verbatim as the `Authorization` header; the header is
            omitted when the token is empty or None.
            NOTE(review): no scheme such as "Bearer " is added here, so the
            token presumably has to include it - confirm against the defaults.
        n_pages: Maximum number of pages to fetch; any value <= 0 means no limit.
        request_timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The path of the output file (`output_filename`).

    Raises:
        ValueError: If the output file exists and neither `update` nor
            `overwrite` is set, or if a request does not return status 200.
    """
    file_exists = os.path.exists(output_filename)

    # If file exists and we want to overwrite it, delete it
    if file_exists and overwrite:
        logger.info(f"Deleting file {output_filename}")
        os.remove(output_filename)
        file_exists = False

    # Fail fast, before reading the file, when we are not allowed to touch it
    if file_exists and not update:
        raise ValueError(f"File {output_filename} already exists")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    if token:
        # Only authenticate when a token is available; sending the literal
        # string "None" or an empty value would be an invalid credential.
        headers["Authorization"] = f"{token}"

    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100

    if file_exists:
        with open(output_filename, "r") as f:
            num_lines = sum(1 for _ in f)
    else:
        num_lines = 0

    # First page to request: skip the pages that are already completely stored
    page = num_lines // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    # Exclusive upper bound on the page number; None means "fetch until exhausted"
    page_limit = page + n_pages if n_pages > 0 else None

    while page_limit is None or page < page_limit:
        # Send the GET request
        response = requests.get(
            url, headers=headers, params=query_params, timeout=request_timeout
        )

        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if not json_response:
            break

        # The first page of a resumed run may overlap with what is already stored
        new_issues = [
            issue for issue in json_response if issue["number"] > last_issue_number
        ]
        if new_issues:
            with open(output_filename, "a") as f:
                for issue in new_issues:
                    json.dump(issue, f)
                    f.write("\n")

        # A partially filled page is the last one
        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    return output_filename


if __name__ == "__main__":
    # Command line entry point: every option maps 1:1 onto a `get_issues` keyword argument
    parser = argparse.ArgumentParser(
        description="Fetch the issues of a GitHub repo and save them to a file"
    )
    parser.add_argument(
        "--update",
        action="store_true",
        default=UPDATE_FILE,
        help="Append the most recent issues to the existing file",
    )
    # `--update` alone can never switch the flag off when its default is True,
    # so provide an explicit negative flag that writes to the same destination.
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        default=UPDATE_FILE,
        help="Fail instead of appending when the output file already exists (unless --overwrite is given)",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=OVERWRITE_FILE,
        help="Delete the existing file and fetch everything again",
    )
    parser.add_argument("--output_filename", type=str, default=ISSUE_JSON_FILE, help="The name of the output file")
    parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION, help="The version of the GitHub API to use")
    parser.add_argument("--owner", type=str, default=OWNER, help="The owner of the repo")
    parser.add_argument("--repo", type=str, default=REPO, help="The name of the repo")
    parser.add_argument("--token", type=str, default=TOKEN, help="The GitHub token to use")
    parser.add_argument("--n_pages", type=int, default=-1, help="The number of pages to fetch. Useful for testing")
    args = parser.parse_args()
    get_issues(**vars(args))