File size: 3,790 Bytes
9b744c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import json

import argparse

import requests
import os
import numpy as np
import json
import datetime
import logging

# Configure logging at import time so the per-page progress messages emitted
# while downloading are visible at INFO level.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

# Timestamp (naive local time) captured once when the module is loaded.
today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

# Default repository to query and the GitHub REST API version to request.
OWNER = "huggingface"
REPO = "transformers"
GITHUB_API_VERSION = "2022-11-28"
# API token read from the environment; None when GITHUB_TOKEN is not set.
TOKEN = os.environ.get("GITHUB_TOKEN")
# Default output path; entries are stored one JSON object per line.
JSON_FILE = "issues.json"
# Defaults for calling `get_issues` programmatically. Note that the command
# line entry point at the bottom of this file uses different defaults
# (update enabled, overwrite disabled).
UPDATE_FILE = False
OVERWRITE_FILE = True


def get_last_entry(file_path):
    """
    Return the last JSON entry stored in a JSON-lines file.

    The file is streamed line by line, so only one line is held in memory at
    a time, and blank or whitespace-only lines (for example trailing newlines)
    are ignored.

    Args:
        file_path: Path of the file holding one JSON document per line.

    Returns:
        The decoded JSON value of the last non-blank line.

    Raises:
        IndexError: If the file contains no non-blank line.
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    last_line = None
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Remember only lines with content so trailing blank lines do not
            # end up being handed to the JSON parser.
            if line.strip():
                last_line = line

    if last_line is None:
        # Same exception type the previous `readlines()[-1]` lookup produced
        # on an empty file, but with an actionable message.
        raise IndexError(f"No entries found in {file_path}")
    return json.loads(last_line)


def get_last_issue_number(file_path):
    """
    Return the `number` field of the last entry stored in `file_path`.

    When the file does not exist there is nothing stored yet, and 0 is
    returned instead.
    """
    # Guard clause: a missing file means no issue has been saved so far.
    if not os.path.exists(file_path):
        return 0

    newest = get_last_entry(file_path=file_path)
    return newest["number"]


def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Download the issues of a GitHub repository and append them to a JSON-lines file.

    Issues are requested in creation order (oldest first), 100 per page, for
    every state. When the output file already holds entries, downloading
    resumes from the page derived from the last stored issue number, and
    entries whose number is not greater than that one are skipped.

    Args:
        overwrite: Delete an existing output file before downloading.
        update: Allow appending to an existing output file.
        output_filename: Path of the JSON-lines file to write.
        github_api_version: Value sent in the `X-GitHub-Api-Version` header.
        owner: Owner of the repository to query.
        repo: Name of the repository to query.
        token: GitHub API token. A bare token is sent as `Bearer <token>`; a
            value that already starts with `Bearer ` or `token ` is sent as
            given. When empty or None, no `Authorization` header is sent.
        n_pages: Maximum number of pages to fetch; values <= 0 mean no limit.

    Returns:
        The path of the output file (`output_filename`).

    Raises:
        ValueError: If the output file exists and neither `update` nor
            `overwrite` is set, or if a request does not return status 200.
    """
    file_exists = os.path.exists(output_filename)

    # Fail fast, before the existing file is read or any request is made.
    if file_exists and not update and not overwrite:
        raise ValueError(f"File {output_filename} already exists")

    # If file exists and we want to overwrite it, delete it
    if file_exists and overwrite:
        logger.info(f"Deleting file {output_filename}")
        os.remove(output_filename)

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    if token:
        auth_value = f"{token}".strip()
        # The Authorization header needs a scheme; add one unless the caller
        # already supplied it as part of the token string.
        if not auth_value.lower().startswith(("bearer ", "token ")):
            auth_value = f"Bearer {auth_value}"
        headers["Authorization"] = auth_value

    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100
    # NOTE(review): resuming derives the page from the last stored issue
    # number, which assumes issue numbers map one-to-one onto positions in the
    # listing. If the numbering has gaps, entries could be skipped -- confirm.
    page = last_issue_number // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot block the download forever.
    request_timeout = 60

    # None means "no limit"; otherwise stop before reaching this page number.
    page_limit = (n_pages + page) if n_pages > 0 else None
    while page_limit is None or page < page_limit:
        # Send the GET request
        response = requests.get(
            url, headers=headers, params=query_params, timeout=request_timeout
        )

        if response.status_code != 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if not json_response:
            break

        with open(output_filename, "a", encoding="utf-8") as f:
            for value in json_response:
                # Skip entries that are already stored from a previous run.
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        # A short page means this was the last one.
        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    return output_filename


if __name__ == "__main__":
    # Command line entry point: every option maps one-to-one onto a keyword
    # argument of `get_issues`.
    parser = argparse.ArgumentParser(
        description="Download the issues of a GitHub repository into a JSON-lines file."
    )
    # `--update` is on by default, so on its own the flag could never be
    # switched off. `--no-update` writes False into the same destination,
    # which makes the "file already exists" guard reachable from the CLI.
    parser.add_argument(
        "--update",
        dest="update",
        action="store_true",
        default=True,
        help="Append to an existing output file (default).",
    )
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        help="Refuse to append to an existing output file.",
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help="Delete an existing output file before downloading.",
    )
    parser.add_argument("--output_filename", type=str, default=JSON_FILE, help="Path of the output file.")
    parser.add_argument(
        "--github_api_version",
        type=str,
        default=GITHUB_API_VERSION,
        help="Value sent in the X-GitHub-Api-Version header.",
    )
    parser.add_argument("--owner", type=str, default=OWNER, help="Owner of the repository to query.")
    parser.add_argument("--repo", type=str, default=REPO, help="Name of the repository to query.")
    parser.add_argument(
        "--token",
        type=str,
        default=TOKEN,
        help="GitHub API token (defaults to the GITHUB_TOKEN environment variable).",
    )
    parser.add_argument(
        "--n_pages",
        type=int,
        default=-1,
        help="Maximum number of pages to fetch; values <= 0 mean no limit.",
    )
    args = parser.parse_args()
    get_issues(**vars(args))