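"""Collect URLs of Python repositories on GitHub (size 5..5000 KB, 10+ stars)
created since 2015, scanning the search API in ~30-day windows and
checkpointing the last processed date so interrupted runs can resume."""
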
import requests
import time
from tqdm import tqdm
from dotenv import load_dotenv
import os
from datetime import datetime, timedelta

# Load GITHUB_PAT (a GitHub personal access token) from a local .env file
load_dotenv()

url = "https://api.github.com/search/repositories"
headers = {"Authorization": "token " + os.environ["GITHUB_PAT"]}
timeout_duration = 10  # Timeout for requests in seconds
output_file = "repositories.txt"
last_date_file = "last_date.txt"


def fetch_repositories_by_date_range(start_date, end_date):
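    """Append the html_url of every matching repository created in
    [start_date, end_date] to the output file, paging through the
    GitHub search API 100 results at a time."""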
    page = 1
    query = f"language:Python size:5..5000 stars:>=10 created:{start_date}..{end_date}"
    params = {
        "q": query,
        "per_page": 100,
        "sort": "stars",
    }
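    # The search API caps results at 1,000 per query; the ~30-day windows
    # plus the size/star filters keep most queries under that cap. Beyond it,
    # GitHub returns an error response, which the else branch below treats
    # as the end of this window.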

    with open(output_file, "a") as file, tqdm(
        desc=f"Fetching {start_date} to {end_date}", unit="page"
    ) as pbar:
        while True:
            params["page"] = page
            try:
                response = requests.get(
                    url, headers=headers, params=params, timeout=timeout_duration
                )

                if response.status_code == 200:
                    data = response.json()
                    repositories = data.get("items", [])

                    for repo in repositories:
                        file.write(f"{repo['html_url']}\n")

                    if len(repositories) < params["per_page"]:
                        break  # End if fewer results are returned

                    page += 1
                    pbar.update(1)
                    time.sleep(1)  # Pause between pages; the authenticated search API allows 30 requests/min

                elif response.status_code in (403, 429):
                    # GitHub signals rate limiting with 403 or 429; wait until
                    # the window resets (x-ratelimit-reset is a Unix timestamp)
                    reset_time = int(response.headers.get("x-ratelimit-reset", 0))
                    wait_time = max(reset_time - int(time.time()), 1)
                    print(f"Rate limit exceeded. Waiting for {wait_time} seconds...")
                    time.sleep(wait_time)

                else:
                    print("Error:", response.status_code, response.json())
                    break

            except requests.exceptions.Timeout:
                print("Request timed out. Retrying...")
                time.sleep(5)  # Delay before retrying on timeout


def generate_date_ranges(start_year=2015):
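    """Yield (start, end) date strings in ~30-day windows from start_year to
    today, resuming from last_date.txt if a previous run saved progress.
    Note: consecutive windows share a boundary day, so a repository created
    on that day can appear twice in the output."""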
    end_date = datetime.now()
    current_date = datetime(start_year, 1, 1)

    if os.path.exists(last_date_file):
        with open(last_date_file, "r") as f:
            last_date_str = f.read().strip()
            if last_date_str:
                current_date = datetime.strptime(last_date_str, "%Y-%m-%d")

    while current_date < end_date:
        next_date = current_date + timedelta(days=30)  # Advance in ~30-day windows
        window_end = min(next_date, end_date)
        yield current_date.strftime("%Y-%m-%d"), window_end.strftime("%Y-%m-%d")
        current_date = next_date


# Run the script across date ranges with progress tracking
date_ranges = list(generate_date_ranges())
with tqdm(total=len(date_ranges), desc="Total Date Ranges") as date_pbar:
    for start_date, end_date in date_ranges:
        print(f"Fetching repositories created between {start_date} and {end_date}")
        fetch_repositories_by_date_range(start_date, end_date)

        # Save last processed date
        with open(last_date_file, "w") as f:
            f.write(end_date)

        date_pbar.update(1)