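"""Streamlit app charting the GitHub dependents history of Hugging Face
libraries, using the open-source-metrics/*-dependents datasets on the Hub."""
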
import json
import os
from collections import defaultdict
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import streamlit as st
from huggingface_hub import snapshot_download

plt.rcParams.update({'font.size': 40})
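# Large fonts so axis labels stay legible in the 40 x 24 inch figures below.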

# Hub datasets that track the GitHub dependents of each library.
# A list (rather than a set) keeps the multiselect ordering deterministic.
libraries = [
    "open-source-metrics/transformers-dependents",
    "open-source-metrics/diffusers-dependents",
    "open-source-metrics/pytorch-image-models-dependents",
    "open-source-metrics/datasets-dependents",
    "open-source-metrics/gradio-dependents",
    "open-source-metrics/accelerate-dependents",
    "open-source-metrics/evaluate-dependents",
    "open-source-metrics/tokenizers-dependents",
    "open-source-metrics/optimum-dependents",
    "open-source-metrics/hub-docs-dependents",
    "open-source-metrics/huggingface_hub-dependents",
]

# Map short display names (e.g. "transformers") to their full dataset ids.
MAP = {"-".join(k.split("/")[-1].split("-")[:-1]): k for k in libraries}

selected_libraries = st.multiselect(
    'Choose libraries',
    list(MAP.keys())
)
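# For each selected library, build two daily time series from its snapshot:
# the number of dependent repositories and the summed stars of those repos.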

# Cache results across Streamlit reruns so dataset snapshots are not
# re-downloaded every time a widget changes.
@st.cache_data
def get_frames(option):
    # Download (or reuse from the local cache) the metrics dataset snapshot.
    cached_folder = snapshot_download(option, repo_type="dataset")
    
    num_dependents = defaultdict(int)
    num_stars_all_dependents = defaultdict(int)
    
    def load_json_files(directory):
        # Snapshots store one crawl per day as <year>/<month>/<day>.json.
        for subdir, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith('.json'):
                    file_path = os.path.join(subdir, file)
                    # Build a "YYYY_MM_DD" key from the last three path parts.
                    date = "_".join(os.path.splitext(file_path)[0].split(os.sep)[-3:])
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                        # "name" and "stars" are parallel lists with one entry
                        # per dependent repository.
                        if "name" in data and "stars" in data:
                            num_dependents[date] = len(data["name"])
                            num_stars_all_dependents[date] = sum(data["stars"])
    
    # Walk the downloaded snapshot and collect the per-day counts.
    load_json_files(cached_folder)
    
    def sort_dict_by_date(d):
        # Sort entries chronologically by parsing the "YYYY_MM_DD" keys
        sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
        # Rebuild as a defaultdict, now in chronological insertion order
        return defaultdict(int, sorted_tuples)
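
    # Dependent counts should only grow over time; a day where the count drops
    # is presumably an incomplete crawl, so remove_incorrect_entries skips it.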
    
    def remove_incorrect_entries(data):
        # Convert string dates to datetime objects for easier comparison
        sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
        
        # Initialize a new dictionary to store the corrected data
        corrected_data = defaultdict(int)
        
        # Variable to keep track of the number of dependents on the previous date
        previous_dependents = None
    
        for date, dependents in sorted_data:
            # If the current number of dependents is not less than the previous, add it to the corrected data
            if previous_dependents is None or dependents >= previous_dependents:
                corrected_data[date] = dependents
                previous_dependents = dependents
    
        return corrected_data
    
    def interpolate_missing_dates(data):
        # Convert string dates to datetime objects
        temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}

        # Guard against empty input (e.g. a dataset with no valid crawls yet),
        # which would otherwise make min()/max() raise
        if not temp_data:
            return defaultdict(int)

        # Find the min and max dates to establish the range
        min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
    
        # Generate a date range
        current_date = min_date
        while current_date <= max_date:
            # If the current date is missing
            if current_date not in temp_data:
                # Find previous and next dates that are present
                prev_date = current_date - timedelta(days=1)
                next_date = current_date + timedelta(days=1)
                while prev_date not in temp_data:
                    prev_date -= timedelta(days=1)
                while next_date not in temp_data:
                    next_date += timedelta(days=1)
    
                # Linear interpolation
                prev_value = temp_data[prev_date]
                next_value = temp_data[next_date]
                interpolated_value = prev_value + ((next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date)))
                temp_data[current_date] = interpolated_value
    
            current_date += timedelta(days=1)
    
        # Convert datetime objects back to string format
        interpolated_data = defaultdict(int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()})
        
        return interpolated_data
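
    # Cleaning pipeline: drop suspect entries, linearly fill missing days,
    # then sort chronologically.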
    
    num_dependents = remove_incorrect_entries(num_dependents)
    num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)
    
    num_dependents = interpolate_missing_dates(num_dependents)
    num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
    
    num_dependents = sort_dict_by_date(num_dependents)
    num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)
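
    # Convert to daily-indexed DataFrames; the resampling below acts as a
    # safety net for any calendar days still missing after interpolation.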
    
    num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
    num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])
    
    num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
    num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')
    
    num_dependents_df.set_index('Date', inplace=True)
    num_dependents_df = num_dependents_df.resample('D').asfreq()
    num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()
    
    num_cum_stars_df.set_index('Date', inplace=True)
    num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
    num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()
    
    return num_dependents_df, num_cum_stars_df


lib_frames = {l: get_frames(MAP[l]) for l in selected_libraries}
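# First chart: number of dependent repositories over time.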

fig_dependents = plt.figure(figsize=(40, 24))
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

for l, (df_dep, _) in lib_frames.items():
    plt.plot(df_dep.index, df_dep['Value'], label=l, marker='o')

plt.xlabel('Date')
plt.ylabel('# Dependents')
plt.legend()
plt.title('Dependents History')
st.pyplot(fig_dependents)

# Second chart: total stars across all dependent repositories.
fig_stars = plt.figure(figsize=(40, 24))
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

for l, (_, df_stars) in lib_frames.items():
    plt.plot(df_stars.index, df_stars['Value'], label=l, marker='o')

plt.xlabel('Date')
plt.ylabel('Total stars of dependents')
plt.legend()
plt.title('Dependents Stars History')
st.pyplot(fig_stars)