patrickvonplaten committed on
Commit
109623e
1 Parent(s): 5d18ec4
Files changed (1) hide show
  1. app.py +122 -107
app.py CHANGED
@@ -9,7 +9,9 @@ import streamlit as st
9
  from datetime import datetime, timedelta
10
  import matplotlib.pyplot as plt
11
 
12
- libraries = [
 
 
13
  "open-source-metrics/transformers-dependents",
14
  "open-source-metrics/diffusers-dependents",
15
  "open-source-metrics/pytorch-image-models-dependents",
@@ -21,130 +23,143 @@ libraries = [
21
  "open-source-metrics/optimum-dependents",
22
  "open-source-metrics/hub-docs-dependents",
23
  "open-source-metrics/huggingface_hub-dependents",
24
- ]
25
-
26
- option = st.selectbox(
27
- 'Choose library',
28
- libraries
29
- )
30
-
31
- cached_folder = snapshot_download(option, repo_type="dataset")
32
-
33
- num_dependents = defaultdict(int)
34
- num_stars_all_dependents = defaultdict(int)
35
 
36
- def load_json_files(directory):
37
- for subdir, dirs, files in os.walk(directory):
38
- for file in files:
39
- if file.endswith('.json'):
40
- file_path = os.path.join(subdir, file)
41
- date = "_".join(file_path.split(".")[-2].split("/")[-3:])
42
- with open(file_path, 'r') as f:
43
- data = json.load(f)
44
- # Process the JSON data as needed
45
- if "name" in data and "stars" in data:
46
- num_dependents[date] = len(data["name"])
47
- num_stars_all_dependents[date] = sum(data["stars"])
48
 
49
- # Replace 'your_directory_path' with the path to the directory containing your '11' and '12' folders
50
- load_json_files(cached_folder)
51
-
52
- def sort_dict_by_date(d):
53
- # Convert date strings to datetime objects and sort
54
- sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
55
- # Convert back to dictionary if needed
56
- return defaultdict(int, sorted_tuples)
57
 
58
- def remove_incorrect_entries(data):
59
- # Convert string dates to datetime objects for easier comparison
60
- sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
61
 
62
- # Initialize a new dictionary to store the corrected data
63
- corrected_data = defaultdict(int)
64
 
65
- # Variable to keep track of the number of dependents on the previous date
66
- previous_dependents = None
67
-
68
- for date, dependents in sorted_data:
69
- # If the current number of dependents is not less than the previous, add it to the corrected data
70
- if previous_dependents is None or dependents >= previous_dependents:
71
- corrected_data[date] = dependents
72
- previous_dependents = dependents
73
-
74
- return corrected_data
75
-
76
- def interpolate_missing_dates(data):
77
- # Convert string dates to datetime objects
78
- temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}
79
 
80
- # Find the min and max dates to establish the range
81
- min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
82
-
83
- # Generate a date range
84
- current_date = min_date
85
- while current_date <= max_date:
86
- # If the current date is missing
87
- if current_date not in temp_data:
88
- # Find previous and next dates that are present
89
- prev_date = current_date - timedelta(days=1)
90
- next_date = current_date + timedelta(days=1)
91
- while prev_date not in temp_data:
92
- prev_date -= timedelta(days=1)
93
- while next_date not in temp_data:
94
- next_date += timedelta(days=1)
95
-
96
- # Linear interpolation
97
- prev_value = temp_data[prev_date]
98
- next_value = temp_data[next_date]
99
- interpolated_value = prev_value + ((next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date)))
100
- temp_data[current_date] = interpolated_value
101
-
102
- current_date += timedelta(days=1)
103
-
104
- # Convert datetime objects back to string format
105
- interpolated_data = defaultdict(int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()})
106
 
107
- return interpolated_data
108
-
109
- num_dependents = remove_incorrect_entries(num_dependents)
110
- num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)
111
-
112
- num_dependents = interpolate_missing_dates(num_dependents)
113
- num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
114
-
115
- num_dependents = sort_dict_by_date(num_dependents)
116
- num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)
117
-
118
- num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
119
- num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
122
- num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')
123
 
124
- num_dependents_df.set_index('Date', inplace=True)
125
- num_dependents_df = num_dependents_df.resample('D').asfreq()
126
- num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()
127
 
128
- num_cum_stars_df.set_index('Date', inplace=True)
129
- num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
130
- num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()
131
 
132
- filename = "_".join(option.split("/"))
 
133
 
134
- # Plotting
135
- plt.figure(figsize=(10, 6))
136
- plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
137
- plt.plot(num_dependents_df.index, num_dependents_df['Value'], marker='o')
138
  plt.xlabel('Date')
139
- plt.ylabel('Number of Dependents')
 
140
  plt.title('Dependencies History')
141
  st.pyplot(plt)
142
 
143
  # Display in Streamlit
144
- plt.figure(figsize=(10, 6))
145
  plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
146
- plt.plot(num_cum_stars_df.index, num_cum_stars_df['Value'], marker='o')
 
 
 
147
  plt.xlabel('Date')
148
- plt.ylabel('Cumulative number of stars of Dependents')
 
149
  plt.title('Dependents Stars History')
150
  st.pyplot(plt)
 
9
  from datetime import datetime, timedelta
10
  import matplotlib.pyplot as plt
11
 
12
+ plt.rcParams.update({'font.size': 40})
13
+
14
+ libraries = {
15
  "open-source-metrics/transformers-dependents",
16
  "open-source-metrics/diffusers-dependents",
17
  "open-source-metrics/pytorch-image-models-dependents",
 
23
  "open-source-metrics/optimum-dependents",
24
  "open-source-metrics/hub-docs-dependents",
25
  "open-source-metrics/huggingface_hub-dependents",
26
+ }
 
 
 
 
 
 
 
 
 
 
27
 
28
+ MAP = {k.split("/")[-1].split("-")[0]: k for k in libraries}
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ selected_libraries = st.multiselect(
31
+ 'Choose libraries',
32
+ list(MAP.keys())
33
+ )
 
 
 
 
34
 
35
+ def get_frames(option):
36
+ cached_folder = snapshot_download(option, repo_type="dataset")
 
37
 
38
+ num_dependents = defaultdict(int)
39
+ num_stars_all_dependents = defaultdict(int)
40
 
41
+ def load_json_files(directory):
42
+ for subdir, dirs, files in os.walk(directory):
43
+ for file in files:
44
+ if file.endswith('.json'):
45
+ file_path = os.path.join(subdir, file)
46
+ date = "_".join(file_path.split(".")[-2].split("/")[-3:])
47
+ with open(file_path, 'r') as f:
48
+ data = json.load(f)
49
+ # Process the JSON data as needed
50
+ if "name" in data and "stars" in data:
51
+ num_dependents[date] = len(data["name"])
52
+ num_stars_all_dependents[date] = sum(data["stars"])
 
 
53
 
54
+ # Replace 'your_directory_path' with the path to the directory containing your '11' and '12' folders
55
+ load_json_files(cached_folder)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
+ def sort_dict_by_date(d):
58
+ # Convert date strings to datetime objects and sort
59
+ sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
60
+ # Convert back to dictionary if needed
61
+ return defaultdict(int, sorted_tuples)
62
+
63
+ def remove_incorrect_entries(data):
64
+ # Convert string dates to datetime objects for easier comparison
65
+ sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
66
+
67
+ # Initialize a new dictionary to store the corrected data
68
+ corrected_data = defaultdict(int)
69
+
70
+ # Variable to keep track of the number of dependents on the previous date
71
+ previous_dependents = None
72
+
73
+ for date, dependents in sorted_data:
74
+ # If the current number of dependents is not less than the previous, add it to the corrected data
75
+ if previous_dependents is None or dependents >= previous_dependents:
76
+ corrected_data[date] = dependents
77
+ previous_dependents = dependents
78
+
79
+ return corrected_data
80
+
81
+ def interpolate_missing_dates(data):
82
+ # Convert string dates to datetime objects
83
+ temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}
84
+
85
+ # Find the min and max dates to establish the range
86
+ min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
87
+
88
+ # Generate a date range
89
+ current_date = min_date
90
+ while current_date <= max_date:
91
+ # If the current date is missing
92
+ if current_date not in temp_data:
93
+ # Find previous and next dates that are present
94
+ prev_date = current_date - timedelta(days=1)
95
+ next_date = current_date + timedelta(days=1)
96
+ while prev_date not in temp_data:
97
+ prev_date -= timedelta(days=1)
98
+ while next_date not in temp_data:
99
+ next_date += timedelta(days=1)
100
+
101
+ # Linear interpolation
102
+ prev_value = temp_data[prev_date]
103
+ next_value = temp_data[next_date]
104
+ interpolated_value = prev_value + ((next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date)))
105
+ temp_data[current_date] = interpolated_value
106
+
107
+ current_date += timedelta(days=1)
108
+
109
+ # Convert datetime objects back to string format
110
+ interpolated_data = defaultdict(int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()})
111
+
112
+ return interpolated_data
113
+
114
+ num_dependents = remove_incorrect_entries(num_dependents)
115
+ num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)
116
+
117
+ num_dependents = interpolate_missing_dates(num_dependents)
118
+ num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
119
+
120
+ num_dependents = sort_dict_by_date(num_dependents)
121
+ num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)
122
+
123
+ num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
124
+ num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])
125
+
126
+ num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
127
+ num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')
128
+
129
+ num_dependents_df.set_index('Date', inplace=True)
130
+ num_dependents_df = num_dependents_df.resample('D').asfreq()
131
+ num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()
132
+
133
+ num_cum_stars_df.set_index('Date', inplace=True)
134
+ num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
135
+ num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()
136
+
137
+ return num_dependents_df, num_cum_stars_df
138
 
 
 
139
 
140
+ lib_frames = {l: get_frames(MAP[l]) for l in selected_libraries}
 
 
141
 
142
+ plt.figure(figsize=(40, 24))
143
+ plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
 
144
 
145
+ for l, (df_dep, _) in lib_frames.items():
146
+ plt.plot(df_dep.index, df_dep['Value'], label=l, marker='o')
147
 
 
 
 
 
148
  plt.xlabel('Date')
149
+ plt.ylabel('# Dependencies')
150
+ plt.legend()
151
  plt.title('Dependencies History')
152
  st.pyplot(plt)
153
 
154
  # Display in Streamlit
155
+ plt.figure(figsize=(40, 24))
156
  plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
157
+
158
+ for l, (_, df_stars) in lib_frames.items():
159
+ plt.plot(df_stars.index, df_stars['Value'], label=l, marker='o')
160
+
161
  plt.xlabel('Date')
162
+ plt.ylabel('SUM stars of dependencies')
163
+ plt.legend()
164
  plt.title('Dependents Stars History')
165
  st.pyplot(plt)