# Hugging Face Space: library dependents-history dashboard.
# NOTE(review): removed web-capture residue that was not part of the Python
# source (Spaces status banner, file size, commit hashes, line-number gutter);
# it made the file syntactically invalid.
import os
from datetime import datetime
import json
import matplotlib.ticker as ticker
from huggingface_hub import snapshot_download
from collections import defaultdict
import pandas as pd
import streamlit as st
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 40})

# Dataset repos on the Hub that track dependents for each HF library.
libraries = {
    "open-source-metrics/transformers-dependents",
    "open-source-metrics/diffusers-dependents",
    "open-source-metrics/pytorch-image-models-dependents",
    "open-source-metrics/datasets-dependents",
    "open-source-metrics/gradio-dependents",
    "open-source-metrics/accelerate-dependents",
    "open-source-metrics/evaluate-dependents",
    "open-source-metrics/tokenizers-dependents",
    "open-source-metrics/optimum-dependents",
    "open-source-metrics/hub-docs-dependents",
    "open-source-metrics/huggingface_hub-dependents",
    "open-source-metrics/peft-dependents",
}

# Map a short library name (e.g. "transformers") to its full dataset repo id
# by taking the repo basename and dropping the trailing "-dependents" part.
MAP = {}
for repo in libraries:
    short_name = "-".join(repo.split("/")[-1].split("-")[:-1])
    MAP[short_name] = repo

# Streamlit widget: which libraries to plot.
selected_libraries = st.multiselect(
    'Choose libraries',
    list(MAP.keys())
)
def get_frames(option):
    """Download the dependents-history dataset ``option`` from the Hub and
    return two daily DataFrames indexed by ``Date``:

    Returns:
        (num_dependents_df, num_cum_stars_df) — number of dependent repos per
        day, and the summed star count of all dependents per day. Both are
        resampled to daily frequency with linear interpolation. Empty
        DataFrames are returned if the snapshot contains no usable JSON.
    """
    cached_folder = snapshot_download(option, repo_type="dataset")
    num_dependents = defaultdict(int)
    num_stars_all_dependents = defaultdict(int)

    def load_json_files(directory):
        # Walk the snapshot; files are laid out as <year>/<month>/<day>.json
        # (the last three path components encode the scrape date).
        for subdir, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith('.json'):
                    file_path = os.path.join(subdir, file)
                    # Build "YYYY_MM_DD" from the last three path components.
                    # splitext + os.sep is robust to Windows separators and to
                    # dots elsewhere in the cache path, unlike splitting the
                    # whole string on "." and "/".
                    root, _ext = os.path.splitext(file_path)
                    date = "_".join(root.split(os.sep)[-3:])
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                    if "name" in data and "stars" in data:
                        num_dependents[date] = len(data["name"])
                        num_stars_all_dependents[date] = sum(data["stars"])

    load_json_files(cached_folder)

    def sort_dict_by_date(d):
        # Order entries chronologically; keys are "YYYY_MM_DD" strings.
        sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
        return defaultdict(int, sorted_tuples)

    def remove_incorrect_entries(data):
        # Dependent counts are cumulative, so a day whose value drops below
        # the previous kept day is a bad scrape — drop such entries.
        sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
        corrected_data = defaultdict(int)
        previous_value = None
        for date, value in sorted_data:
            if previous_value is None or value >= previous_value:
                corrected_data[date] = value
                previous_value = value
        return corrected_data

    def interpolate_missing_dates(data):
        # Fill day-sized gaps by linear interpolation between the nearest
        # surrounding dates that do have data.
        temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}
        min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
        current_date = min_date
        while current_date <= max_date:
            if current_date not in temp_data:
                # Scan outwards for the closest populated neighbours.
                prev_date = current_date - timedelta(days=1)
                next_date = current_date + timedelta(days=1)
                while prev_date not in temp_data:
                    prev_date -= timedelta(days=1)
                while next_date not in temp_data:
                    next_date += timedelta(days=1)
                prev_value = temp_data[prev_date]
                next_value = temp_data[next_date]
                fraction = (current_date - prev_date) / (next_date - prev_date)
                temp_data[current_date] = prev_value + (next_value - prev_value) * fraction
            current_date += timedelta(days=1)
        # Back to string keys; interpolated values are truncated to int.
        return defaultdict(int, {d.strftime('%Y_%m_%d'): int(v) for d, v in temp_data.items()})

    def _to_daily_frame(counts):
        # Shared tail: Date-indexed frame, resampled to daily frequency,
        # with pandas interpolation covering any remaining gaps.
        df = pd.DataFrame(list(counts.items()), columns=['Date', 'Value'])
        df['Date'] = pd.to_datetime(df['Date'], format='%Y_%m_%d')
        df.set_index('Date', inplace=True)
        df = df.resample('D').asfreq()
        df['Value'] = df['Value'].interpolate()
        return df

    num_dependents = remove_incorrect_entries(num_dependents)
    num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)

    # Guard: with no usable data, interpolate_missing_dates would raise on
    # min()/max() of an empty dict. Return empty frames instead.
    if not num_dependents or not num_stars_all_dependents:
        empty = pd.DataFrame(columns=['Date', 'Value']).set_index('Date')
        return empty, empty.copy()

    num_dependents = interpolate_missing_dates(num_dependents)
    num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
    num_dependents = sort_dict_by_date(num_dependents)
    num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)

    return _to_daily_frame(num_dependents), _to_daily_frame(num_stars_all_dependents)
# Fetch the two time series (dependents count, cumulative stars) per library.
lib_frames = {l: get_frames(MAP[l]) for l in selected_libraries}


def _plot_history(series_by_library, ylabel, title):
    """Render one line per library into a fresh figure and hand the explicit
    Figure to Streamlit (calling st.pyplot with the pyplot module is
    deprecated and not thread-safe)."""
    fig = plt.figure(figsize=(40, 24))
    plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    for name, df in series_by_library.items():
        plt.plot(df.index, df['Value'], label=name, marker='o')
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    plt.legend()
    plt.title(title)
    st.pyplot(fig)


# Chart 1: number of dependent repositories over time.
_plot_history({l: dep for l, (dep, _) in lib_frames.items()},
              '# Dependencies', 'Dependencies History')

# Chart 2: summed star count of all dependents over time.
_plot_history({l: stars for l, (_, stars) in lib_frames.items()},
              'SUM stars of dependencies', 'Dependents Stars History')