import json
import os
from collections import defaultdict
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import streamlit as st
from huggingface_hub import snapshot_download

plt.rcParams.update({'font.size': 40})

# Datasets on the Hub that track the GitHub dependents of each library.
libraries = {
    "open-source-metrics/transformers-dependents",
    "open-source-metrics/diffusers-dependents",
    "open-source-metrics/pytorch-image-models-dependents",
    "open-source-metrics/datasets-dependents",
    "open-source-metrics/gradio-dependents",
    "open-source-metrics/accelerate-dependents",
    "open-source-metrics/evaluate-dependents",
    "open-source-metrics/tokenizers-dependents",
    "open-source-metrics/optimum-dependents",
    "open-source-metrics/hub-docs-dependents",
    "open-source-metrics/huggingface_hub-dependents",
}

# Map short library names (e.g. "transformers") to their dataset repo IDs.
MAP = {"-".join(k.split("/")[-1].split("-")[:-1]): k for k in libraries}

selected_libraries = st.multiselect(
    'Choose libraries',
    list(MAP.keys())
)


def get_frames(option):
    cached_folder = snapshot_download(option, repo_type="dataset")

    num_dependents = defaultdict(int)
    num_stars_all_dependents = defaultdict(int)

    def load_json_files(directory):
        # The snapshot stores one JSON file per crawl date, nested as
        # <year>/<month>/<day>.json; recover the date key from the file path.
        for subdir, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith('.json'):
                    file_path = os.path.join(subdir, file)
                    date = "_".join(file_path.split(".")[-2].split("/")[-3:])
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                        # Record the number of dependents and the sum of their stars for this date
                        if "name" in data and "stars" in data:
                            num_dependents[date] = len(data["name"])
                            num_stars_all_dependents[date] = sum(data["stars"])

    load_json_files(cached_folder)

    def sort_dict_by_date(d):
        # Convert date strings to datetime objects and sort
        sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
        # Convert back to dictionary
        return defaultdict(int, sorted_tuples)

    def remove_incorrect_entries(data):
        # Convert string dates to datetime objects for easier comparison
        sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))

        # Initialize a new dictionary to store the corrected data
        corrected_data = defaultdict(int)

        # Variable to keep track of the number of dependents on the previous date
        previous_dependents = None

        for date, dependents in sorted_data:
            # If the current number of dependents is not less than the previous, add it to the corrected data
            if previous_dependents is None or dependents >= previous_dependents:
                corrected_data[date] = dependents
                previous_dependents = dependents

        return corrected_data

    def interpolate_missing_dates(data):
        # Convert string dates to datetime objects
        temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}

        # Find the min and max dates to establish the range
        min_date, max_date = min(temp_data.keys()), max(temp_data.keys())

        # Walk the full date range one day at a time
        current_date = min_date
        while current_date <= max_date:
            # If the current date is missing
            if current_date not in temp_data:
                # Find the nearest previous and next dates that are present
                prev_date = current_date - timedelta(days=1)
                next_date = current_date + timedelta(days=1)
                while prev_date not in temp_data:
                    prev_date -= timedelta(days=1)
                while next_date not in temp_data:
                    next_date += timedelta(days=1)

                # Linear interpolation between the two surrounding values
                prev_value = temp_data[prev_date]
                next_value = temp_data[next_date]
                interpolated_value = prev_value + (
                    (next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date))
                )
                temp_data[current_date] = interpolated_value

            current_date += timedelta(days=1)

        # Convert datetime objects back to string format
        interpolated_data = defaultdict(
            int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()}
        )
        return interpolated_data

    num_dependents = remove_incorrect_entries(num_dependents)
    num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)

    num_dependents = interpolate_missing_dates(num_dependents)
    num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)

    num_dependents = sort_dict_by_date(num_dependents)
    num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)

    num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
    num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])

    num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
    num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')

    # Resample to a daily frequency and interpolate any remaining gaps
    num_dependents_df.set_index('Date', inplace=True)
    num_dependents_df = num_dependents_df.resample('D').asfreq()
    num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()

    num_cum_stars_df.set_index('Date', inplace=True)
    num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
    num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()

    return num_dependents_df, num_cum_stars_df


lib_frames = {l: get_frames(MAP[l]) for l in selected_libraries}

# Plot the number of dependents over time for each selected library
fig = plt.figure(figsize=(40, 24))
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
for l, (df_dep, _) in lib_frames.items():
    plt.plot(df_dep.index, df_dep['Value'], label=l, marker='o')
plt.xlabel('Date')
plt.ylabel('# Dependents')
plt.legend()
plt.title('Dependents History')
st.pyplot(fig)  # Display in Streamlit

# Plot the summed stars of all dependents over time for each selected library
fig = plt.figure(figsize=(40, 24))
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
for l, (_, df_stars) in lib_frames.items():
    plt.plot(df_stars.index, df_stars['Value'], label=l, marker='o')
plt.xlabel('Date')
plt.ylabel('Sum of stars across dependents')
plt.legend()
plt.title('Dependents Stars History')
st.pyplot(fig)