Spaces:

open-source-metrics
/

dependency-history

Runtime error

App Files Files Community

dependency-history / app.py

patrickvonplaten

improve

aed3125 almost 2 years ago

raw

history blame

6.73 kB

	import os
	from datetime import datetime
	import json
	import matplotlib.ticker as ticker
	from huggingface_hub import snapshot_download
	from collections import defaultdict
	import pandas as pd
	import streamlit as st
	from datetime import datetime, timedelta
	import matplotlib.pyplot as plt

	# The charts in this app are drawn on very large canvases, so raise
	# matplotlib's base font size to keep axis labels and legends legible.
	plt.rcParams['font.size'] = 40

	# Hub dataset repositories holding the dependents snapshots of each tracked
	# library. This is a tuple rather than a set so that iteration order is
	# deterministic: anything derived from it (such as the keys of MAP) is then
	# listed in the same order on every run instead of depending on string
	# hash randomisation.
	libraries = (
	    "open-source-metrics/transformers-dependents",
	    "open-source-metrics/diffusers-dependents",
	    "open-source-metrics/pytorch-image-models-dependents",
	    "open-source-metrics/datasets-dependents",
	    "open-source-metrics/gradio-dependents",
	    "open-source-metrics/accelerate-dependents",
	    "open-source-metrics/evaluate-dependents",
	    "open-source-metrics/tokenizers-dependents",
	    "open-source-metrics/optimum-dependents",
	    "open-source-metrics/hub-docs-dependents",
	    "open-source-metrics/huggingface_hub-dependents",
	)

	# Short library name -> dataset repo id. The short name is the repo name with
	# the organisation prefix and the trailing "-dependents" suffix removed,
	# e.g. "open-source-metrics/hub-docs-dependents" -> "hub-docs".
	MAP = {
	    repo_id.rsplit("/", 1)[-1].rsplit("-", 1)[0]: repo_id
	    for repo_id in libraries
	}

	# Multi-select widget: the user picks any number of libraries to chart, and
	# the choices offered are the short library names that key MAP.
	selected_libraries = st.multiselect(label='Choose libraries', options=list(MAP))

	_DATE_FORMAT = '%Y_%m_%d'


	def _date_from_path(file_path):
	    """Return the snapshot date encoded in a JSON file's path, or None.

	    The date is read from the last three path components with the file
	    extension removed, e.g. ``.../2023/11/05.json`` -> 2023-11-05. Paths that
	    do not spell a valid date yield None so the caller can skip the file
	    instead of failing later while parsing the date.
	    """
	    stem, _ext = os.path.splitext(file_path)
	    parts = os.path.normpath(stem).split(os.sep)[-3:]
	    try:
	        return datetime.strptime("_".join(parts), _DATE_FORMAT)
	    except ValueError:
	        return None


	def _load_snapshots(directory):
	    """Collect per-date dependents counts and star totals found under directory.

	    Every ``.json`` file whose path encodes a date and whose content has both
	    a "name" and a "stars" entry contributes one data point per series.

	    Returns:
	        A pair of dicts keyed by datetime: (number of dependents, sum of
	        the stars of all dependents).
	    """
	    num_dependents = {}
	    num_stars_all_dependents = {}
	    for subdir, _dirs, files in os.walk(directory):
	        for file in files:
	            if not file.endswith('.json'):
	                continue
	            file_path = os.path.join(subdir, file)
	            date = _date_from_path(file_path)
	            if date is None:
	                # Not a dated snapshot (e.g. some other JSON file in the repo).
	                continue
	            with open(file_path, 'r', encoding='utf-8') as f:
	                data = json.load(f)
	            # NOTE(review): presumably "name" and "stars" are parallel lists
	            # with one entry per dependent repository - confirm against the
	            # dataset schema.
	            if "name" in data and "stars" in data:
	                num_dependents[date] = len(data["name"])
	                num_stars_all_dependents[date] = sum(data["stars"])
	    return num_dependents, num_stars_all_dependents


	def _drop_decreasing(series):
	    """Return series in date order without the points that go backwards.

	    A point is kept only if its value is not lower than the most recently
	    kept one; lower values are treated as incorrect entries and discarded.
	    """
	    kept = {}
	    previous = None
	    for date in sorted(series):
	        value = series[date]
	        if previous is None or value >= previous:
	            kept[date] = value
	            previous = value
	    return kept


	def _fill_daily_gaps(series):
	    """Return series with one integer value for every day in its date range.

	    Days missing between two known dates are filled by linear interpolation
	    between those neighbours. All values are truncated to int. An empty
	    input yields an empty dict.
	    """
	    known = sorted(series.items())
	    filled = {}
	    for (start, start_value), (end, end_value) in zip(known, known[1:]):
	        gap_days = (end - start).days
	        # offset 0 is the known start point itself; the end point is written
	        # by the next pair (or by the final assignment below).
	        for offset in range(gap_days):
	            value = start_value + (end_value - start_value) * offset / gap_days
	            filled[start + timedelta(days=offset)] = int(value)
	    if known:
	        last_date, last_value = known[-1]
	        filled[last_date] = int(last_value)
	    return filled


	def _to_daily_frame(series):
	    """Turn a date -> value dict into a DataFrame indexed by a daily 'Date'."""
	    frame = pd.DataFrame(list(series.items()), columns=['Date', 'Value'])
	    frame['Date'] = pd.to_datetime(frame['Date'])
	    frame = frame.set_index('Date')
	    if frame.empty:
	        # Nothing to resample; hand back the empty, correctly shaped frame.
	        return frame
	    # The gaps are already filled, so this mainly pins the index to a daily
	    # frequency; interpolate() stays as a safety net for any remaining NaN.
	    frame = frame.resample('D').asfreq()
	    frame['Value'] = frame['Value'].interpolate()
	    return frame


	def get_frames(option):
	    """Build the daily dependents history of one library.

	    Args:
	        option: Id of the Hub dataset repository that stores the dependents
	            snapshots; it is downloaded with ``repo_type="dataset"``.

	    Returns:
	        A tuple ``(num_dependents_df, num_cum_stars_df)``. Each is a
	        DataFrame with a 'Date' DatetimeIndex and a single 'Value' column:
	        the number of dependents, and the summed stars of all dependents.
	        Both frames are empty when the repository has no usable snapshot.
	    """
	    cached_folder = snapshot_download(option, repo_type="dataset")
	    num_dependents, num_stars_all_dependents = _load_snapshots(cached_folder)

	    # Each series is cleaned independently: drop backwards-going points,
	    # fill the missing days, then convert to a daily-indexed frame.
	    num_dependents_df = _to_daily_frame(
	        _fill_daily_gaps(_drop_decreasing(num_dependents))
	    )
	    num_cum_stars_df = _to_daily_frame(
	        _fill_daily_gaps(_drop_decreasing(num_stars_all_dependents))
	    )
	    return num_dependents_df, num_cum_stars_df


	def _plot_history(frames, title, ylabel):
	    """Draw one line chart with a curve per library and show it in Streamlit.

	    Args:
	        frames: Mapping of library name to a DataFrame whose index is used
	            for the x axis and whose 'Value' column is used for the y axis.
	        title: Chart title.
	        ylabel: Label of the y axis.
	    """
	    # Use an explicit Figure/Axes pair instead of pyplot's implicit
	    # "current figure" so the two charts cannot interfere with each other.
	    fig, ax = plt.subplots(figsize=(40, 24))
	    # Thousands separators and no decimals on the y axis.
	    ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

	    for name, frame in frames.items():
	        ax.plot(frame.index, frame['Value'], label=name, marker='o')

	    ax.set_xlabel('Date')
	    ax.set_ylabel(ylabel)
	    if frames:
	        # Without any curve there is nothing to put in a legend.
	        ax.legend()
	    ax.set_title(title)

	    st.pyplot(fig)
	    # Streamlit re-runs the script on every interaction; close the figure so
	    # pyplot does not keep accumulating these large canvases in memory.
	    plt.close(fig)


	# Download and process the history of every selected library once, then draw
	# both charts from the same frames.
	lib_frames = {name: get_frames(MAP[name]) for name in selected_libraries}

	_plot_history(
	    {name: df_dep for name, (df_dep, _) in lib_frames.items()},
	    title='Dependencies History',
	    ylabel='# Dependencies',
	)

	_plot_history(
	    {name: df_stars for name, (_, df_stars) in lib_frames.items()},
	    title='Dependents Stars History',
	    ylabel='SUM stars of dependencies',
	)