"""Streamlit dashboard showing how many daily papers have Hugging Face artifacts.

A paper "has an artifact" when at least one of its ``num_models``,
``num_datasets`` or ``num_spaces`` counts is greater than zero.
"""

from datetime import datetime

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# TODO: load the data through `load_dataframe.get_data` instead of a local CSV.
# from load_dataframe import get_data

# Location of the enriched daily-papers CSV read by `load_data`.
DATA_PATH = '/Users/nielsrogge/Downloads/daily_papers_enriched (1).csv'

# Columns shown (in this order) in the paper tables.
TABLE_COLUMNS = ("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces")

# Month names in calendar order; position + 1 is the month number.
MONTH_NAMES = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Years offered by the month view.
YEAR_OPTIONS = ["2024"]


def _has_artifact(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask, aligned with *df*, of papers with any artifact."""
    return (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)


def _show_papers(df: pd.DataFrame) -> None:
    """Render *df* as a table with clickable paper and GitHub links."""
    st.dataframe(
        df,
        hide_index=True,
        column_order=TABLE_COLUMNS,
        column_config={
            "github": st.column_config.LinkColumn(),
            "paper_page": st.column_config.LinkColumn(),
        },
        width=2000,
    )


def aggregated_data(df: pd.DataFrame, aggregation_level: str = "week") -> None:
    """Plot the percentage of papers with at least one artifact over time.

    Args:
        df: Papers indexed by a ``DatetimeIndex`` (required by ``resample``),
            with ``num_models``, ``num_datasets`` and ``num_spaces`` columns.
            The frame is not modified.
        aggregation_level: ``"week"`` resamples weekly; any other value
            resamples monthly. The value is also used as the x-axis label.
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Work on a standalone mask so the caller's DataFrame is left untouched.
    has_artifact = _has_artifact(df)

    # NOTE(review): recent pandas releases deprecate the 'M' alias in favour
    # of 'ME'; 'M' is kept here so older pandas versions keep working.
    freq = 'W' if aggregation_level == "week" else 'M'
    papers_per_period = has_artifact.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Periods without any paper yield 0/0 = NaN and show up as gaps in the line.
    percentage_papers_with_artifacts = (papers_with_artifacts / papers_per_period) * 100

    # Use an explicit figure rather than pyplot's global state, so each call
    # draws on (and hands Streamlit) its own figure.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    st.pyplot(fig)
    # Release the figure so figures do not pile up across calls.
    plt.close(fig)


def display_data(df: pd.DataFrame) -> None:
    """Show summary counts and the paper tables for *df*.

    Writes the number of papers, the number with a non-null ``github`` value
    and the number with at least one artifact, followed by one table of papers
    with artifacts and one of papers without. The frame is not modified.

    Args:
        df: Papers with ``github``, ``num_models``, ``num_datasets`` and
            ``num_spaces`` columns, plus the columns listed in
            ``TABLE_COLUMNS``.
    """
    has_artifact = _has_artifact(df)
    num_artifacts = int(has_artifact.sum())

    st.markdown("\n".join([
        f"## Number of papers: {df.shape[0]}",
        f"#### Number of papers with a Github link: {df['github'].notnull().sum()}",
        f"#### Number of papers with at least one HF artifact: {num_artifacts}",
    ]))

    st.write("Papers with at least one artifact")
    _show_papers(df[has_artifact])

    st.write("Papers without artifacts")
    _show_papers(df[~has_artifact])


def load_data(path: str = DATA_PATH) -> pd.DataFrame:
    """Read the papers CSV at *path* and index it by date, sorted ascending."""
    df = pd.read_csv(path)
    # 'Unnamed: 0' is dropped when present; a CSV without it loads as well.
    df = df.drop(columns=['Unnamed: 0'], errors="ignore")

    df = df.set_index('date')
    df.index = pd.to_datetime(df.index)
    return df.sort_index()


def _display_period(df: pd.DataFrame) -> None:
    """Let the user pick a day, week or month and display the matching papers."""
    view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

    if view_level == "day":
        # Date picker defaulting to today.
        day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))
        selected = df[df.index.date == day.date()]
        st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
    elif view_level == "week":
        # ISO years can contain 53 weeks; allowing 53 keeps the default
        # (the current ISO week) within the permitted range.
        week_number = st.number_input(
            "Select week",
            value=datetime.today().isocalendar()[1],
            min_value=1,
            max_value=53,
        )
        # NOTE(review): the week filter ignores the year, so data spanning
        # several years would be mixed together - confirm this is intended.
        with_week = df.assign(week=df.index.isocalendar().week)
        selected = with_week[with_week['week'] == week_number]
        st.write(f"Showing data for week {week_number}")
    else:  # "month" is the only remaining option of the selectbox
        month_str = st.selectbox("Select month", options=MONTH_NAMES)
        year_str = st.selectbox("Select year", options=YEAR_OPTIONS)
        month = MONTH_NAMES.index(month_str) + 1
        year = int(year_str)
        selected = df[(df.index.month == month) & (df.index.year == year)]
        st.write(f"Showing data for {month_str} {year_str}")

    display_data(selected)


def main() -> None:
    """Render the dashboard: a sidebar selector plus the chosen view."""
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Two views: per-period tables, or aggregated trend plots.
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    # TODO use this instead
    # df = get_data()
    df = load_data()

    if selection == "Daily/weekly/monthly data":
        _display_period(df)
    elif selection == "Aggregated data":
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")
    else:
        st.write("Error: selection not recognized")


if __name__ == "__main__":
    main()