File size: 7,292 Bytes
57c87c9
 
 
 
9325c4d
404478b
57c87c9
1396667
57c87c9
 
404478b
 
 
 
 
 
 
 
4dd059d
404478b
 
 
 
 
 
3170ddb
 
1396667
9325c4d
 
 
1396667
3170ddb
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3170ddb
 
 
 
 
9325c4d
3170ddb
9325c4d
404478b
 
3170ddb
 
 
 
 
404478b
b58eec2
 
3170ddb
b58eec2
9325c4d
b58eec2
3170ddb
 
 
 
404478b
b58eec2
3170ddb
404478b
9325c4d
404478b
 
3170ddb
 
4dd059d
 
3170ddb
4dd059d
9325c4d
4dd059d
 
3170ddb
 
404478b
 
57c87c9
b58eec2
57c87c9
570845b
 
 
 
404478b
1396667
 
 
 
 
4dd059d
404478b
1396667
 
404478b
570845b
404478b
570845b
 
 
 
 
 
 
 
 
 
 
 
b58eec2
570845b
404478b
570845b
 
 
404478b
 
 
 
 
 
 
570845b
404478b
570845b
404478b
570845b
 
 
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
570845b
b58eec2
570845b
404478b
570845b
 
404478b
 
570845b
 
 
57c87c9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from datetime import datetime

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from load_dataframe import get_data


def aggregated_data(df, aggregation_level="week"):
    """Render the share of papers with at least one artifact over time.

    Writes three things to the Streamlit page: a heading, a metric with the
    mean period-over-period growth rate of that share, and a line chart of
    the share per period.

    Args:
        df: Papers frame with ``num_models``, ``num_datasets`` and
            ``num_spaces`` columns. Its index must support ``resample``
            (presumably a ``DatetimeIndex`` -- confirm against ``get_data``).
            The frame is not modified.
        aggregation_level: ``"week"`` resamples with frequency ``'W'``; any
            other value resamples with ``'ME'`` (month end). The value is
            also used verbatim in the heading, metric label and x-axis label.
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Keep the flag in a standalone Series so the caller's frame is untouched.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # Resample per week or per month, depending on the requested level.
    freq = 'W' if aggregation_level == "week" else 'ME'
    resampled = has_artifact.resample(freq)
    total_papers = resampled.size()
    papers_with_artifacts = resampled.sum()

    # Periods without any paper yield 0/0 = NaN and show up as gaps in the plot.
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Growth from a 0% period is infinite; drop those so they don't swamp the mean.
    # np.nan (rather than pd.NA) keeps the series in a float dtype.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

    # Display the average growth rate as a big number; with fewer than two
    # usable periods there is nothing to average.
    if growth_rate.empty:
        growth_text = "N/A"
    else:
        growth_text = f"{growth_rate.mean():.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_text)

    # Draw on an explicit figure instead of the global pyplot state.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    st.pyplot(fig)
    # Close the figure so reruns of the script do not accumulate open figures.
    plt.close(fig)


# Columns shown (in this order) by every paper table on the page.
_PAPER_TABLE_COLUMNS = (
    "reached_out", "reached_out_link", "paper_page", "title",
    "github", "num_models", "num_datasets", "num_spaces",
)


def _render_paper_table(papers, key, extra_column_config=None):
    """Render one editable paper table with the shared column layout.

    Args:
        papers: Frame of papers to show.
        key: Streamlit widget key; must be unique per table.
        extra_column_config: Optional column-config entries merged over the
            shared link-column configuration.
    """
    column_config = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
    }
    if extra_column_config:
        column_config.update(extra_column_config)

    st.data_editor(
        papers,
        hide_index=True,
        column_order=_PAPER_TABLE_COLUMNS,
        column_config=column_config,
        width=2000,
        key=key,
    )


def display_data(df):
    """Render summary statistics and three paper tables for ``df``.

    Shows the percentage of papers with at least one artifact, a few counts,
    and editable tables for: papers with artifacts, papers without artifacts,
    and papers whose ``hf_mention`` equals 1 but that have no artifacts.

    Args:
        df: Papers frame with ``num_models``, ``num_datasets``,
            ``num_spaces``, ``github`` and ``hf_mention`` columns. The frame
            is not modified; helper columns are added to a private copy.
    """
    # Work on a copy: callers pass filtered slices, and adding columns to a
    # slice triggers SettingWithCopyWarning and may leak into the caller's frame.
    df = df.copy()

    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
    num_papers = df.shape[0]
    num_artifacts = df['has_artifact'].sum()
    # Guard against division by zero when the selected period has no papers.
    share = num_artifacts / num_papers if num_papers > 0 else 0
    percentage_of_at_least_one_artifact = round(share * 100, 2)

    # Editable bookkeeping columns, blank for every row.
    df['reached_out'] = False
    df['reached_out_link'] = ""

    st.markdown(f"""
    ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
    
    * Number of papers: {num_papers}
    * Number of papers with a Github link: {df['github'].notnull().sum()}
    * Number of papers with at least one HF artifact: {num_artifacts}
    """)

    st.write("Papers with at least one artifact")
    _render_paper_table(
        df[df['has_artifact']],
        key="papers_with_artifacts",
        extra_column_config={
            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
        },
    )

    st.write("Papers without artifacts")
    _render_paper_table(df[~df['has_artifact']], key="papers_without_artifacts")

    st.write("Papers with a HF mention in README but no artifacts")
    _render_paper_table(
        df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
        key="papers_with_hf_mention_no_artifacts",
    )


def _year_options(years):
    """Return the distinct, non-missing years in ``years`` as sorted ints.

    Falls back to the current year when ``years`` holds no usable value, so
    a year selector always has at least one option.
    """
    distinct = pd.Series(years).dropna().unique()
    return sorted(int(year) for year in distinct) or [datetime.today().year]


def _default_year_index(years, preferred):
    """Return the position of ``preferred`` in ``years``, else the last position."""
    return years.index(preferred) if preferred in years else len(years) - 1


def main():
    """Build the Streamlit dashboard page.

    The sidebar switches between a per-day/week/month view, which filters the
    papers and hands them to ``display_data``, and an aggregated view, which
    calls ``aggregated_data`` once per week and once per month.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Sidebar navigation between the two views.
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    df = get_data()

    # Drop the stray positional-index column if present (presumably left by a
    # CSV round-trip -- verify against get_data).
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    # The filters below rely on a datetime-like index (.date, .month, .isocalendar()).
    df = df.sort_index()

    if selection == "Daily/weekly/monthly data":
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

        if view_level == "day":
            # Day picker, defaulting to today.
            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
            day = pd.Timestamp(day)

            df = df[df.index.date == day.date()]

            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")

            display_data(df)

        elif view_level == "week":
            today_iso = datetime.today().isocalendar()
            iso = df.index.isocalendar()

            # A week number alone is ambiguous once the data spans several
            # years, so the ISO year is selected alongside it.
            years = _year_options(iso["year"])
            week_year = st.selectbox(
                "Select year",
                options=years,
                index=_default_year_index(years, today_iso[0]),
                key="week_view_year",
            )
            # ISO years can have 53 weeks; a bound of 52 would reject a
            # default of 53 during such a week.
            week_number = st.number_input("Select week", value=today_iso[1], min_value=1, max_value=53)

            in_week = (iso["week"] == week_number) & (iso["year"] == week_year)
            # Positional boolean mask: avoids label alignment on a non-unique index.
            df = df[in_week.to_numpy(dtype=bool, na_value=False)]

            st.write(f"Showing data for week {week_number} of {week_year}")

            display_data(df)

        elif view_level == "month":
            month_map = {
                'January': 1, 'February': 2, 'March': 3, 'April': 4,
                'May': 5, 'June': 6, 'July': 7, 'August': 8,
                'September': 9, 'October': 10, 'November': 11, 'December': 12
            }
            today = datetime.today()

            # Month picker, defaulting to the current month.
            month_str = st.selectbox("Select month", options=list(month_map), index=today.month - 1)
            # Offer the years that actually occur in the data.
            years = _year_options(df.index.year)
            year = st.selectbox(
                "Select year",
                options=years,
                index=_default_year_index(years, today.year),
                key="month_view_year",
            )

            df = df[(df.index.month == month_map[month_str]) & (df.index.year == year)]

            st.write(f"Showing data for {month_str} {year}")

            display_data(df)

    elif selection == "Aggregated data":
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")


# Build the dashboard only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()