# Spaces: liandev / open-source - Running - File size: 17,138 Bytes - c1e3e0a

import datetime as dt
import re
from datetime import datetime

import pandas as pd
import plotly.express as px
import streamlit as st

from library import *

# Browser-tab title, wide layout and favicon for the whole app.
st.set_page_config(
    page_title="GitHub Analytics Dashboard",
    layout="wide",
    page_icon="📊",
)

# Landing banner rendered above the tab bar.
st.title("🚀 Repo Radar")
st.text("""

This website provides a thorough analysis of the most popular open-source Github repositories.

Skip the search and start building with Repo Radar!!!

Our data was last updated on May 2025.

""")

import streamlit as st

# Full dataset, read from a semicolon-separated CSV.
repo_data = pd.read_csv('repo_data_clean.csv', sep=';')

# Column subset used by the "Find your repo" tab. The explicit copy keeps the
# in-place conversions below from being written into a temporary slice of
# repo_data (the pattern pandas reports as SettingWithCopyWarning).
usage_data = repo_data.iloc[:, [1, -5, 18, 19, -8, -4, -6, -7, 3, -3, -2]].copy()
# parse_to_list is star-imported from `library`; presumably it turns a
# serialized list cell into a Python list -- verify against that module.
usage_data.iloc[:, 1:-3] = usage_data.iloc[:, 1:-3].map(parse_to_list)

# Where the parsed cell is a list, keep only its first element.
usage_data['Main Programming Language'] = usage_data['Main Programming Language'].map(
    lambda x: x[0] if isinstance(x, list) else x
)

all_prog_langs = extract_unique(repo_data['Programming Languages used'].map(parse_to_list))


def count_langs(column):
    """Count, for every known language, the rows of ``repo_data[column]`` naming it.

    Args:
        column: Name of a text column of ``repo_data``.

    Returns:
        dict mapping each entry of ``all_prog_langs`` to the number of rows
        whose text contains that language as a whole token.

    Notes:
        Solved problem: 'java' and 'javascript' used to be detected as the
        same language, hence the whole-token match. The name is escaped so
        regex metacharacters ("C++") are matched literally, and lookarounds
        replace ``\\b`` because ``\\b`` cannot anchor after a non-word
        character. ``+`` and ``#`` are excluded after the token so "C" is not
        counted for "C++" / "C#" rows. Non-string cells (e.g. NaN) never match.
    """
    cells = [cell for cell in repo_data[column] if isinstance(cell, str)]
    counts = {}
    for key in all_prog_langs:
        pattern = re.compile(rf'(?<!\w){re.escape(str(key))}(?![\w+#])')
        counts[key] = sum(1 for cell in cells if pattern.search(cell))
    return counts

tabs = st.tabs(['🔎 Find your repo', '📈 Data Analysis', "🚨 What's next?"])

# First tab: search form, results table and a detail card for the selected repo.
with tabs[0]:
    st.header('What repo are you looking for?')

    name = st.text_input('Name', placeholder = 'Do you know the name of the repo?')

    col1, col2 = st.columns(2)
    col3, col4 = st.columns(2)

    with col1:
        cats = st.multiselect('Categories',
            placeholder = "What's  it for?",
            options=extract_unique(usage_data['Category']))
    with col2:
        nat_langs = st.multiselect('Languages available:',
            placeholder = "What language do you speak?",
            options=extract_unique(usage_data['Languages Available']))
    with col3:
        platforms = st.multiselect('Platform',
            placeholder = "What platform is it available on?",
            options=extract_unique(usage_data['Platforms']))
    with col4:
        tecs = st.multiselect('Techniques',
            placeholder = "What techniques does it use?",
            options=extract_unique(usage_data['Techniques used']))

    prog_langs = st.multiselect('Programming Languages used:',
            placeholder = "In what language was it written?",
            options=extract_unique(usage_data['Programming Languages used']))

    # Narrow the table step by step; an empty widget leaves the table untouched.
    usage_filtered = usage_data
    if name:
        # regex=False: the box holds free text typed by the user, so characters
        # such as "(", "[" or "+" must be matched literally instead of being
        # compiled as a pattern (which raised on input like "c++" or "(").
        # Edge backslashes are still stripped, as before.
        usage_filtered = usage_filtered[
            usage_filtered['Name'].str.contains(
                name.strip('\\'), case=False, na=False, regex=False
            )
        ]

    # `filter` is the helper star-imported from `library`, not the builtin
    # (the builtin cannot be called with a single argument). Presumably it
    # returns a per-cell predicate for the chosen values -- verify there.
    list_filters = (
        ('Category', cats),
        ('Languages Available', nat_langs),
        ('Platforms', platforms),
        ('Techniques used', tecs),
        ('Programming Languages used', prog_langs),
    )
    for column, wanted in list_filters:
        if wanted:
            usage_filtered = usage_filtered[usage_filtered[column].apply(filter(wanted))]

    # The last three columns are kept in the frame but not displayed.
    df_event = st.dataframe(usage_filtered.iloc[:, :-3],
                 key = 'repo',
                 hide_index = True,
                 on_select = 'rerun',
                 selection_mode = 'single-row')

    with st.expander('REPO DATA', expanded = True):
        selected_rows = df_event.selection['rows']
        # The selection is a position in the displayed table; ignore it when it
        # no longer fits the (possibly shorter) filtered table.
        if selected_rows and selected_rows[0] < len(usage_filtered):
            index = selected_rows[0]
            repo_name = usage_filtered.iloc[index]['Name']

            def current_repo(column):
                """Return `column` of the first repo_data row whose Name is `repo_name`."""
                return repo_data.loc[repo_data['Name'] == repo_name, column].values[0]

            st.header(f'{repo_name}')
            col5, col6 = st.columns(2, border=True)
            with col5:
                st.header('Repo Description: ')
                st.subheader('Purpose')
                st.write(current_repo('purpose'))
                st.subheader('Functionality')
                st.write(current_repo('functionality'))
            with col6:
                col7, col8 = st.columns(2)
                with col7:
                    for label, column in (
                        ('Stars:', 'stars'),
                        ('Forks:', 'forks'),
                        # Label was garbled to 'Ist.session_stateues:' by a bad search/replace.
                        ('Issues:', 'open_issues'),
                    ):
                        st.subheader(label)
                        st.text(current_repo(column))
                    st.subheader('Created:')
                    st.text(parse_date(current_repo('created_at')))
                with col8:
                    for label, column in (
                        ('Pull Requests:', 'pull_requests'),
                        ('Subscribers: ', 'subscribers_count'),
                        ('License:', 'License'),
                    ):
                        st.subheader(label)
                        st.text(current_repo(column))
                    st.subheader('Last Updated:')
                    st.text(parse_date(current_repo('updated_at')))

            st.subheader(f"URL link: {current_repo('url')}")
        else:
            st.subheader("Select a Repo to view its data")
            st.text('You can select a repo by clicking on the checkbox in the first column of the DataFrame')

with tabs[1]:
    st.header("💻 GitHub Analytics Dashboard")
    st.text('Open the sidebar to use our filters')
    st.subheader("📚 Programming Languages Distribution")

    # Per-language occurrence counts: anywhere in the language list vs. as the main language.
    total_repos = len(repo_data)
    used_counts = count_langs('Programming Languages used')
    as_main_language = count_langs('Main Programming Language')

    sort_criteria = st.selectbox(
        'Sort by:',
        options=['Times Used', 'Times Used as Main Language'],
    )
    # Translate the human-readable choice into the matching column name.
    if sort_criteria == 'Times Used as Main Language':
        sort_parsed = 'As Main'
    else:
        sort_parsed = sort_criteria

    # Both dicts are keyed by the same languages in the same order, so their
    # values line up row by row.
    language_count = pd.DataFrame({
        'Language': list(used_counts),
        'Times Used': list(used_counts.values()),
        'Used %': [hits / total_repos for hits in used_counts.values()],
        'As Main': list(as_main_language.values()),
        'As Main %': [hits / total_repos for hits in as_main_language.values()],
    })
    language_count = language_count.sort_values(sort_parsed, ascending=False)

    # License frequency table: absolute count and share of all licensed repos.
    license_totals = pd.DataFrame(repo_data['License'].value_counts())
    license_shares = pd.DataFrame(repo_data['License'].value_counts(normalize=True))
    license_stats = license_totals.merge(
        license_shares,
        left_index=True,
        right_index=True,
    ).reset_index()
    license_stats.columns = ['License', 'Count', 'Percent']

    # Filters: keep only languages / licenses whose repo count lies in the
    # [min_repos, max_repos] range picked in the sidebar.
    with st.sidebar:
        most_used = int(language_count['Times Used'].max())
        if 'max_repos' not in st.session_state:
            st.session_state.max_repos = most_used

        min_repos = st.slider(
            "Minimum repositories",
            min_value=1,
            max_value=most_used,
            # The default must not exceed the slider's upper bound on small datasets.
            value=min(15, most_used),
        )
        st.session_state.max_repos = st.slider(
            "Maximum repositories",
            min_value=min_repos,
            max_value=most_used,
            # The remembered maximum can fall below a freshly raised minimum;
            # clamp it into the slider's range instead of passing an
            # out-of-range default.
            value=min(max(st.session_state.max_repos, min_repos), most_used),
        )

        filtered_lang_count = language_count[
            language_count['Times Used'].between(min_repos, st.session_state.max_repos)
        ]
        filtered_licenses = license_stats[
            license_stats['Count'].between(min_repos, st.session_state.max_repos)
        ]
    
    # Side-by-side view of the filtered language frame: table left, bar chart right.
    table_col, chart_col = st.columns(2, border=True)

    with table_col:
        st.subheader("Frequency Table")
        styled_langs = filtered_lang_count.style.format(
            {'Used %': '{:.2%}', 'As Main %': '{:.2%}'}
        )
        st.dataframe(styled_langs, use_container_width=True, hide_index=True)

    with chart_col:
        st.subheader("Top Languages")
        top_lang = st.slider('Amount of languages shown:', 5, 20, 12)
        # The frame is already sorted by the chosen criterion, so head() yields the top N.
        lang_bar = px.bar(
            filtered_lang_count.head(top_lang),
            x='Language',
            y=['Times Used', 'As Main'],
            barmode='overlay',
            color='Language',
            text_auto=True,
            hover_name='Language',
            hover_data={'Used %': ':.2%', 'As Main %': ':.2%'},
        )
        # texttemplate='%{text:.2%}' was left disabled in the original.
        lang_bar.update_traces(textposition='outside', showlegend=False)
        lang_bar.update_layout(xaxis_title="Programming Language", yaxis_title="Count")
        st.plotly_chart(lang_bar, use_container_width=True)

    with st.expander("🀄 See More", expanded=False):
        st.subheader('🌲 Tree Map Visualization')
        st.text(f'Viewing and sorting by: {sort_criteria}')
        lang_tree = px.treemap(
            filtered_lang_count,
            names='Language',
            values=sort_parsed,
            path=['Language', sort_parsed],
            hover_name='Language',
            hover_data={
                'Times Used': True,
                'As Main': True,
                'Used %': ':.2%',
                'As Main %': ':.2%',
            },
        )
        lang_tree.update_layout(height=400, margin={'l': 0, 'r': 0, 'b': 40, 't': 0})
        st.plotly_chart(lang_tree, use_container_width=True)
    # NOTE(review): bare string expression kept verbatim; with Streamlit "magic"
    # enabled it is presumably written to the page as a divider -- confirm.
    """ --------------------------------------------------------------------- """

    st.subheader("📜 Software License Distribution")

    # Horizontal bars, one per license that passed the sidebar filter.
    licenses_ascending = filtered_licenses.sort_values('Count', ascending=True)
    license_bar = px.bar(
        licenses_ascending,
        x='Count',
        y='License',
        orientation='h',
        color='Count',
        text='Percent',
        color_continuous_scale='Teal',
        labels={'Count': 'Number of Repos'},
        hover_data={'Percent': ':.2%'},
    )
    # Bar labels show the share of repos as a percentage.
    license_bar.update_traces(texttemplate='%{text:.2%}', textposition='outside')
    license_bar.update_layout(yaxis={'categoryorder': 'total ascending'}, height=500)
    st.plotly_chart(license_bar, use_container_width=True)

    with st.expander("🧩 See More", expanded=False):
        data_col, map_col = st.columns(2, border=True)
        with data_col:
            st.subheader('📋 Tabular Data')
            styled_licenses = filtered_licenses.style.format({'Percent': '{:.2%}'})
            st.dataframe(styled_licenses, use_container_width=True)
        with map_col:
            st.subheader('🌳 Tree Map Visualization')
            license_tree = px.treemap(
                filtered_licenses,
                names='License',
                values='Count',
                path=['License'],
                hover_name='License',
                hover_data={'Percent': ':.2%'},
            )
            license_tree.update_layout(height=600, margin={'l': 0, 'r': 0, 'b': 40, 't': 0})
            st.plotly_chart(license_tree, use_container_width=True)

    # NOTE(review): bare string expression kept verbatim; with Streamlit "magic"
    # enabled it is presumably written to the page as a divider -- confirm.
    """ ----------------------------------------------------------------------- """

    st.subheader("🔧 Repo Stats According to License and Language")

    # Display names of the metrics and the matching repo_data column names
    # ('Pull Requests' -> 'pull_requests').
    metrics_options = ['Forks', 'Pull Requests', 'Stars', 'Open Issues']
    parsed_options = [metric.lower().replace(' ', '_') for metric in metrics_options]

    metric_choice = st.multiselect(
        "Select Stats to compare",
        options=metrics_options,
        default=metrics_options[:3],
    )

    parsed_choices = [metric.lower().replace(' ', '_') for metric in metric_choice]

    measurement = st.selectbox('How do you want to group the data',
                    options = ['Mean', 'Median', 'Total'])

    if metric_choice:
        # Upper bound for both threshold inputs: the largest value of any metric
        # column. Cast to a plain int so every numeric argument handed to
        # st.number_input has the same Python type.
        big_number = int(max(repo_data[metric].max() for metric in parsed_options))

        if 'min_threshold' not in st.session_state:
            # Never start below the input's min_value of 1.
            st.session_state.min_threshold = max(1, big_number // 50)
        if 'max_threshold' not in st.session_state:
            st.session_state.max_threshold = big_number

        col1, col2 = st.columns(2)
        with col1:
            st.session_state.min_threshold = st.number_input(f"Minimum quantity of {', '.join(metric_choice)}:", 1, big_number, st.session_state.min_threshold, step=1000)
        with col2:
            st.session_state.max_threshold = st.number_input(f"Maximum quantity of {', '.join(metric_choice)}:", 1, big_number, st.session_state.max_threshold, step=1000)

        # Keep a repo only when every selected metric lies inside the thresholds.
        chosen_values = repo_data[parsed_choices]
        condition = (
            (chosen_values >= st.session_state.min_threshold)
            & (chosen_values <= st.session_state.max_threshold)
        ).all(axis=1)
        metrics_df = repo_data[['Main Programming Language', 'License'] + parsed_choices]
        metrics_df = metrics_df[condition]

        # 'Total' used to plot the raw, ungrouped rows (one bar per repository);
        # it now sums per group, in line with the aggregated table below.
        aggregator = {'Mean': 'mean', 'Median': 'median', 'Total': 'sum'}[measurement]
        lang_metrics = metrics_df.groupby(
            'Main Programming Language', as_index=False
        )[parsed_choices].agg(aggregator)
        license_metrics = metrics_df.groupby(
            'License', as_index=False
        )[parsed_choices].agg(aggregator)

        top_bars = st.slider(
            'Amount of languages/licenses to show:',
            4, 100, 10)

        col3, col4 = st.columns(2, border=True)
        # (container, heading, grouped frame, x column, x-axis title) per chart.
        chart_specs = (
            (col3, 'By Main Language', lang_metrics, 'Main Programming Language', "Programming Language"),
            (col4, 'By License', license_metrics, 'License', "License"),
        )
        for container, heading, grouped, group_column, axis_title in chart_specs:
            with container:
                st.subheader(heading)
                # Groups are ranked by the first selected metric.
                fig = px.bar(
                    grouped.sort_values(parsed_choices[0], ascending=False).head(top_bars),
                    x=group_column,
                    y=parsed_choices,
                    barmode='group',
                    color_discrete_sequence=px.colors.qualitative.Pastel,
                    labels={'value': 'Count', 'variable': 'Metric', 'Main Programming Language': 'Language'}
                )
                fig.update_layout(
                    xaxis_title=axis_title,
                    yaxis_title="Count",
                    hovermode="x unified"
                )
                st.plotly_chart(fig, use_container_width=True)

        with st.expander("📊 See aggregated data:"):
            # Sums per (license, main language) pair, largest first by the first metric.
            agg_df = metrics_df.groupby(['License','Main Programming Language'])[parsed_choices].sum().sort_values(parsed_choices[0], ascending=False)

            st.dataframe(
                agg_df.style.background_gradient(cmap='Blues'),
                use_container_width=True
            )
    else:
        st.warning("Please select at least one metric to visualize")

with tabs[2]:
    # Static "what's next" page: a heading followed by one Markdown body.
    whats_next_md = '''

    We will be upgrading this website soon.\n

    Stay tuned for more updates.\n

    \n

    📸 Follow us on instagram:\n

    https://www.instagram.com/ds.open.source/

    \n

    👓 Keep in touch with our Github repository if you want to see how we code:\n

    https://github.com/liandeveloper/open_source/

    '''
    st.header('We are still not done!!!')
    st.markdown(whats_next_md)

    # TODO(review): disabled scatter-plot prototype, kept for reference only.
    # quant_1 = st.selectbox('1',
    #     options = ['Stars','Forks', 'Issues', 'Pull Requests']
    #     )
    # quant_2 = st.selectbox('2',
    #     options = ['Stars','Forks', 'Issues', 'Pull Requests'])
    # quali_1 =st.selectbox('3',
    #     options = ['License','Main Programming Language'])
    # qn1_p = 'open_issues' if quant_1 == 'Issues' else quant_1.replace(' ', '_').lower()
    # qn2_p = 'open_issues' if quant_2 == 'Issues' else quant_2.replace(' ', '_').lower()
    # ql1_p = 'License' if quali_1 == 'License' else quali_1

    # st.plotly_chart(px.scatter(
    #     repo_data,
    #     x=qn1_p, y=qn2_p,
    #     labels = [quant_1, quant_2],
    #     color = ql1_p
    #     )
    #     )
    # st.dataframe(repo_data)