Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import datetime as dt | |
from datetime import datetime | |
from library import * | |
# Page-wide settings (browser title, layout mode, icon), gathered in one
# mapping and handed to Streamlit in a single call.
_page_settings = {
    "page_title": "GitHub Analytics Dashboard",
    "layout": "wide",
    "page_icon": "π",
}
st.set_page_config(**_page_settings)

# Landing banner: the app name followed by a short plain-text introduction.
st.title("π Repo Radar")
_intro_text = """
This website provides a thorough analysis of the most popular open-source Github repositories.
Skip the search and start buliding with Repo Radar!!!
Our data was last updated on May 2025.
"""
st.text(_intro_text)
import streamlit as st | |
import re  # imported here so this block is self-contained; used by count_langs

# Load the cleaned repository dataset (semicolon-separated CSV).
repo_data = pd.read_csv('repo_data_clean.csv', sep=';')

# Columns shown in the "Find your repo" table, picked by position.
# .copy() gives usage_data its own storage, so the in-place edits below do not
# trigger pandas' SettingWithCopyWarning and repo_data keeps its raw strings
# (count_langs relies on repo_data still holding text).
usage_data = repo_data.iloc[:, [1, -5, 18, 19, -8, -4, -6, -7, 3, -3, -2]].copy()

# Every column except the first and the last three is run cell-by-cell through
# parse_to_list (helper star-imported from `library`).
usage_data.iloc[:, 1:-3] = usage_data.iloc[:, 1:-3].map(parse_to_list)

# Keep only the first entry when the parsed main language is a list.
usage_data['Main Programming Language'] = usage_data['Main Programming Language'].map(
    lambda x: x[0] if isinstance(x, list) else x
)

# Distinct programming languages found across the whole dataset.
all_prog_langs = extract_unique(repo_data['Programming Languages used'].map(parse_to_list))


def count_langs(column):
    """Count, per known language, the rows of ``repo_data`` mentioning it.

    Args:
        column: Name of a text column of ``repo_data`` to search.

    Returns:
        dict mapping every entry of ``all_prog_langs`` to the number of rows
        whose ``column`` value contains that language as a whole token.
    """
    counts = {}
    for key in all_prog_langs:
        # Problem (solved): 'java' and 'javascript' used to be detected as the
        # same language, hence the whole-token match.
        # re.escape stops names such as 'c++' from being parsed as regex
        # syntax; the lookarounds behave like \b for ordinary names but still
        # match when the name begins or ends with a non-word character.
        pattern = rf'(?<!\w){re.escape(str(key))}(?!\w)'
        counts[key] = int(repo_data[column].str.contains(pattern, na=False).sum())
    return counts
# Top-level navigation: one container per tab, filled in by the sections below.
tabs = st.tabs(['π Find your repo', 'π Data Analysis', "π¨ What's next?"])

# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the widget flow - confirm the intended layout.
with tabs[0]:
    st.header('What repo are you looking for?')
    name = st.text_input('Name', placeholder='Do you know the name of the repo?')

    # Two rows of two multiselect filters, then one full-width filter.
    col1, col2 = st.columns(2)
    col3, col4 = st.columns(2)
    with col1:
        cats = st.multiselect(
            'Categories',
            placeholder="What's it for?",
            options=extract_unique(usage_data['Category']),
        )
    with col2:
        nat_langs = st.multiselect(
            'Languages available:',
            placeholder="What language do you speak?",
            options=extract_unique(usage_data['Languages Available']),
        )
    with col3:
        platforms = st.multiselect(
            'Platform',
            placeholder="What platform is it available on?",
            options=extract_unique(usage_data['Platforms']),
        )
    with col4:
        tecs = st.multiselect(
            'Techniques',
            placeholder="What techniques does it use?",
            options=extract_unique(usage_data['Techniques used']),
        )
    prog_langs = st.multiselect(
        'Programming Languages used:',
        placeholder="In what language was it written?",
        options=extract_unique(usage_data['Programming Languages used']),
    )

    # Narrow the table step by step; filters left empty are skipped.
    usage_filtered = usage_data
    if name:
        # Literal, case-insensitive substring search. regex=False means user
        # input such as '(' or 'c++' can no longer raise re.error.
        name_mask = usage_filtered['Name'].str.contains(
            name.strip(), case=False, na=False, regex=False
        )
        usage_filtered = usage_filtered[name_mask]

    list_filters = {
        'Category': cats,
        'Languages Available': nat_langs,
        'Platforms': platforms,
        'Techniques used': tecs,
        'Programming Languages used': prog_langs,
    }
    for column, chosen in list_filters.items():
        if chosen:
            # `filter` is the helper star-imported from `library` (it shadows
            # the builtin): called with the selected values, its result is
            # applied to each cell as the row predicate.
            usage_filtered = usage_filtered[usage_filtered[column].apply(filter(chosen))]

    # The last three columns are kept in the frame but hidden from the table.
    df_event = st.dataframe(
        usage_filtered.iloc[:, :-3],
        key='repo',
        hide_index=True,
        on_select='rerun',
        selection_mode='single-row',
    )

    with st.expander('REPO DATA', expanded=True):
        selected_rows = df_event.selection['rows']
        # Guard against a selection position that no longer exists in the
        # filtered table.
        if selected_rows and selected_rows[0] < len(usage_filtered):
            index = selected_rows[0]
            repo_name = usage_filtered.iloc[index]['Name']

            def current_repo(column):
                """Return `column` of the first repo_data row named `repo_name`."""
                return repo_data.loc[repo_data['Name'] == repo_name, column].values[0]

            st.header(f'{repo_name}')
            col5, col6 = st.columns(2, border=True)
            with col5:
                st.header('Repo Description: ')
                st.subheader('Purpose')
                st.write(current_repo('purpose'))
                st.subheader('Functionality')
                st.write(current_repo('functionality'))
            with col6:
                col7, col8 = st.columns(2)
                with col7:
                    st.subheader('Stars:')
                    st.text(current_repo('stars'))
                    st.subheader('Forks:')
                    st.text(current_repo('forks'))
                    # Label restored: it had been corrupted to
                    # 'Ist.session_stateues:' by a search-and-replace.
                    st.subheader('Issues:')
                    st.text(current_repo('open_issues'))
                    st.subheader('Created:')
                    st.text(parse_date(current_repo('created_at')))
                with col8:
                    st.subheader('Pull Requests:')
                    st.text(current_repo('pull_requests'))
                    st.subheader('Subscribers: ')
                    st.text(current_repo('subscribers_count'))
                    st.subheader('License:')
                    st.text(current_repo('License'))
                    st.subheader('Last Updated:')
                    st.text(parse_date(current_repo('updated_at')))
                st.subheader(f"URL link: {current_repo('url')}")
        else:
            st.subheader("Select a Repo to view it's data")
            st.text('You can select a repo by clicking on the checkbox in the first column of the DataFrame')
# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the widget flow - confirm the intended layout.
with tabs[1]:
    st.header("π» GitHub Analytics Dashboard")
    st.text('Open the sidebar to use our filters')
    st.subheader("π Programming Languages Distribution")

    # Setting up the dataframes: per-language counts of "used anywhere" and
    # "used as the main language".
    language_count = count_langs('Programming Languages used')
    as_main_language = count_langs('Main Programming Language')
    sort_criteria = st.selectbox(
        'Sort by:',
        options=['Times Used', 'Times Used as Main Language'],
    )
    # Map the human-readable choice onto the matching dataframe column.
    sort_parsed = 'As Main' if sort_criteria == 'Times Used as Main Language' else sort_criteria

    total_repos = len(repo_data)
    language_count = pd.DataFrame({
        'Language': list(language_count.keys()),
        'Times Used': list(language_count.values()),
        'Used %': [num / total_repos for num in language_count.values()],
        'As Main': list(as_main_language.values()),
        'As Main %': [num / total_repos for num in as_main_language.values()],
    }).sort_values(sort_parsed, ascending=False)

    # Absolute and relative frequency of every license.
    license_counts = repo_data['License'].value_counts()
    license_stats = pd.DataFrame({
        'License': license_counts.index,
        'Count': license_counts.to_numpy(),
        'Percent': (license_counts / license_counts.sum()).to_numpy(),
    })

    # Filters
    with st.sidebar:
        max_count = int(language_count['Times Used'].max())
        if 'max_repos' not in st.session_state:
            st.session_state.max_repos = max_count
        min_repos = st.slider(
            "Minimum repositories",
            min_value=1,
            max_value=max_count,
            # Keep the default inside the slider range on small datasets.
            value=min(15, max_count),
        )
        st.session_state.max_repos = st.slider(
            "Maximum repositories",
            min_value=min_repos,
            max_value=max_count,
            # The remembered maximum may now be below the chosen minimum;
            # Streamlit rejects a default outside [min_value, max_value], so
            # clamp it into range.
            value=min(max(st.session_state.max_repos, min_repos), max_count),
        )

    max_repos = st.session_state.max_repos
    filtered_lang_count = language_count[language_count['Times Used'].between(min_repos, max_repos)]
    filtered_licenses = license_stats[license_stats['Count'].between(min_repos, max_repos)]

    col1, col2 = st.columns(2, border=True)
    with col1:
        st.subheader("Frequency Table")
        st.dataframe(
            filtered_lang_count.style.format({'Used %': '{:.2%}', 'As Main %': '{:.2%}'}),
            use_container_width=True,
            hide_index=True,
        )
    with col2:
        st.subheader("Top Languages")
        top_lang = st.slider('Amount of languages shown:', 5, 20, 12)
        fig = px.bar(
            filtered_lang_count.head(top_lang),
            x='Language',
            y=['Times Used', 'As Main'],
            barmode='overlay',
            color='Language',
            text_auto=True,
            hover_name='Language',
            hover_data={'Used %': ':.2%', 'As Main %': ':.2%'},
        )
        fig.update_traces(textposition='outside', showlegend=False)
        fig.update_layout(
            xaxis_title="Programming Language",
            yaxis_title="Count",
        )
        st.plotly_chart(fig, use_container_width=True)

    with st.expander("π See More", expanded=False):
        st.subheader('π² Tree Map Visualization')
        st.text(f'Viewing and sorting by: {sort_criteria}')
        fig_tree = px.treemap(
            filtered_lang_count,
            names='Language',
            values=sort_parsed,
            path=['Language', sort_parsed],
            hover_name='Language',
            hover_data={
                'Times Used': True, 'As Main': True,
                'Used %': ':.2%', 'As Main %': ':.2%',
            },
        )
        fig_tree.update_layout(
            height=400,
            margin=dict(l=0, r=0, b=40, t=0),
        )
        st.plotly_chart(fig_tree, use_container_width=True)

    # Explicit horizontal rule between sections. The original used a bare
    # string of dashes, which only shows up through Streamlit's "magic"
    # rendering of stand-alone expressions.
    st.divider()
# License section of the "Data Analysis" tab. Entering the tab container again
# appends these elements to the same tab.
# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the widget flow - confirm the intended layout.
with tabs[1]:
    st.subheader("π Software License Distribution")

    # Horizontal bars, one per license, labelled with the share of repos.
    licenses_sorted = filtered_licenses.sort_values('Count', ascending=True)
    fig = px.bar(
        licenses_sorted,
        x='Count',
        y='License',
        orientation='h',
        color='Count',
        text='Percent',
        color_continuous_scale='Teal',
        labels={'Count': 'Number of Repos'},
        hover_data={'Percent': ':.2%'},
    )
    fig.update_traces(
        texttemplate='%{text:.2%}',
        textposition='outside',
    )
    fig.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    with st.expander("π§© See More", expanded=False):
        col1, col2 = st.columns(2, border=True)
        with col1:
            st.subheader('π Tabular Data')
            st.dataframe(
                filtered_licenses.style.format({'Percent': '{:.2%}'}),
                use_container_width=True,
            )
        with col2:
            st.subheader('π³ Tree Map Visualization')
            fig_tree = px.treemap(
                filtered_licenses,
                names='License',
                values='Count',
                path=['License'],
                hover_name='License',
                hover_data={'Percent': ':.2%'},
            )
            fig_tree.update_layout(
                height=600,
                margin=dict(l=0, r=0, b=40, t=0),
            )
            st.plotly_chart(fig_tree, use_container_width=True)

    # Explicit horizontal rule between sections. The original used a bare
    # string of dashes, which only shows up through Streamlit's "magic"
    # rendering of stand-alone expressions.
    st.divider()
# Stats-by-group section of the "Data Analysis" tab. Entering the tab
# container again appends these elements to the same tab.
# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the widget flow - confirm the intended layout.
with tabs[1]:
    st.subheader("π§ Repo Stats According to License and Language")

    def _metric_column(label):
        """Turn a display label such as 'Pull Requests' into 'pull_requests'."""
        return label.lower().replace(' ', '_')

    metrics_options = ['Forks', 'Pull Requests', 'Stars', 'Open Issues']
    parsed_options = [_metric_column(metric) for metric in metrics_options]
    metric_choice = st.multiselect(
        "Select Stats to compare",
        options=metrics_options,
        default=metrics_options[:3],
    )
    parsed_choices = [_metric_column(metric) for metric in metric_choice]
    measurement = st.selectbox(
        'How do you want to group the data',
        options=['Mean', 'Median', 'Total'],
    )

    if not metric_choice:
        st.warning("Please select at least one metric to visualize")
    else:
        # Largest value over all metric columns; cast to a plain int so every
        # numeric argument given to number_input has the same type.
        big_number = int(max(repo_data[metric].max() for metric in parsed_options))
        if 'min_threshold' not in st.session_state:
            # Never start below the input's minimum of 1.
            st.session_state.min_threshold = max(1, big_number // 50)
        if 'max_threshold' not in st.session_state:
            st.session_state.max_threshold = big_number

        chosen_labels = ', '.join(metric_choice)
        col1, col2 = st.columns(2)
        with col1:
            st.session_state.min_threshold = st.number_input(
                f"Minimum quantity of {chosen_labels}:",
                min_value=1,
                max_value=big_number,
                value=st.session_state.min_threshold,
                step=1000,
            )
        with col2:
            st.session_state.max_threshold = st.number_input(
                f"Maximum quantity of {chosen_labels}:",
                min_value=1,
                max_value=big_number,
                value=st.session_state.max_threshold,
                step=1000,
            )

        # Keep rows where every selected metric lies inside the thresholds.
        selected = repo_data[parsed_choices]
        condition = (
            (selected >= st.session_state.min_threshold)
            & (selected <= st.session_state.max_threshold)
        ).all(axis=1)
        metrics_df = repo_data[['Main Programming Language', 'License'] + parsed_choices]
        metrics_df = metrics_df[condition]

        # 'Total' is now a real per-group sum. Previously the raw rows were
        # plotted, so head() picked the top repositories, not the top groups.
        aggregation = {'Mean': 'mean', 'Median': 'median', 'Total': 'sum'}[measurement]
        lang_metrics = metrics_df.groupby(
            'Main Programming Language', as_index=False
        )[parsed_choices].agg(aggregation)
        license_metrics = metrics_df.groupby(
            'License', as_index=False
        )[parsed_choices].agg(aggregation)

        top_bars = st.slider('Amount of languages/licenses to show:', 4, 100, 10)
        axis_labels = {
            'value': 'Count',
            'variable': 'Metric',
            'Main Programming Language': 'Language',
        }

        col3, col4 = st.columns(2, border=True)
        with col3:
            st.subheader('By Main Language')
            fig = px.bar(
                lang_metrics.sort_values(parsed_choices[0], ascending=False).head(top_bars),
                x='Main Programming Language',
                y=parsed_choices,
                barmode='group',
                color_discrete_sequence=px.colors.qualitative.Pastel,
                labels=axis_labels,
            )
            fig.update_layout(
                xaxis_title="Programming Language",
                yaxis_title="Count",
                hovermode="x unified",
            )
            st.plotly_chart(fig, use_container_width=True)
        with col4:
            st.subheader('By License')
            fig = px.bar(
                license_metrics.sort_values(parsed_choices[0], ascending=False).head(top_bars),
                x='License',
                y=parsed_choices,
                barmode='group',
                color_discrete_sequence=px.colors.qualitative.Pastel,
                labels=axis_labels,
            )
            fig.update_layout(
                xaxis_title="License",
                yaxis_title="Count",
                hovermode="x unified",
            )
            st.plotly_chart(fig, use_container_width=True)

        # Totals per (license, main language) pair, largest first.
        with st.expander("π See aggregated data:"):
            agg_df = (
                metrics_df
                .groupby(['License', 'Main Programming Language'])[parsed_choices]
                .sum()
                .sort_values(parsed_choices[0], ascending=False)
            )
            st.dataframe(
                agg_df.style.background_gradient(cmap='Blues'),
                use_container_width=True,
            )
# "What's next?" tab: a static announcement with links to the project's
# Instagram account and GitHub repository.
with tabs[2]:
    st.header('We are still not done!!!')
    _whats_next_markdown = '''
We will be upgrading this website soon.\n
Stay tuned for more updates.\n
\n
πΈ Follow us on instagram:\n
https://www.instagram.com/ds.open.source/
\n
π Keep in touch with our Github repository if you want to see how we code:\n
https://github.com/liandeveloper/open_source/
'''
    st.markdown(_whats_next_markdown)
# quant_1 = st.selectbox('1',
# options = ['Stars','Forks', 'Issues', 'Pull Requests']
# )
# quant_2 = st.selectbox('2',
# options = ['Stars','Forks', 'Issues', 'Pull Requests'])
# quali_1 =st.selectbox('3',
# options = ['License','Main Programming Language'])
# qn1_p = 'open_issues' if quant_1 == 'Issues' else quant_1.replace(' ', '_').lower()
# qn2_p = 'open_issues' if quant_2 == 'Issues' else quant_2.replace(' ', '_').lower()
# ql1_p = 'License' if quali_1 == 'License' else quali_1
# st.plotly_chart(px.scatter(
# repo_data,
# x=qn1_p, y=qn2_p,
# labels = [quant_1, quant_2],
# color = ql1_p
# )
# )
# st.dataframe(repo_data)