Spaces:

espejelomar
/

Starknet_Dev_Metrics

Sleeping

App Files Community

espejelomar committed on Apr 3

Commit

151eb1b

•

1 Parent(s): 549c95d

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

data/source/all_networks_developer_classification.csv +0 -0
data/source/all_networks_developer_classification_updated_february.csv +0 -0
debug.csv +0 -0
github_metrics/__pycache__/utils.cpython-311.pyc +0 -0
github_metrics/developer_survival_plot.py +0 -1
github_metrics/main.py +211 -44
github_metrics/utils.py +2 -1

data/source/all_networks_developer_classification.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/source/all_networks_developer_classification_updated_february.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

debug.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

github_metrics/__pycache__/utils.cpython-311.pyc CHANGED Viewed

Binary files a/github_metrics/__pycache__/utils.cpython-311.pyc and b/github_metrics/__pycache__/utils.cpython-311.pyc differ

github_metrics/developer_survival_plot.py CHANGED Viewed

@@ -4,7 +4,6 @@ import pandas as pd
 import seaborn as sns
 from lifelines import KaplanMeierFitter
 from matplotlib.colors import LinearSegmentedColormap
 from utils import save_plot

 import seaborn as sns
 from lifelines import KaplanMeierFitter
 from matplotlib.colors import LinearSegmentedColormap
 from utils import save_plot

github_metrics/main.py CHANGED Viewed

@@ -2,60 +2,131 @@ import gradio as gr
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-from termcolor import colored
 from scipy.stats import mannwhitneyu
 from utils import load_all_developers_dataset
 def process_input(input_text, uploaded_file, program_end_date=None, event_name=None):
     try:
         print(colored("Processing input...", "blue"))
         if uploaded_file is not None:
             print(colored("Reading from uploaded file...", "blue"))
             file_content = uploaded_file.decode("utf-8")
-            github_handles = [handle.strip() for handle in file_content.split("\n") if handle.strip()]
         else:
             github_handles = [handle.strip() for handle in input_text.split(",")]
         print(colored(f"GitHub handles: {github_handles}", "blue"))
         df = load_all_developers_dataset()
         print(colored("Filtering dataset...", "blue"))
         one_year_ago = pd.Timestamp.now() - pd.DateOffset(years=1)
-        filtered_df = df[(df["developer"].isin(github_handles)) & (df["month_year"] >= one_year_ago)]
         filtered_df = filtered_df.sort_values(by=["developer", "month_year"])
         filtered_df.loc[:, "month_year"] = pd.to_datetime(filtered_df["month_year"])
         line_fig = create_line_plot(filtered_df, github_handles, program_end_date)
-        analysis_result = perform_statistical_analysis(filtered_df, github_handles, program_end_date)
-        new_developers_count = count_new_developers(filtered_df, github_handles, program_end_date)
         last_3_months = pd.Timestamp.now() - pd.DateOffset(months=3)
         recent_activity_user = filtered_df[filtered_df["month_year"] >= last_3_months]
         all_devs_df = load_all_developers_dataset()
         all_devs_filtered_df = all_devs_df[(all_devs_df["month_year"] >= last_3_months)]
-        other_devs_recent_activity = all_devs_filtered_df[~all_devs_filtered_df["developer"].isin(github_handles)]
-        user_specified_active = recent_activity_user[recent_activity_user["total_commits"] > 0]
-        other_developers_active = other_devs_recent_activity[other_devs_recent_activity["total_commits"] > 0]
         box_fig = create_box_plot(user_specified_active, other_developers_active)
         print(colored("Classifying developers...", "blue"))
         classification_df = classify_developers(github_handles, recent_activity_user)
         print(colored("Classification completed.", "blue"))
-        comparison_result = compare_user_developers_to_others(user_specified_active, other_developers_active, df, program_end_date)
-        growth_rate_result = compare_growth_rate(user_specified_active, other_developers_active, df)
-        tldr_summary = generate_tldr_summary(github_handles, classification_df, analysis_result, new_developers_count, comparison_result, growth_rate_result, event_name)
-        return line_fig, box_fig, classification_df, analysis_result, new_developers_count, comparison_result, growth_rate_result, tldr_summary
     except Exception as e:
         print(colored(f"Error processing input: {e}", "red"))
-        return None, None, None, None, "Error in processing input.", None, None, "Error in processing input."
 def create_line_plot(filtered_df, github_handles, program_end_date):
-    all_developers = pd.DataFrame({"developer": github_handles, "month_year": pd.Timestamp.now(), "total_commits": 0})
     plot_df = pd.concat([filtered_df, all_developers])
-    plot_df = plot_df.groupby(["developer", "month_year"])["total_commits"].sum().reset_index()
     line_fig = px.line(
         plot_df,
         x="month_year",
@@ -66,13 +137,22 @@ def create_line_plot(filtered_df, github_handles, program_end_date):
     )
     if program_end_date:
         program_end_date = pd.to_datetime(program_end_date)
-        line_fig.add_vline(x=program_end_date, line_width=2, line_dash="dash", line_color="red")
     return line_fig
 def create_box_plot(user_specified_active, other_developers_active):
     box_fig = go.Figure()
-    box_fig.add_trace(go.Box(y=user_specified_active["total_commits"], name="User Specified Developers"))
-    box_fig.add_trace(go.Box(y=other_developers_active["total_commits"], name="Other Developers"))
     box_fig.update_layout(
         title="Comparison of Monthly Commits in the Last 3 Months: User Specified vs. Other Developers (Active Only)",
         yaxis_title="Total Monthly Commits",
@@ -80,6 +160,7 @@ def create_box_plot(user_specified_active, other_developers_active):
     )
     return box_fig
 def classify_developers(github_handles, recent_activity_user):
     classification = []
     for handle in github_handles:
@@ -99,12 +180,17 @@ def classify_developers(github_handles, recent_activity_user):
         "Previously active but no longer": 3,
         "Always been inactive": 4,
     }
-    classification_df = pd.DataFrame(classification, columns=["Developer", "Classification", "Total Recent Commits"])
     classification_df["Sort Key"] = classification_df["Classification"].map(sort_keys)
-    classification_df.sort_values(by=["Sort Key", "Total Recent Commits"], ascending=[True, False], inplace=True)
     classification_df.drop(["Sort Key", "Total Recent Commits"], axis=1, inplace=True)
     return classification_df
 def perform_statistical_analysis(filtered_df, github_handles, program_end_date_str):
     if program_end_date_str is None:
         return "Program end date not provided. Unable to perform statistical analysis."
@@ -120,33 +206,53 @@ def perform_statistical_analysis(filtered_df, github_handles, program_end_date_s
     before_counts = before_counts.reindex(all_developers.index, fill_value=0)
     after_counts = after_counts.reindex(all_developers.index, fill_value=0)
-    if len(before_counts) < 2 or len(after_counts) < 2:
-        return "Not enough data for statistical analysis."
     stat, p_value = mannwhitneyu(after_counts, before_counts)
-    analysis_result = f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
     if p_value < 0.2:
         if stat > 0:
-            analysis_result += "Difference in commit activity before and after the program is considered significant. " \
-                               "The commit activity is higher after the program."
         else:
-            analysis_result += "Difference in commit activity before and after the program is considered significant. " \
-                               "The commit activity is lower after the program."
     else:
-        analysis_result += "No significant difference in commit activity before and after the program."
     return analysis_result
 def count_new_developers(filtered_df, github_handles, program_end_date_str):
     if program_end_date_str is None:
-        return "Program end date not provided. Unable to count new developers."
     program_end_date = pd.to_datetime(program_end_date_str)
     two_months_after_program = program_end_date + pd.DateOffset(months=2)
     before_program = filtered_df[filtered_df["month_year"] < program_end_date]
-    after_program = filtered_df[(filtered_df["month_year"] >= program_end_date) & (filtered_df["month_year"] <= two_months_after_program)]
     before_developers = before_program["developer"].unique()
     after_developers = after_program["developer"].unique()
@@ -156,17 +262,41 @@ def count_new_developers(filtered_df, github_handles, program_end_date_str):
     return f"Number of new developers committing code within 2 months after the program: {len(new_developers)}\nNew developers: {new_developers_str}"
-def compare_user_developers_to_others(user_specified_active, other_developers_active, df, program_end_date_str):
     if program_end_date_str is None:
-        return "Program end date not provided. Unable to compare user-specified developers to others."
     program_end_date = pd.to_datetime(program_end_date_str)
-    user_commits = df[(df["developer"].isin(user_specified_active["developer"])) & (df["month_year"] >= program_end_date)]["total_commits"]
-    other_commits = df[(df["developer"].isin(other_developers_active["developer"])) & (df["month_year"] >= program_end_date)]["total_commits"]
     stat, p_value = mannwhitneyu(user_commits, other_commits)
-    comparison_result = f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
     if p_value < 0.25:
         if stat > 0:
@@ -178,6 +308,7 @@ def compare_user_developers_to_others(user_specified_active, other_developers_ac
     return comparison_result
 def compare_growth_rate(user_specified_active, other_developers_active, df):
     user_growth_rates = []
     other_growth_rates = []
@@ -197,7 +328,9 @@ def compare_growth_rate(user_specified_active, other_developers_active, df):
         other_growth_rates.append(other_growth_rate)
     stat, p_value = mannwhitneyu(user_growth_rates, other_growth_rates)
-    comparison_result = f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
     if p_value < 0.25:
         if stat > 0:
@@ -209,6 +342,7 @@ def compare_growth_rate(user_specified_active, other_developers_active, df):
     return comparison_result
 def calculate_average_growth_rate(commits):
     growth_rates = []
     for i in range(1, len(commits)):
@@ -220,10 +354,21 @@ def calculate_average_growth_rate(commits):
     else:
         return 0
-def generate_tldr_summary(github_handles, classification_df, analysis_result, new_developers_count, comparison_result, growth_rate_result, event_name):
     summary = f"### 📝 TLDR Summary for {', '.join(github_handles)}\n\n"
-    highly_involved_devs = classification_df[classification_df["Classification"] == "Highly involved"]["Developer"].tolist()
     if highly_involved_devs:
         summary += f"**🌟 High Performers:** {', '.join(highly_involved_devs)}\n\n"
@@ -235,7 +380,9 @@ def generate_tldr_summary(github_handles, classification_df, analysis_result, ne
         summary += "**🔄 Commit Activity:** No significant change after the program.\n\n"
     if new_developers_count.startswith("Number of new developers"):
-        summary += f"**🆕 New Developers:** {new_developers_count.split(':')[1].strip()}\n\n"
     if "significantly higher number of commits" in comparison_result:
         summary += "**🔍 Comparison with Other Developers:** User-specified developers have a significantly higher number of commits.\n\n"
@@ -266,6 +413,11 @@ with gr.Blocks() as app:
         to see their monthly commit activity, involvement classification, and comparisons with other developers.
         """
     )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
@@ -282,8 +434,14 @@ with gr.Blocks() as app:
                 """
             )
             with gr.Row():
-                program_end_date_input = gr.Textbox(label="Program End Date (YYYY-MM-DD)", placeholder="e.g., 2023-06-30")
-                event_name_input = gr.Textbox(label="Event Name (optional)", placeholder="e.g., Basecamp, Hackathon")
             gr.Markdown(
                 """
                 💡 *Tip: Specifying a program end date allows you to analyze the impact of events like Basecamp or Hackathons on developer activity. Leave it blank to analyze overall activity.*
@@ -360,11 +518,20 @@ with gr.Blocks() as app:
     btn.click(
         process_input,
         inputs=[text_input, file_input, program_end_date_input, event_name_input],
-        outputs=[plot_output, box_plot_output, table_output, stat_analysis_output, new_developers_output, comparison_output, growth_rate_output, tldr_output],
     )
 print(colored("Gradio app initialized.", "blue"))
 if __name__ == "__main__":
     print(colored("Launching app...", "blue"))
-    app.launch(share=True)

 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 from scipy.stats import mannwhitneyu
+from termcolor import colored
 from utils import load_all_developers_dataset
 def process_input(input_text, uploaded_file, program_end_date=None, event_name=None):
     try:
         print(colored("Processing input...", "blue"))
         if uploaded_file is not None:
             print(colored("Reading from uploaded file...", "blue"))
             file_content = uploaded_file.decode("utf-8")
+            github_handles = [
+                handle.strip() for handle in file_content.split("\n") if handle.strip()
+            ]
         else:
             github_handles = [handle.strip() for handle in input_text.split(",")]
         print(colored(f"GitHub handles: {github_handles}", "blue"))
+        if program_end_date == "":
+            program_end_date = None
         df = load_all_developers_dataset()
         print(colored("Filtering dataset...", "blue"))
         one_year_ago = pd.Timestamp.now() - pd.DateOffset(years=1)
+        filtered_df = df[
+            (df["developer"].isin(github_handles)) & (df["month_year"] >= one_year_ago)
+        ]
         filtered_df = filtered_df.sort_values(by=["developer", "month_year"])
         filtered_df.loc[:, "month_year"] = pd.to_datetime(filtered_df["month_year"])
         line_fig = create_line_plot(filtered_df, github_handles, program_end_date)
+        # Debug
+        # print(colored("Debugging filtered dataset and github handles...", "blue"))
+        # print(filtered_df.head(100))
+        # print(filtered_df["developer"].unique())
+        # print(github_handles)
+        filtered_df.to_csv("debug.csv", index=False)
+        # Debug
+        analysis_result = perform_statistical_analysis(
+            filtered_df, github_handles, program_end_date
+        )
+        new_developers_count = count_new_developers(
+            filtered_df, github_handles, program_end_date
+        )
         last_3_months = pd.Timestamp.now() - pd.DateOffset(months=3)
         recent_activity_user = filtered_df[filtered_df["month_year"] >= last_3_months]
         all_devs_df = load_all_developers_dataset()
         all_devs_filtered_df = all_devs_df[(all_devs_df["month_year"] >= last_3_months)]
+        other_devs_recent_activity = all_devs_filtered_df[
+            ~all_devs_filtered_df["developer"].isin(github_handles)
+        ]
+        user_specified_active = recent_activity_user[
+            recent_activity_user["total_commits"] > 0
+        ]
+        other_developers_active = other_devs_recent_activity[
+            other_devs_recent_activity["total_commits"] > 0
+        ]
         box_fig = create_box_plot(user_specified_active, other_developers_active)
         print(colored("Classifying developers...", "blue"))
         classification_df = classify_developers(github_handles, recent_activity_user)
         print(colored("Classification completed.", "blue"))
+        comparison_result = compare_user_developers_to_others(
+            user_specified_active, other_developers_active, df, program_end_date
+        )
+        growth_rate_result = compare_growth_rate(
+            user_specified_active, other_developers_active, df
+        )
+        tldr_summary = generate_tldr_summary(
+            github_handles,
+            classification_df,
+            analysis_result,
+            new_developers_count,
+            comparison_result,
+            growth_rate_result,
+            event_name,
+        )
+        return (
+            line_fig,
+            box_fig,
+            classification_df,
+            analysis_result,
+            new_developers_count,
+            comparison_result,
+            growth_rate_result,
+            tldr_summary,
+        )
     except Exception as e:
         print(colored(f"Error processing input: {e}", "red"))
+        return (
+            None,
+            None,
+            None,
+            None,
+            "Error in processing input. Check logs for more details on the error",
+            None,
+            None,
+            "Error in processing input. Check logs for more details on the error",
+        )
 def create_line_plot(filtered_df, github_handles, program_end_date):
+    all_developers = pd.DataFrame(
+        {
+            "developer": github_handles,
+            "month_year": pd.Timestamp.now(),
+            "total_commits": 0,
+        }
+    )
     plot_df = pd.concat([filtered_df, all_developers])
+    plot_df = (
+        plot_df.groupby(["developer", "month_year"])["total_commits"]
+        .sum()
+        .reset_index()
+    )
     line_fig = px.line(
         plot_df,
         x="month_year",
     )
     if program_end_date:
         program_end_date = pd.to_datetime(program_end_date)
+        line_fig.add_vline(
+            x=program_end_date, line_width=2, line_dash="dash", line_color="red"
+        )
     return line_fig
 def create_box_plot(user_specified_active, other_developers_active):
     box_fig = go.Figure()
+    box_fig.add_trace(
+        go.Box(
+            y=user_specified_active["total_commits"], name="User Specified Developers"
+        )
+    )
+    box_fig.add_trace(
+        go.Box(y=other_developers_active["total_commits"], name="Other Developers")
+    )
     box_fig.update_layout(
         title="Comparison of Monthly Commits in the Last 3 Months: User Specified vs. Other Developers (Active Only)",
         yaxis_title="Total Monthly Commits",
     )
     return box_fig
 def classify_developers(github_handles, recent_activity_user):
     classification = []
     for handle in github_handles:
         "Previously active but no longer": 3,
         "Always been inactive": 4,
     }
+    classification_df = pd.DataFrame(
+        classification, columns=["Developer", "Classification", "Total Recent Commits"]
+    )
     classification_df["Sort Key"] = classification_df["Classification"].map(sort_keys)
+    classification_df.sort_values(
+        by=["Sort Key", "Total Recent Commits"], ascending=[True, False], inplace=True
+    )
     classification_df.drop(["Sort Key", "Total Recent Commits"], axis=1, inplace=True)
     return classification_df
 def perform_statistical_analysis(filtered_df, github_handles, program_end_date_str):
     if program_end_date_str is None:
         return "Program end date not provided. Unable to perform statistical analysis."
     before_counts = before_counts.reindex(all_developers.index, fill_value=0)
     after_counts = after_counts.reindex(all_developers.index, fill_value=0)
+    if (before_counts == 0).all() or (after_counts == 0).all():
+        return "Not enough data for statistical analysis. All values are zero in either before or after counts."
     stat, p_value = mannwhitneyu(after_counts, before_counts)
+    analysis_result = (
+        f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
+    )
     if p_value < 0.2:
         if stat > 0:
+            analysis_result += (
+                "Difference in commit activity before and after the program is considered significant. "
+                "The commit activity is higher after the program."
+            )
         else:
+            analysis_result += (
+                "Difference in commit activity before and after the program is considered significant. "
+                "The commit activity is lower after the program."
+            )
     else:
+        analysis_result += (
+            "No significant difference in commit activity before and after the program."
+        )
     return analysis_result
 def count_new_developers(filtered_df, github_handles, program_end_date_str):
     if program_end_date_str is None:
+        print(
+            colored(
+                "Program end date not provided. Unable to count new developers. No problem.",
+                "yellow",
+            )
+        )
+        return (
+            "Program end date not provided. Unable to count new developers. No problem."
+        )
     program_end_date = pd.to_datetime(program_end_date_str)
     two_months_after_program = program_end_date + pd.DateOffset(months=2)
     before_program = filtered_df[filtered_df["month_year"] < program_end_date]
+    after_program = filtered_df[
+        (filtered_df["month_year"] >= program_end_date)
+        & (filtered_df["month_year"] <= two_months_after_program)
+    ]
     before_developers = before_program["developer"].unique()
     after_developers = after_program["developer"].unique()
     return f"Number of new developers committing code within 2 months after the program: {len(new_developers)}\nNew developers: {new_developers_str}"
+def compare_user_developers_to_others(
+    user_specified_active, other_developers_active, df, program_end_date_str
+):
     if program_end_date_str is None:
+        print(
+            colored(
+                "Program end date not provided. Unable to compare user-specified developers to others. No problem.",
+                "yellow",
+            )
+        )
+        return "Program end date not provided. Unable to compare user-specified developers to others. No problem."
     program_end_date = pd.to_datetime(program_end_date_str)
+    user_commits = df[
+        (df["developer"].isin(user_specified_active["developer"]))
+        & (df["month_year"] >= program_end_date)
+    ]["total_commits"]
+    other_commits = df[
+        (df["developer"].isin(other_developers_active["developer"]))
+        & (df["month_year"] >= program_end_date)
+    ]["total_commits"]
+    if len(user_commits) == 0 or len(other_commits) == 0:
+        print(
+            colored(
+                "Not enough data for comparison. Either user-specified developers or developers in the database have no commits after the program end date. Update database",
+                "red",
+            )
+        )
     stat, p_value = mannwhitneyu(user_commits, other_commits)
+    comparison_result = (
+        f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
+    )
     if p_value < 0.25:
         if stat > 0:
     return comparison_result
 def compare_growth_rate(user_specified_active, other_developers_active, df):
     user_growth_rates = []
     other_growth_rates = []
         other_growth_rates.append(other_growth_rate)
     stat, p_value = mannwhitneyu(user_growth_rates, other_growth_rates)
+    comparison_result = (
+        f"Mann-Whitney U test statistic: {stat:.3f}, P-value: {p_value:.3f}\n"
+    )
     if p_value < 0.25:
         if stat > 0:
     return comparison_result
 def calculate_average_growth_rate(commits):
     growth_rates = []
     for i in range(1, len(commits)):
     else:
         return 0
+def generate_tldr_summary(
+    github_handles,
+    classification_df,
+    analysis_result,
+    new_developers_count,
+    comparison_result,
+    growth_rate_result,
+    event_name,
+):
     summary = f"### 📝 TLDR Summary for {', '.join(github_handles)}\n\n"
+    highly_involved_devs = classification_df[
+        classification_df["Classification"] == "Highly involved"
+    ]["Developer"].tolist()
     if highly_involved_devs:
         summary += f"**🌟 High Performers:** {', '.join(highly_involved_devs)}\n\n"
         summary += "**🔄 Commit Activity:** No significant change after the program.\n\n"
     if new_developers_count.startswith("Number of new developers"):
+        summary += (
+            f"**🆕 New Developers:** {new_developers_count.split(':')[1].strip()}\n\n"
+        )
     if "significantly higher number of commits" in comparison_result:
         summary += "**🔍 Comparison with Other Developers:** User-specified developers have a significantly higher number of commits.\n\n"
         to see their monthly commit activity, involvement classification, and comparisons with other developers.
         """
     )
+    gr.Markdown(
+        """
+        📺 **Video Tutorial:** Please watch this [5-minute video tutorial](https://www.loom.com/share/b60e7f1bd1ee473b97e9c84c74df692a) examining an African Bootcamp and the Basecamp bootcamp as examples to start using the app effectively.
+        """
+    )
     with gr.Row():
         with gr.Column():
             text_input = gr.Textbox(
                 """
             )
             with gr.Row():
+                program_end_date_input = gr.Textbox(
+                    label="Program End Date (YYYY-MM-DD)",
+                    placeholder="e.g., 2023-06-30",
+                )
+                event_name_input = gr.Textbox(
+                    label="Event Name (optional)",
+                    placeholder="e.g., Basecamp, Hackathon",
+                )
             gr.Markdown(
                 """
                 💡 *Tip: Specifying a program end date allows you to analyze the impact of events like Basecamp or Hackathons on developer activity. Leave it blank to analyze overall activity.*
     btn.click(
         process_input,
         inputs=[text_input, file_input, program_end_date_input, event_name_input],
+        outputs=[
+            plot_output,
+            box_plot_output,
+            table_output,
+            stat_analysis_output,
+            new_developers_output,
+            comparison_output,
+            growth_rate_output,
+            tldr_output,
+        ],
     )
 print(colored("Gradio app initialized.", "blue"))
 if __name__ == "__main__":
     print(colored("Launching app...", "blue"))
+    app.launch(share=True)

github_metrics/utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from datetime import datetime
-from termcolor import colored
 import pandas as pd
 def load_all_developers_dataset():

 from datetime import datetime
 import pandas as pd
+from termcolor import colored
 def load_all_developers_dataset():