Spaces:

vedant2905
/

Coconet_Visual

Build error

App Files Files Community

Perturbation testing

by sharmaarushi17 - opened Apr 21

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+344250

-1966597

This PR is in draft mode

Files changed (18) hide show

app.py +1 -2
codenet_4000_CasingClassVariable/java/input.in +0 -0
codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
codenet_4000_Example/java/input.in +0 -0
codenet_4000_Example/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
codenet_4000_Onecase/java/input.in +0 -0
codenet_4000_Onecase/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
codenet_4000_exactNameClassVariable/java/input.in +0 -0
codenet_4000_finetuned_compile_error/java/input.in +0 -0
codenet_4000_finetuned_compile_error/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
codenet_4000_finetuned_language_classification/java/input.in +0 -0
codenet_4000_finetuned_language_classification/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
codenet_4000_lexical_similar/java/input.in +0 -0
{codenet_4000_exactNameClassVariable → codenet_4000_lexical_similar}/java/layer12/kmeans/clusters-kmeans-350.txt +0 -0
convert.py +0 -0
pert.py +0 -182
remove.py +224 -0
results/csi_summary.csv +0 -15

app.py CHANGED Viewed

@@ -964,13 +964,12 @@ def create_wordcloud(tokens, token1=None, token2=None):
     if token2:
         normalized_freq[token2] = normalized_freq.get(token2, 0) + 5
-    # Custom colormap with dark shades of brown, green, and blue
     wc = WordCloud(
         width=800, height=400,
         background_color='white',
         max_words=100,
         prefer_horizontal=1.0,  # Make all words horizontal
-        colormap='Dark2'  # Dark colormap with browns, greens, blues
     ).generate_from_frequencies(normalized_freq)
     return wc

     if token2:
         normalized_freq[token2] = normalized_freq.get(token2, 0) + 5
     wc = WordCloud(
         width=800, height=400,
         background_color='white',
         max_words=100,
         prefer_horizontal=1.0,  # Make all words horizontal
+        colormap='BrBG'  # BrBG: matplotlib's diverging brown-to-blue-green colormap
     ).generate_from_frequencies(normalized_freq)
     return wc

codenet_4000_CasingClassVariable/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_Example/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_Example/java/layer12/kmeans/clusters-kmeans-350.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_Onecase/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_Onecase/java/layer12/kmeans/clusters-kmeans-350.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_exactNameClassVariable/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_finetuned_compile_error/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_finetuned_compile_error/java/layer12/kmeans/clusters-kmeans-350.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_finetuned_language_classification/java/input.in DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_finetuned_language_classification/java/layer12/kmeans/clusters-kmeans-350.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

codenet_4000_lexical_similar/java/input.in ADDED Viewed

The diff for this file is too large to render. See raw diff

{codenet_4000_exactNameClassVariable → codenet_4000_lexical_similar}/java/layer12/kmeans/clusters-kmeans-350.txt RENAMED Viewed

The diff for this file is too large to render. See raw diff

convert.py ADDED Viewed

The diff for this file is too large to render. See raw diff

pert.py DELETED Viewed

@@ -1,182 +0,0 @@
-import csv
-import numpy as np
-from collections import defaultdict
-from scipy.optimize import linear_sum_assignment
-import os
-def load_clusters(path):
-    cluster_to_tokens = defaultdict(set)
-    with open(path, "r", encoding="utf-8") as f:
-        for line in f:
-            parts = line.strip().split("|||")
-            if len(parts) < 2:
-                continue
-            token = parts[0]
-            cluster_id = parts[-1]
-            cluster_to_tokens[cluster_id].add(token)
-    return cluster_to_tokens
-def compute_jaccard_matrix(clusters_a, clusters_b):
-    a_keys = list(clusters_a.keys())
-    b_keys = list(clusters_b.keys())
-    matrix = np.zeros((len(a_keys), len(b_keys)))
-    for i, ca in enumerate(a_keys):
-        for j, cb in enumerate(b_keys):
-            set_a = clusters_a[ca]
-            set_b = clusters_b[cb]
-            intersection = len(set_a & set_b)
-            union = len(set_a | set_b)
-            matrix[i, j] = intersection / union if union > 0 else 0.0
-    return matrix, a_keys, b_keys
-# Dictionary mapping perturbation names to their descriptions
-perturbation_descriptions = {
-    "Scope Modification": "Identifies variables in complex scopes and moves them to unrelated blocks.",
-    "Log Modification": "Adds logging statements to blocks of code for tracking execution flow.",
-    "Operator Modification": "Modifies boolean expressions by negating them in various contexts.",
-    "Pointer Modification": "Add C style pointer to the code.",
-    "POS finetuned": "Clusters based on finetuned POS codebert model",
-    "Random Modification": "Permutes statements within basic blocks, allowing different execution orders.",
-    "Try Catch Modification": "Converts switch statements into equivalent if statements.",
-    "Unused Statement Modification": "Inserts unused statements into blocks of code for testing/debugging.",
-    "Exact Name Class Variable Modification": "Renames classes and variables to a specific randomly generated name.",
-    "Casing Class Variable Modification": "Generates lexical variations of class and variable names with different casing.",
-    "Onecase Modification": "Generates lexical variations of class and variable names with just 1 letter uppercase wither for class anme or variable name.",
-    "Example Modification": "Generates lexical variations of class and variable names with Example being the class name and example being the variable name or vice versa.",
-    "Finetuned on compile error": "Clusters based on finetuned codebert model on compile errors",
-    "Finetuned on language classification": "Clusters based on finetuned codebert model on language classification",
-}
-def compute_and_log_csi(file_orig, file_pert, perturbation_name, output_csv="results/csi_summary.csv"):
-    clusters_orig = load_clusters(file_orig)
-    clusters_pert = load_clusters(file_pert)
-    if len(clusters_orig) != len(clusters_pert):
-        raise ValueError(f"Cluster count mismatch: {len(clusters_orig)} (original) vs {len(clusters_pert)} (perturbed)")
-    jaccard_matrix, orig_ids, pert_ids = compute_jaccard_matrix(clusters_orig, clusters_pert)
-    row_ind, col_ind = linear_sum_assignment(-jaccard_matrix)
-    matched_similarities = [jaccard_matrix[i, j] for i, j in zip(row_ind, col_ind)]
-    avg_jaccard = np.mean(matched_similarities)
-    csi = 1.0 - avg_jaccard
-    print(f"Perturbation: {perturbation_name}")
-    print(f"  Average Jaccard Similarity: {avg_jaccard:.4f}")
-    print(f"  Cluster Sensitivity Index (CSI): {csi:.4f}")
-    # Append to CSV
-    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
-    file_exists = os.path.isfile(output_csv)
-    with open(output_csv, mode="a", newline='', encoding="utf-8") as file:
-        writer = csv.writer(file)
-        if not file_exists:
-            writer.writerow(["Perturbation", "Average Jaccard", "CSI", "Description"])
-        writer.writerow([perturbation_name, avg_jaccard, csi, perturbation_descriptions.get(perturbation_name, "No description available")])
-    return avg_jaccard, csi
-# Example usage
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_scope_error/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Scope Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_log/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Log Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_operator/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Operator Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_pointer/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Pointer Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_POS/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="POS finetuned",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_random/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Random Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_trycatch/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Try Catch Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_unusedStatement/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Unused Statement Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_exactNameClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Exact Name Class Variable Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Casing Class Variable Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_Onecase/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Onecase Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_Example/Java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_Example/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Example Modification",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_finetuned_compile_error/java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_finetuned_compile_error/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Finetuned on compile error",
-    output_csv="results/csi_summary.csv"
-)
-compute_and_log_csi(
-    "codenet_4000_finetuned_language_classification/java/layer12/kmeans/clusters-kmeans-350.txt",
-    "codenet_4000_finetuned_language_classification/java/layer12/kmeans/clusters-kmeans-350.txt",
-    perturbation_name="Finetuned on language classification",
-    output_csv="results/csi_summary.csv"
-)
-# You can now call compute_and_log_csi again and again for other perturbations!

remove.py ADDED Viewed

	@@ -0,0 +1,224 @@

+def remove_lines(filepath, lines_to_remove):
+    # Read the file
+    with open(filepath, 'r', encoding='utf-8') as f:
+        file_content = f.read()
+    # Split content into lines
+    lines = file_content.split('\n')
+    # Create a set of line numbers to remove for O(1) lookup
+    remove_set = set(lines_to_remove)
+    # Keep lines that aren't in the remove set
+    filtered_lines = [line for i, line in enumerate(lines, 1) if i not in remove_set]
+    # Join lines back together
+    new_content = '\n'.join(filtered_lines)
+    # Write back to the same file
+    with open(filepath, 'w', encoding='utf-8') as f:
+        f.write(new_content)
+lines_to_remove =  [
+5,
+11,
+26,
+46,
+53,
+84,
+117,
+174,
+175,
+209,
+212,
+219,
+220,
+268,
+272,
+277,
+294,
+319,
+322,
+333,
+369,
+402,
+437,
+451,
+471,
+471,
+471,
+480,
+494,
+502,
+514,
+564,
+569,
+579,
+592,
+599,
+602,
+602,
+619,
+647,
+679,
+681,
+685,
+688,
+781,
+795,
+833,
+843,
+859,
+860,
+899,
+911,
+941,
+947,
+989,
+993,
+1100,
+1111,
+1120,
+1123,
+1126,
+1153,
+1165,
+1173,
+1183,
+1186,
+1186,
+1220,
+1230,
+1238,
+1242,
+1247,
+1274,
+1285,
+1289,
+1324,
+1358,
+1385,
+1397,
+1402,
+1465,
+1474,
+1504,
+1507,
+1517,
+1563,
+1592,
+1605,
+1614,
+1626,
+1648,
+1648,
+1689,
+1702,
+1730,
+1730,
+1737,
+1769,
+1784,
+1799,
+1824,
+1834,
+1840,
+1853,
+1860,
+1872,
+1941,
+2038,
+2045,
+2081,
+2096,
+2108,
+2115,
+2115,
+2147,
+2149,
+2165,
+2167,
+2173,
+2195,
+2216,
+2275,
+2278,
+2282,
+2285,
+2327,
+2339,
+2347,
+2348,
+2348,
+2425,
+2444,
+2476,
+2477,
+2482,
+2482,
+2486,
+2499,
+2515,
+2529,
+2529,
+2559,
+2565,
+2567,
+2573,
+2582,
+2633,
+2641,
+2677,
+2705,
+2719,
+2744,
+2756,
+2821,
+2860,
+2864,
+2936,
+2955,
+2992,
+3022,
+3041,
+3064,
+3074,
+3121,
+3123,
+3160,
+3170,
+3172,
+3179,
+3180,
+3195,
+3199,
+3208,
+3208,
+3259,
+3269,
+3280,
+3299,
+3300,
+3323,
+3334,
+3352,
+3364,
+3365,
+3378,
+3405,
+3424,
+3438,
+3492,
+3511,
+3512,
+3533,
+3572,
+3579,
+3710,
+3730,
+3735,
+3759,
+3787,
+3793
+]
+remove_lines('input.in', lines_to_remove)

results/csi_summary.csv DELETED Viewed

@@ -1,15 +0,0 @@
-Perturbation,Average Jaccard,CSI,Description
-Scope Modification,0.6788942354152336,0.32110576458476636,Identifies variables in complex scopes and moves them to unrelated blocks.
-Log Modification,0.5597545985057552,0.44024540149424485,Adds logging statements to blocks of code for tracking execution flow.
-Operator Modification,0.7675911973340813,0.23240880266591868,Modifies boolean expressions by negating them in various contexts.
-Pointer Modification,0.7341816285924795,0.2658183714075205,Add C style pointer to the code.
-POS finetuned,0.39399085068850775,0.6060091493114923,Clusters based on finetuned POS codebert model
-Random Modification,0.5314837325594708,0.4685162674405292,"Permutes statements within basic blocks, allowing different execution orders."
-Try Catch Modification,0.6985673658171294,0.3014326341828706,Converts switch statements into equivalent if statements.
-Unused Statement Modification,0.5844954343120634,0.4155045656879366,Inserts unused statements into blocks of code for testing/debugging.
-Exact Name Class Variable Modification,0.675121649837896,0.324878350162104,Renames classes and variables to a specific randomly generated name.
-Casing Class Variable Modification,0.6722713965133429,0.3277286034866571,Generates lexical variations of class and variable names with different casing.
-Onecase Modification,0.665697304921991,0.334302695078009,Generates lexical variations of class and variable names with just 1 letter uppercase wither for class anme or variable name.
-Example Modification,1.0,0.0,Generates lexical variations of class and variable names with Example being the class name and example being the variable name or vice versa.
-Finetuned on compile error,1.0,0.0,Clusters based on finetuned codebert model on compile errors
-Finetuned on language classification,1.0,0.0,Clusters based on finetuned codebert model on language classification