app.py CHANGED
@@ -964,13 +964,12 @@ def create_wordcloud(tokens, token1=None, token2=None):
964
  if token2:
965
  normalized_freq[token2] = normalized_freq.get(token2, 0) + 5
966
 
967
- # Custom colormap with dark shades of brown, green, and blue
968
  wc = WordCloud(
969
  width=800, height=400,
970
  background_color='white',
971
  max_words=100,
972
  prefer_horizontal=1.0, # Make all words horizontal
973
- colormap='Dark2' # Dark colormap with browns, greens, blues
974
  ).generate_from_frequencies(normalized_freq)
975
 
976
  return wc
 
964
  if token2:
965
  normalized_freq[token2] = normalized_freq.get(token2, 0) + 5
966
 
 
967
  wc = WordCloud(
968
  width=800, height=400,
969
  background_color='white',
970
  max_words=100,
971
  prefer_horizontal=1.0, # Make all words horizontal
972
+ colormap='BrBG' # Using the BrBG diverging colormap (brown to blue-green)
973
  ).generate_from_frequencies(normalized_freq)
974
 
975
  return wc
codenet_4000_CasingClassVariable/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_CasingClassVariable/java/layer12/kmeans/clusters-kmeans-350.txt DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_Example/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_Example/java/layer12/kmeans/clusters-kmeans-350.txt DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_Onecase/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_Onecase/java/layer12/kmeans/clusters-kmeans-350.txt DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_exactNameClassVariable/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_finetuned_compile_error/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_finetuned_compile_error/java/layer12/kmeans/clusters-kmeans-350.txt DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_finetuned_language_classification/java/input.in DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_finetuned_language_classification/java/layer12/kmeans/clusters-kmeans-350.txt DELETED
The diff for this file is too large to render. See raw diff
 
codenet_4000_lexical_similar/java/input.in ADDED
The diff for this file is too large to render. See raw diff
 
{codenet_4000_exactNameClassVariable → codenet_4000_lexical_similar}/java/layer12/kmeans/clusters-kmeans-350.txt RENAMED
The diff for this file is too large to render. See raw diff
 
convert.py ADDED
The diff for this file is too large to render. See raw diff
 
pert.py DELETED
@@ -1,182 +0,0 @@
1
- import csv
2
- import numpy as np
3
- from collections import defaultdict
4
- from scipy.optimize import linear_sum_assignment
5
- import os
6
-
7
def load_clusters(path):
    """Read a cluster assignment file and group tokens by cluster id.

    Each useful line has fields separated by ``|||``; the first field is the
    token and the last field is the cluster id. Lines with fewer than two
    fields (including blank lines) are ignored.

    Args:
        path: Path to the UTF-8 encoded cluster file.

    Returns:
        A ``defaultdict(set)`` mapping cluster id (str) to the set of tokens
        assigned to that cluster.
    """
    cluster_to_tokens = defaultdict(set)
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Strip only the line terminator: whitespace at the start of the
            # line is part of the token field and must not be discarded.
            parts = line.rstrip("\r\n").split("|||")
            if len(parts) < 2:
                continue
            token = parts[0]
            # Trailing blanks after the id are noise, not part of the id.
            cluster_id = parts[-1].strip()
            cluster_to_tokens[cluster_id].add(token)
    return cluster_to_tokens
18
-
19
def compute_jaccard_matrix(clusters_a, clusters_b):
    """Build the pairwise Jaccard-similarity matrix between two clusterings.

    Args:
        clusters_a: Mapping of cluster id -> set of tokens (matrix rows).
        clusters_b: Mapping of cluster id -> set of tokens (matrix columns).

    Returns:
        A tuple ``(matrix, a_keys, b_keys)`` where ``matrix[i, j]`` is the
        Jaccard similarity between ``clusters_a[a_keys[i]]`` and
        ``clusters_b[b_keys[j]]`` (0.0 when both sets are empty), and the key
        lists give the row / column ordering.
    """
    a_keys = list(clusters_a.keys())
    b_keys = list(clusters_b.keys())
    matrix = np.zeros((len(a_keys), len(b_keys)))

    # Resolve the column sets once instead of once per matrix cell.
    b_sets = [clusters_b[cb] for cb in b_keys]

    for i, ca in enumerate(a_keys):
        set_a = clusters_a[ca]
        size_a = len(set_a)
        for j, set_b in enumerate(b_sets):
            intersection = len(set_a & set_b)
            # |A u B| = |A| + |B| - |A n B|; avoids building the union set.
            union = size_a + len(set_b) - intersection
            if union > 0:
                matrix[i, j] = intersection / union

    return matrix, a_keys, b_keys
33
-
34
# Maps each perturbation name to the human-readable description that is
# written to the "Description" column of the CSI summary CSV.
perturbation_descriptions = {
    "Scope Modification": "Identifies variables in complex scopes and moves them to unrelated blocks.",
    "Log Modification": "Adds logging statements to blocks of code for tracking execution flow.",
    "Operator Modification": "Modifies boolean expressions by negating them in various contexts.",
    "Pointer Modification": "Add C style pointer to the code.",
    "POS finetuned": "Clusters based on finetuned POS codebert model",
    "Random Modification": "Permutes statements within basic blocks, allowing different execution orders.",
    # NOTE(review): this text describes a switch-to-if rewrite, which does not
    # obviously match the "Try Catch" name -- confirm the intended wording.
    "Try Catch Modification": "Converts switch statements into equivalent if statements.",
    "Unused Statement Modification": "Inserts unused statements into blocks of code for testing/debugging.",
    "Exact Name Class Variable Modification": "Renames classes and variables to a specific randomly generated name.",
    "Casing Class Variable Modification": "Generates lexical variations of class and variable names with different casing.",
    # Typos fixed ("wither" -> "either", "anme" -> "name"); this string is
    # emitted verbatim into the CSV.
    "Onecase Modification": "Generates lexical variations of class and variable names with just 1 letter uppercase either for class name or variable name.",
    "Example Modification": "Generates lexical variations of class and variable names with Example being the class name and example being the variable name or vice versa.",
    "Finetuned on compile error": "Clusters based on finetuned codebert model on compile errors",
    "Finetuned on language classification": "Clusters based on finetuned codebert model on language classification",
}
52
-
53
def compute_and_log_csi(file_orig, file_pert, perturbation_name, output_csv="results/csi_summary.csv"):
    """Compute the Cluster Sensitivity Index (CSI) for one perturbation and log it.

    Clusters from both files are matched one-to-one so that the total Jaccard
    similarity of the matched pairs is maximal; CSI is one minus the mean
    similarity of those pairs. The result is printed and appended as a row to
    ``output_csv`` (a header row is written first if the file does not exist).

    Args:
        file_orig: Path to the cluster file of the original data.
        file_pert: Path to the cluster file of the perturbed data.
        perturbation_name: Label for the run; also the lookup key into
            ``perturbation_descriptions``.
        output_csv: CSV file the summary row is appended to.

    Returns:
        ``(avg_jaccard, csi)``.

    Raises:
        ValueError: If the two files contain a different number of clusters.
    """
    clusters_orig = load_clusters(file_orig)
    clusters_pert = load_clusters(file_pert)

    if len(clusters_orig) != len(clusters_pert):
        raise ValueError(f"Cluster count mismatch: {len(clusters_orig)} (original) vs {len(clusters_pert)} (perturbed)")

    jaccard_matrix, _, _ = compute_jaccard_matrix(clusters_orig, clusters_pert)

    # linear_sum_assignment minimises cost, so negate to maximise similarity.
    row_ind, col_ind = linear_sum_assignment(-jaccard_matrix)

    matched_similarities = [jaccard_matrix[i, j] for i, j in zip(row_ind, col_ind)]
    avg_jaccard = np.mean(matched_similarities)
    csi = 1.0 - avg_jaccard

    print(f"Perturbation: {perturbation_name}")
    print(f"  Average Jaccard Similarity: {avg_jaccard:.4f}")
    print(f"  Cluster Sensitivity Index (CSI): {csi:.4f}")

    # Append to CSV. A bare filename has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when one is named.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_exists = os.path.isfile(output_csv)

    with open(output_csv, mode="a", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(["Perturbation", "Average Jaccard", "CSI", "Description"])
        writer.writerow([perturbation_name, avg_jaccard, csi, perturbation_descriptions.get(perturbation_name, "No description available")])

    return avg_jaccard, csi
83
-
84
# Example usage: log the CSI of every perturbed / finetuned clustering against
# the same unperturbed baseline clustering.
#
# NOTE(review): the last three runs previously passed the same clustering as
# both "original" and "perturbed" (for "Example" the two paths differed only in
# the case of "Java"), which can only ever yield Jaccard 1.0 / CSI 0.0. They
# now use the shared baseline like every other run -- confirm this is the
# intended comparison.
_BASELINE_CLUSTERS = "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt"

# (dataset directory, perturbation name), in the order rows are appended.
_PERTURBATION_RUNS = [
    ("codenet_4000_scope_error", "Scope Modification"),
    ("codenet_4000_log", "Log Modification"),
    ("codenet_4000_operator", "Operator Modification"),
    ("codenet_4000_pointer", "Pointer Modification"),
    ("codenet_4000_POS", "POS finetuned"),
    ("codenet_4000_random", "Random Modification"),
    ("codenet_4000_trycatch", "Try Catch Modification"),
    ("codenet_4000_unusedStatement", "Unused Statement Modification"),
    ("codenet_4000_exactNameClassVariable", "Exact Name Class Variable Modification"),
    ("codenet_4000_CasingClassVariable", "Casing Class Variable Modification"),
    ("codenet_4000_Onecase", "Onecase Modification"),
    ("codenet_4000_Example", "Example Modification"),
    ("codenet_4000_finetuned_compile_error", "Finetuned on compile error"),
    ("codenet_4000_finetuned_language_classification", "Finetuned on language classification"),
]

for _dataset_dir, _perturbation in _PERTURBATION_RUNS:
    compute_and_log_csi(
        _BASELINE_CLUSTERS,
        f"{_dataset_dir}/java/layer12/kmeans/clusters-kmeans-350.txt",
        perturbation_name=_perturbation,
        output_csv="results/csi_summary.csv",
    )
# More perturbations can be logged by adding entries to _PERTURBATION_RUNS.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
remove.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def remove_lines(filepath, lines_to_remove):
    """Delete the given 1-based line numbers from a UTF-8 text file in place.

    Line numbers refer to the file as it is before any removal. Duplicate and
    out-of-range numbers are ignored. The file's existing line terminators
    (LF, CRLF or CR) are preserved rather than being normalised by text-mode
    newline translation.

    Args:
        filepath: Path of the file to rewrite.
        lines_to_remove: Iterable of 1-based line numbers to drop.
    """
    import re

    # Set membership keeps the per-line check O(1) and absorbs duplicates.
    remove_set = set(lines_to_remove)

    # newline='' disables newline translation so terminators are read verbatim.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        file_content = f.read()

    # With a capturing group, re.split alternates text and terminator:
    # [line1, sep1, line2, sep2, ..., lineN]. The recognised terminators
    # match what universal-newline mode treats as a line break, so line
    # numbering is unchanged.
    pieces = re.split(r'(\r\n|\r|\n)', file_content)
    lines = pieces[0::2]
    separators = pieces[1::2]

    kept = [idx for idx in range(len(lines)) if idx + 1 not in remove_set]

    # Join the surviving lines; each one except the last is followed by the
    # terminator that originally followed it.
    output = []
    last_pos = len(kept) - 1
    for pos, idx in enumerate(kept):
        output.append(lines[idx])
        if pos < last_pos:
            output.append(separators[idx])
    new_content = ''.join(output)

    # Write back to the same file, again without newline translation.
    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        f.write(new_content)
21
+
22
# 1-based line numbers to delete from 'input.in'. Repeated entries are kept
# exactly as listed; remove_lines treats the collection as a set.
lines_to_remove = [
    5, 11, 26, 46, 53, 84, 117, 174, 175, 209,
    212, 219, 220, 268, 272, 277, 294, 319, 322, 333,
    369, 402, 437, 451, 471, 471, 471, 480, 494, 502,
    514, 564, 569, 579, 592, 599, 602, 602, 619, 647,
    679, 681, 685, 688, 781, 795, 833, 843, 859, 860,
    899, 911, 941, 947, 989, 993, 1100, 1111, 1120, 1123,
    1126, 1153, 1165, 1173, 1183, 1186, 1186, 1220, 1230, 1238,
    1242, 1247, 1274, 1285, 1289, 1324, 1358, 1385, 1397, 1402,
    1465, 1474, 1504, 1507, 1517, 1563, 1592, 1605, 1614, 1626,
    1648, 1648, 1689, 1702, 1730, 1730, 1737, 1769, 1784, 1799,
    1824, 1834, 1840, 1853, 1860, 1872, 1941, 2038, 2045, 2081,
    2096, 2108, 2115, 2115, 2147, 2149, 2165, 2167, 2173, 2195,
    2216, 2275, 2278, 2282, 2285, 2327, 2339, 2347, 2348, 2348,
    2425, 2444, 2476, 2477, 2482, 2482, 2486, 2499, 2515, 2529,
    2529, 2559, 2565, 2567, 2573, 2582, 2633, 2641, 2677, 2705,
    2719, 2744, 2756, 2821, 2860, 2864, 2936, 2955, 2992, 3022,
    3041, 3064, 3074, 3121, 3123, 3160, 3170, 3172, 3179, 3180,
    3195, 3199, 3208, 3208, 3259, 3269, 3280, 3299, 3300, 3323,
    3334, 3352, 3364, 3365, 3378, 3405, 3424, 3438, 3492, 3511,
    3512, 3533, 3572, 3579, 3710, 3730, 3735, 3759, 3787, 3793,
]

# Rewrite the file in place with the listed lines dropped.
remove_lines('input.in', lines_to_remove)
results/csi_summary.csv DELETED
@@ -1,15 +0,0 @@
1
- Perturbation,Average Jaccard,CSI,Description
2
- Scope Modification,0.6788942354152336,0.32110576458476636,Identifies variables in complex scopes and moves them to unrelated blocks.
3
- Log Modification,0.5597545985057552,0.44024540149424485,Adds logging statements to blocks of code for tracking execution flow.
4
- Operator Modification,0.7675911973340813,0.23240880266591868,Modifies boolean expressions by negating them in various contexts.
5
- Pointer Modification,0.7341816285924795,0.2658183714075205,Add C style pointer to the code.
6
- POS finetuned,0.39399085068850775,0.6060091493114923,Clusters based on finetuned POS codebert model
7
- Random Modification,0.5314837325594708,0.4685162674405292,"Permutes statements within basic blocks, allowing different execution orders."
8
- Try Catch Modification,0.6985673658171294,0.3014326341828706,Converts switch statements into equivalent if statements.
9
- Unused Statement Modification,0.5844954343120634,0.4155045656879366,Inserts unused statements into blocks of code for testing/debugging.
10
- Exact Name Class Variable Modification,0.675121649837896,0.324878350162104,Renames classes and variables to a specific randomly generated name.
11
- Casing Class Variable Modification,0.6722713965133429,0.3277286034866571,Generates lexical variations of class and variable names with different casing.
12
- Onecase Modification,0.665697304921991,0.334302695078009,Generates lexical variations of class and variable names with just 1 letter uppercase wither for class anme or variable name.
13
- Example Modification,1.0,0.0,Generates lexical variations of class and variable names with Example being the class name and example being the variable name or vice versa.
14
- Finetuned on compile error,1.0,0.0,Clusters based on finetuned codebert model on compile errors
15
- Finetuned on language classification,1.0,0.0,Clusters based on finetuned codebert model on language classification