Petr Tsvetkov
committed on
Commit
•
5bd86a2
1
Parent(s):
3907263
Use FUS logs (not uploaded to repo) to compare length difference and edit distance distributions in FUS and in our dataset (resulting charts are not included).
Browse files
- .gitignore +2 -1
- change_visualizer.py +2 -2
- chart_processing.ipynb +0 -0
- dataset_statistics.py +8 -2
.gitignore
CHANGED
@@ -278,4 +278,5 @@ pip-selfcheck.json
|
|
278 |
.idea
|
279 |
|
280 |
cache
|
281 |
-
output
|
|
|
|
278 |
.idea
|
279 |
|
280 |
cache
|
281 |
+
output
|
282 |
+
data
|
change_visualizer.py
CHANGED
@@ -14,7 +14,7 @@ n_diffs_synthetic = len(df_synthetic)
|
|
14 |
|
15 |
|
16 |
def golden():
|
17 |
-
return
|
18 |
|
19 |
|
20 |
def e2s():
|
@@ -33,7 +33,7 @@ def synthetic():
|
|
33 |
return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
|
34 |
|
35 |
|
36 |
-
STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(
|
37 |
"e2s": dataset_statistics.get_statistics_for_df(e2s()),
|
38 |
"s2e": dataset_statistics.get_statistics_for_df(s2e()),
|
39 |
"e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
|
|
|
14 |
|
15 |
|
16 |
def golden():
|
17 |
+
return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == False)]
|
18 |
|
19 |
|
20 |
def e2s():
|
|
|
33 |
return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
|
34 |
|
35 |
|
36 |
+
STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(golden()),
|
37 |
"e2s": dataset_statistics.get_statistics_for_df(e2s()),
|
38 |
"s2e": dataset_statistics.get_statistics_for_df(s2e()),
|
39 |
"e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
|
chart_processing.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
dataset_statistics.py
CHANGED
@@ -9,7 +9,10 @@ from scipy.stats import stats
|
|
9 |
import config
|
10 |
|
11 |
|
12 |
-
def get_statistics(
|
|
|
|
|
|
|
13 |
edit_ops = Levenshtein.editops(start_msg, end_msg)
|
14 |
n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
|
15 |
n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
|
@@ -27,11 +30,14 @@ def get_statistics(start_msg, end_msg, annotated_msg):
|
|
27 |
"deletions_norm": n_deletes / len(start_msg),
|
28 |
"insertions_norm": n_inserts / len(end_msg),
|
29 |
"changes_norm": n_changes / len(end_msg),
|
|
|
|
|
|
|
30 |
}
|
31 |
|
32 |
|
33 |
def get_statistics_for_df(df: pd.DataFrame):
|
34 |
-
stats = [get_statistics(row
|
35 |
df.iterrows()]
|
36 |
|
37 |
assert len(stats) > 0
|
|
|
9 |
import config
|
10 |
|
11 |
|
12 |
+
def get_statistics(row):
|
13 |
+
start_msg = row["commit_msg_start"]
|
14 |
+
end_msg = row["commit_msg_end"]
|
15 |
+
|
16 |
edit_ops = Levenshtein.editops(start_msg, end_msg)
|
17 |
n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
|
18 |
n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
|
|
|
30 |
"deletions_norm": n_deletes / len(start_msg),
|
31 |
"insertions_norm": n_inserts / len(end_msg),
|
32 |
"changes_norm": n_changes / len(end_msg),
|
33 |
+
|
34 |
+
"lendiff": abs(len(start_msg) - len(end_msg)),
|
35 |
+
"editdist": row["editdist_related"]
|
36 |
}
|
37 |
|
38 |
|
39 |
def get_statistics_for_df(df: pd.DataFrame):
|
40 |
+
stats = [get_statistics(row) for _, row in
|
41 |
df.iterrows()]
|
42 |
|
43 |
assert len(stats) > 0
|