Spaces:
Runtime error
Runtime error
Petr Tsvetkov
committed on
Commit
·
5bd86a2
1
Parent(s):
3907263
Use FUS logs (not uploaded to repo) to compare length difference and edit distance distributions in FUS and in our dataset (resulting charts are not included).
Browse files
- .gitignore +2 -1
- change_visualizer.py +2 -2
- chart_processing.ipynb +0 -0
- dataset_statistics.py +8 -2
.gitignore
CHANGED
|
@@ -278,4 +278,5 @@ pip-selfcheck.json
|
|
| 278 |
.idea
|
| 279 |
|
| 280 |
cache
|
| 281 |
-
output
|
|
|
|
|
|
| 278 |
.idea
|
| 279 |
|
| 280 |
cache
|
| 281 |
+
output
|
| 282 |
+
data
|
change_visualizer.py
CHANGED
|
@@ -14,7 +14,7 @@ n_diffs_synthetic = len(df_synthetic)
|
|
| 14 |
|
| 15 |
|
| 16 |
def golden():
|
| 17 |
-
return
|
| 18 |
|
| 19 |
|
| 20 |
def e2s():
|
|
@@ -33,7 +33,7 @@ def synthetic():
|
|
| 33 |
return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
|
| 34 |
|
| 35 |
|
| 36 |
-
STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(
|
| 37 |
"e2s": dataset_statistics.get_statistics_for_df(e2s()),
|
| 38 |
"s2e": dataset_statistics.get_statistics_for_df(s2e()),
|
| 39 |
"e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def golden():
|
| 17 |
+
return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == False)]
|
| 18 |
|
| 19 |
|
| 20 |
def e2s():
|
|
|
|
| 33 |
return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
|
| 34 |
|
| 35 |
|
| 36 |
+
STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(golden()),
|
| 37 |
"e2s": dataset_statistics.get_statistics_for_df(e2s()),
|
| 38 |
"s2e": dataset_statistics.get_statistics_for_df(s2e()),
|
| 39 |
"e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
|
chart_processing.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
dataset_statistics.py
CHANGED
|
@@ -9,7 +9,10 @@ from scipy.stats import stats
|
|
| 9 |
import config
|
| 10 |
|
| 11 |
|
| 12 |
-
def get_statistics(
|
|
|
|
|
|
|
|
|
|
| 13 |
edit_ops = Levenshtein.editops(start_msg, end_msg)
|
| 14 |
n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
|
| 15 |
n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
|
|
@@ -27,11 +30,14 @@ def get_statistics(start_msg, end_msg, annotated_msg):
|
|
| 27 |
"deletions_norm": n_deletes / len(start_msg),
|
| 28 |
"insertions_norm": n_inserts / len(end_msg),
|
| 29 |
"changes_norm": n_changes / len(end_msg),
|
|
|
|
|
|
|
|
|
|
| 30 |
}
|
| 31 |
|
| 32 |
|
| 33 |
def get_statistics_for_df(df: pd.DataFrame):
|
| 34 |
-
stats = [get_statistics(row
|
| 35 |
df.iterrows()]
|
| 36 |
|
| 37 |
assert len(stats) > 0
|
|
|
|
| 9 |
import config
|
| 10 |
|
| 11 |
|
| 12 |
+
def get_statistics(row):
|
| 13 |
+
start_msg = row["commit_msg_start"]
|
| 14 |
+
end_msg = row["commit_msg_end"]
|
| 15 |
+
|
| 16 |
edit_ops = Levenshtein.editops(start_msg, end_msg)
|
| 17 |
n_deletes = sum([1 if op == 'delete' else 0 for op, _, _ in edit_ops])
|
| 18 |
n_inserts = sum([1 if op == 'insert' else 0 for op, _, _ in edit_ops])
|
|
|
|
| 30 |
"deletions_norm": n_deletes / len(start_msg),
|
| 31 |
"insertions_norm": n_inserts / len(end_msg),
|
| 32 |
"changes_norm": n_changes / len(end_msg),
|
| 33 |
+
|
| 34 |
+
"lendiff": abs(len(start_msg) - len(end_msg)),
|
| 35 |
+
"editdist": row["editdist_related"]
|
| 36 |
}
|
| 37 |
|
| 38 |
|
| 39 |
def get_statistics_for_df(df: pd.DataFrame):
|
| 40 |
+
stats = [get_statistics(row) for _, row in
|
| 41 |
df.iterrows()]
|
| 42 |
|
| 43 |
assert len(stats) > 0
|