Spaces:
Runtime error
Runtime error
Petr Tsvetkov
committed on
Commit
·
aab3281
1
Parent(s):
305e536
WIP on annotated diffs generation
Browse files
- .gitignore +2 -1
- config.py +9 -1
- generate_annotated_diffs.py +32 -0
- hf_data_loader.py +3 -0
.gitignore
CHANGED
@@ -277,4 +277,5 @@ pip-selfcheck.json
|
|
277 |
|
278 |
.idea
|
279 |
|
280 |
-
cache
|
|
|
|
277 |
|
278 |
.idea
|
279 |
|
280 |
+
cache
|
281 |
+
output
|
config.py
CHANGED
@@ -1,6 +1,14 @@
|
|
1 |
import os
|
|
|
2 |
|
3 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
4 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
5 |
HF_RAW_DATASET_SPLIT = 'train'
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
from pathlib import Path
|
3 |
|
4 |
HF_TOKEN = os.environ.get('HF_TOKEN')
|
5 |
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
|
6 |
HF_RAW_DATASET_SPLIT = 'train'
|
7 |
+
|
8 |
+
CACHE_DIR = Path("cache")
|
9 |
+
CACHE_DIR.mkdir(exist_ok=True)
|
10 |
+
|
11 |
+
OUTPUT_DIR = Path("output")
|
12 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
13 |
+
|
14 |
+
ANNOTATED_DIFFS_ARTIFACT = OUTPUT_DIR / "annotated_diffs.csv"
|
generate_annotated_diffs.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
|
3 |
+
import hf_data_loader
|
4 |
+
|
5 |
+
|
6 |
+
def group_changes(changes):
    """Group changes by timestamp and order them for sequential processing.

    Each change must expose a ``ts`` attribute holding an ISO-8601 string
    (parsed with ``datetime.fromisoformat``) and a ``p`` attribute used as the
    sort key inside a group (presumably a position in the text -- confirm
    against the dataset schema).

    Args:
        changes: Iterable of change objects with ``ts`` and ``p`` attributes.

    Returns:
        A list of groups ordered by ascending timestamp. Each group is a list
        of the changes sharing that timestamp, ordered by ascending ``p``.
        An empty input yields an empty list.

    Raises:
        ValueError: If a ``ts`` value is not a valid ISO-8601 string.
    """
    groups = {}
    for change in changes:
        timestamp = datetime.fromisoformat(change.ts)
        groups.setdefault(timestamp, []).append(change)

    # Sort the members of every group by ``p``. Sorting the outer result list
    # with that key would apply ``.p`` to whole groups (plain lists) and fail.
    return [
        sorted(groups[timestamp], key=lambda change: change.p)
        for timestamp in sorted(groups)
    ]
|
20 |
+
|
21 |
+
|
22 |
+
def get_annotated_diff(initial_text, changes):
    """Prepare an annotated diff of ``initial_text`` (work in progress).

    The changes are grouped via ``group_changes`` and the text is expanded
    into one ``(character, " ")`` pair per character. Merging the change
    groups into that sequence is not implemented yet, so the function
    currently returns ``None``.

    Args:
        initial_text: The starting text, iterated character by character.
        changes: Change objects accepted by ``group_changes``.
    """
    grouped_changes = group_changes(changes)
    # A list comprehension is required here: wrapping a generator expression
    # in a list literal would produce a one-element list holding the generator
    # itself instead of the per-character pairs.
    text = [(c, " ") for c in initial_text]
    for change_group in grouped_changes:
        text_pointer = 0
        change_pointer = 0
        while text_pointer < len(text):
            # TODO: merge ``change_group`` into ``text`` at this position
            # (``change_pointer`` is reserved for walking the group).
            # Advance the cursor so the scan terminates; a bare ``pass`` here
            # never moves the pointer and loops forever on non-empty text.
            text_pointer += 1
|
30 |
+
|
31 |
+
|
32 |
+
df = hf_data_loader.load_raw_dataset_as_pandas()
|
hf_data_loader.py
CHANGED
@@ -8,3 +8,6 @@ def load_raw_dataset_as_pandas():
|
|
8 |
split=config.HF_RAW_DATASET_SPLIT,
|
9 |
token=config.HF_TOKEN,
|
10 |
cache_dir=config.CACHE_DIR).to_pandas()
|
|
|
|
|
|
|
|
8 |
split=config.HF_RAW_DATASET_SPLIT,
|
9 |
token=config.HF_TOKEN,
|
10 |
cache_dir=config.CACHE_DIR).to_pandas()
|
11 |
+
|
12 |
+
|
13 |
+
load_raw_dataset_as_pandas()
|