Petr Tsvetkov committed on
Commit
aab3281
β€’
1 Parent(s): 305e536

WIP on annotated diffs generation

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. config.py +9 -1
  3. generate_annotated_diffs.py +32 -0
  4. hf_data_loader.py +3 -0
.gitignore CHANGED
@@ -277,4 +277,5 @@ pip-selfcheck.json
277
 
278
  .idea
279
 
280
- cache
 
 
277
 
278
  .idea
279
 
280
+ cache
281
+ output
config.py CHANGED
@@ -1,6 +1,14 @@
1
  import os
 
2
 
3
  HF_TOKEN = os.environ.get('HF_TOKEN')
4
  HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
5
  HF_RAW_DATASET_SPLIT = 'train'
6
- CACHE_DIR = "cache"
 
 
 
 
 
 
 
 
1
  import os
2
+ from pathlib import Path
3
 
4
  HF_TOKEN = os.environ.get('HF_TOKEN')
5
  HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
6
  HF_RAW_DATASET_SPLIT = 'train'
7
+
8
+ CACHE_DIR = Path("cache")
9
+ CACHE_DIR.mkdir(exist_ok=True)
10
+
11
+ OUTPUT_DIR = Path("output")
12
+ OUTPUT_DIR.mkdir(exist_ok=True)
13
+
14
+ ANNOTATED_DIFFS_ARTIFACT = OUTPUT_DIR / "annotated_diffs.csv"
generate_annotated_diffs.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ import hf_data_loader
4
+
5
+
6
def group_changes(changes):
    """Group changes by timestamp and return them in chronological order.

    Each change is expected to expose:
      - ``ts``: an ISO-8601 timestamp string (parsed with
        ``datetime.fromisoformat``),
      - ``p``: a position value used to order changes within one
        timestamp group (NOTE(review): assumed to be a character/offset
        position — confirm with the data producer).

    Returns a list of lists: one inner list per distinct timestamp,
    outer list sorted by timestamp ascending, each inner list sorted
    by ``p`` ascending.
    """
    groups = {}
    for change in changes:
        group = datetime.fromisoformat(change.ts)
        # setdefault replaces the explicit membership test + init.
        groups.setdefault(group, []).append(change)

    grouped_changes = []
    for group in sorted(groups):
        # BUG FIX: the original called ``grouped_changes.sort(...)``,
        # sorting the outer list-of-lists by ``x.p`` — that raises
        # AttributeError as soon as the outer list is non-empty.
        # Sort the changes *inside* the current group instead.
        groups[group].sort(key=lambda change: change.p)
        grouped_changes.append(groups[group])

    return grouped_changes
20
+
21
+
22
def get_annotated_diff(initial_text, changes):
    """WIP: build a character-level annotated diff of ``initial_text``.

    Each character of ``initial_text`` is paired with an annotation
    marker (initially ``" "``, i.e. unchanged); the intent is to apply
    ``changes`` one timestamp group at a time (see ``group_changes``).
    The per-group merge logic is not implemented yet.
    """
    grouped_changes = group_changes(changes)
    # BUG FIX: the original ``[((c, " ") for c in initial_text)]`` is a
    # one-element list containing a *generator* (the extra parentheses
    # make it a genexp inside a list literal), not a list of
    # (char, marker) tuples.
    text = [(c, " ") for c in initial_text]
    for change_group in grouped_changes:
        text_pointer = 0
        change_pointer = 0
        while text_pointer < len(text):
            # TODO: merge ``change_group`` into ``text`` here. The
            # original body was ``pass``, which never advanced
            # ``text_pointer`` and so looped forever on any non-empty
            # text; advance the pointer until the merge is written.
            text_pointer += 1
30
+
31
+
32
+ df = hf_data_loader.load_raw_dataset_as_pandas()  # NOTE(review): network/dataset load runs at import time; consider guarding with `if __name__ == "__main__":`
hf_data_loader.py CHANGED
@@ -8,3 +8,6 @@ def load_raw_dataset_as_pandas():
8
  split=config.HF_RAW_DATASET_SPLIT,
9
  token=config.HF_TOKEN,
10
  cache_dir=config.CACHE_DIR).to_pandas()
 
 
 
 
8
  split=config.HF_RAW_DATASET_SPLIT,
9
  token=config.HF_TOKEN,
10
  cache_dir=config.CACHE_DIR).to_pandas()
11
+
12
+
13
+ load_raw_dataset_as_pandas()