Petr Tsvetkov
committed on
Commit
•
305e536
1
Parent(s):
6e5778f
Download the commit-rewriting dataset
Browse files
- config.py +6 -0
- hf_data_loader.py +10 -0
config.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
# Hugging Face access token, read from the HF_TOKEN environment variable.
# Evaluates to None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub repository id and split of the raw commit-message-rewriting dataset.
HF_RAW_DATASET_NAME = "petrtsv-jb/commit-msg-rewriting"
HF_RAW_DATASET_SPLIT = "train"

# Directory passed as `cache_dir` when the dataset is loaded
# (a relative path, so it resolves against the current working directory).
CACHE_DIR = "cache"
|
hf_data_loader.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
|
3 |
+
import config
|
4 |
+
|
5 |
+
|
6 |
+
def load_raw_dataset_as_pandas():
    """Load the raw dataset described in ``config`` and convert it to pandas.

    The dataset name, split, access token and cache directory are all taken
    from the ``config`` module (``HF_RAW_DATASET_NAME``,
    ``HF_RAW_DATASET_SPLIT``, ``HF_TOKEN`` and ``CACHE_DIR``).

    Returns:
        The result of calling ``to_pandas()`` on the loaded split.
    """
    raw_split = load_dataset(
        config.HF_RAW_DATASET_NAME,
        split=config.HF_RAW_DATASET_SPLIT,
        token=config.HF_TOKEN,
        cache_dir=config.CACHE_DIR,
    )
    return raw_split.to_pandas()
|