Upload 5 files

The trained base model (batch size 2k, 125k training steps) of the Multi-perspective Course Learner, from the paper "Pre-training Language Model as a Multi-perspective Course Learner" (Findings of ACL 2023).

Files changed (5) hide show

checkpoint_1_125000.pt +3 -0
dict.txt +0 -0
get_json_file.py +9 -0
sentencepiece.bpe.model +3 -0
shard_data.py +22 -0

checkpoint_1_125000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:704e1dfc96819022f6b52b0a43f0dade574d8f49dcdb756ed014b86407eca204
+size 2395347755

dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

get_json_file.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import glob
+import json
+from sys import argv
+for split in ['train', 'valid']:
+    with open(f'json/{split}.json', 'w') as f:
+        data = [{'source': glob.glob(f'shard/{split}/*'), 'source_lang': 'en', 'weight': 1.0, 'name': '16gb-en'}]
+        json.dump(data, f, indent=4)

sentencepiece.bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a60b4d1d1d8f70c8b2569c94540d4d9b7c694fd32e7a428ad0dcffaafaa3beb
+size 1363614

shard_data.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from sys import argv
+filename = argv[1]
+num_line = argv[2]
+output_dir = argv[3]
+lines = open(filename).read().strip().split('\n')
+ckpt = 0
+shard_lines = []
+for i, line in enumerate(lines):
+    if line == '' and (i-ckpt)>=int(num_line):
+        shard_lines.append(lines[ckpt:i+1])
+        ckpt = i+1
+if ckpt < len(lines) - 1:
+    shard_lines.append(lines[ckpt:])
+for i, doc in enumerate(shard_lines):
+    with open(f'{output_dir}/{i:06}.txt', 'w') as f:
+        print('\n'.join(doc), file=f, end='\n')