tianhaowang commited on
Commit
407ee90
·
1 Parent(s): 903289f
fixtures/pretrain_d0.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ text
2
+ Curated datasets accelerate small language model research.
3
+ Quality data sampling helps estimate scaling trends.
fixtures/pretrain_d0.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Curated datasets accelerate small language model research."}
2
+ {"text": "Quality data sampling helps estimate scaling trends."}
fixtures/pretrain_dk.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Synthetic corpora allow quick smoke tests for pretraining."}
2
+ {"text": "Short documents ensure jobs finish within seconds."}
fixtures/pretrain_test.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Evaluation samples validate perplexity metrics."}
2
+ {"text": "UI uploads should remain lightweight for dev iterations."}
jobs/train.py CHANGED
@@ -166,7 +166,7 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
166
  gradient_accumulation_steps=1,
167
  learning_rate=2e-5,
168
  num_train_epochs=1,
169
- evaluation_strategy="epoch",
170
  logging_steps=10,
171
  save_strategy="no",
172
  report_to=["none"],
@@ -239,7 +239,7 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
239
  gradient_accumulation_steps=2,
240
  learning_rate=3e-5,
241
  num_train_epochs=1,
242
- evaluation_strategy="epoch",
243
  logging_steps=25,
244
  save_strategy="no",
245
  report_to=["none"],
@@ -288,7 +288,6 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
288
  gradient_accumulation_steps=4,
289
  learning_rate=2e-5,
290
  num_train_epochs=1,
291
- evaluation_strategy="no",
292
  logging_steps=50,
293
  save_strategy="no",
294
  report_to=["none"],
 
166
  gradient_accumulation_steps=1,
167
  learning_rate=2e-5,
168
  num_train_epochs=1,
169
+ eval_strategy="epoch",
170
  logging_steps=10,
171
  save_strategy="no",
172
  report_to=["none"],
 
239
  gradient_accumulation_steps=2,
240
  learning_rate=3e-5,
241
  num_train_epochs=1,
242
+ eval_strategy="epoch",
243
  logging_steps=25,
244
  save_strategy="no",
245
  report_to=["none"],
 
288
  gradient_accumulation_steps=4,
289
  learning_rate=2e-5,
290
  num_train_epochs=1,
 
291
  logging_steps=50,
292
  save_strategy="no",
293
  report_to=["none"],
record.md CHANGED
@@ -236,4 +236,4 @@ Traceback (most recent call last):
236
  File "<string>", line 1, in <module>
237
  AttributeError: type object 'CheckboxGroup' has no attribute 'update'
238
  === Application stopped (exit code: 1) at 2025-09-28 02:22:46.235131589 UTC ===
239
- ```
 
236
  File "<string>", line 1, in <module>
237
  AttributeError: type object 'CheckboxGroup' has no attribute 'update'
238
  === Application stopped (exit code: 1) at 2025-09-28 02:22:46.235131589 UTC ===
239
+ ```
+ - Updated `jobs/train.py` to align with Transformers 4.56 (`eval_strategy`) and executed a local pretraining smoke test using `sshleifer/tiny-gpt2` on synthetic datasets (`tmp/pretrain_smoke/*`), stubbing artifact upload.
tmp/pretrain_smoke/catalog.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "tmp/pretrain_smoke/dk",
4
+ "task": "pretraining",
5
+ "license": "test",
6
+ "size_hint": "4",
7
+ "columns": {"text": "text"}
8
+ }
9
+ ]
tmp/pretrain_smoke/d0/cache-63766c6e12303287.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f0c983d99a2e4ccb2aa4c449c81064dcaf6d20ed8861015f50cc77a9e074a69
3
+ size 6024
tmp/pretrain_smoke/d0/cache-eaeaa8c5a0330819.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bcf2a452f4a0ce1e14ee116102daed98599af117e43ef94f3f7b9fe8ebc599
3
+ size 464
tmp/pretrain_smoke/d0/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0169024be346e99f73223f3d0c7617a5af9d812919aca101b35014f6246faeb8
3
+ size 536
tmp/pretrain_smoke/d0/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/d0/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "5ea2e3a2a464e2f0",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
tmp/pretrain_smoke/dk/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85a4022c9df546f7f33e72f441b48f8c3526d8e8d5f86ce08a2d5e001b4a1276
3
+ size 544
tmp/pretrain_smoke/dk/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/dk/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "4c79c0f45a298d32",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
tmp/pretrain_smoke/run/scaling_law.png ADDED
tmp/pretrain_smoke/run/size-2/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 2,
16
+ "n_head": 2,
17
+ "n_inner": null,
18
+ "n_layer": 2,
19
+ "n_positions": 1024,
20
+ "pad_token_id": 50256,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "transformers_version": "4.56.2",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
tmp/pretrain_smoke/run/size-2/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.56.2"
9
+ }
tmp/pretrain_smoke/run/size-2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/size-2/metrics.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "loss": 10.838460922241211,
3
+ "perplexity": 50942.912511932205,
4
+ "f1": 0.0,
5
+ "exact_match": 0.0
6
+ }
tmp/pretrain_smoke/run/size-2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00390c8bb65009158e974f07d1e724ab360971ba5b2284abcdd95613b73e5dc5
3
+ size 413296
tmp/pretrain_smoke/run/size-2/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tmp/pretrain_smoke/run/size-2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/size-2/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "extra_special_tokens": {},
19
+ "model_max_length": 1024,
20
+ "pad_token": "<|endoftext|>",
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
tmp/pretrain_smoke/run/size-2/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95e4a00d19ecbebac404296e923cbcab5be31e1fa8f533746523d5d511c3c187
3
+ size 5841
tmp/pretrain_smoke/run/size-2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/summary.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sshleifer/tiny-gpt2",
3
+ "task": "pretraining",
4
+ "d0": "tmp/pretrain_smoke/d0",
5
+ "dk": "tmp/pretrain_smoke/dk",
6
+ "test_dataset": "tmp/pretrain_smoke/test",
7
+ "metrics": [
8
+ "loss",
9
+ "perplexity"
10
+ ],
11
+ "primary_metric": "loss",
12
+ "runs": [
13
+ {
14
+ "size": 2,
15
+ "metrics": {
16
+ "loss": 10.838460922241211,
17
+ "perplexity": 50942.912511932205,
18
+ "f1": 0.0,
19
+ "exact_match": 0.0
20
+ }
21
+ }
22
+ ],
23
+ "scaling": {
24
+ "coefficients": [
25
+ 10.838460922241211
26
+ ],
27
+ "prediction": 10.838460922241211
28
+ },
29
+ "generated_at": "2025-09-28T02:52:17.333083Z",
30
+ "plot": "scaling_law.png"
31
+ }
tmp/pretrain_smoke/test/cache-854da9a57549cb9d.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e641b083c9ea8644f488623b79f2bf3cdc72212689a28424e46bdf559ef2c81
3
+ size 6024
tmp/pretrain_smoke/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b88e0765399e913bd461b478c80014fe2e21d1e3d32c640c376eca9510d14e1
3
+ size 512
tmp/pretrain_smoke/test/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/test/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "98df13e564c273aa",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }