tianhaowang commited on
Commit
407ee90
·
1 Parent(s): 903289f
fixtures/pretrain_d0.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ text
2
+ Curated datasets accelerate small language model research.
3
+ Quality data sampling helps estimate scaling trends.
fixtures/pretrain_d0.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Curated datasets accelerate small language model research."}
2
+ {"text": "Quality data sampling helps estimate scaling trends."}
fixtures/pretrain_dk.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Synthetic corpora allow quick smoke tests for pretraining."}
2
+ {"text": "Short documents ensure jobs finish within seconds."}
fixtures/pretrain_test.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"text": "Evaluation samples validate perplexity metrics."}
2
+ {"text": "UI uploads should remain lightweight for dev iterations."}
jobs/train.py CHANGED
@@ -166,7 +166,7 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
166
  gradient_accumulation_steps=1,
167
  learning_rate=2e-5,
168
  num_train_epochs=1,
169
- evaluation_strategy="epoch",
170
  logging_steps=10,
171
  save_strategy="no",
172
  report_to=["none"],
@@ -239,7 +239,7 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
239
  gradient_accumulation_steps=2,
240
  learning_rate=3e-5,
241
  num_train_epochs=1,
242
- evaluation_strategy="epoch",
243
  logging_steps=25,
244
  save_strategy="no",
245
  report_to=["none"],
@@ -288,7 +288,6 @@ def run_training(model_name: str, task: str, d0: str, dk: str, size: int, output
288
  gradient_accumulation_steps=4,
289
  learning_rate=2e-5,
290
  num_train_epochs=1,
291
- evaluation_strategy="no",
292
  logging_steps=50,
293
  save_strategy="no",
294
  report_to=["none"],
 
166
  gradient_accumulation_steps=1,
167
  learning_rate=2e-5,
168
  num_train_epochs=1,
169
+ eval_strategy="epoch",
170
  logging_steps=10,
171
  save_strategy="no",
172
  report_to=["none"],
 
239
  gradient_accumulation_steps=2,
240
  learning_rate=3e-5,
241
  num_train_epochs=1,
242
+ eval_strategy="epoch",
243
  logging_steps=25,
244
  save_strategy="no",
245
  report_to=["none"],
 
288
  gradient_accumulation_steps=4,
289
  learning_rate=2e-5,
290
  num_train_epochs=1,
 
291
  logging_steps=50,
292
  save_strategy="no",
293
  report_to=["none"],
record.md CHANGED
@@ -236,4 +236,4 @@ Traceback (most recent call last):
236
  File "<string>", line 1, in <module>
237
  AttributeError: type object 'CheckboxGroup' has no attribute 'update'
238
  === Application stopped (exit code: 1) at 2025-09-28 02:22:46.235131589 UTC ===
239
- ```
 
236
  File "<string>", line 1, in <module>
237
  AttributeError: type object 'CheckboxGroup' has no attribute 'update'
238
  === Application stopped (exit code: 1) at 2025-09-28 02:22:46.235131589 UTC ===
239
+ ```
+ - Updated `jobs/train.py` to align with Transformers 4.56 (`eval_strategy`) and executed a local pretraining smoke test using `sshleifer/tiny-gpt2` on synthetic datasets (`tmp/pretrain_smoke/*`), stubbing artifact upload.
tmp/pretrain_smoke/catalog.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "tmp/pretrain_smoke/dk",
4
+ "task": "pretraining",
5
+ "license": "test",
6
+ "size_hint": "4",
7
+ "columns": {"text": "text"}
8
+ }
9
+ ]
tmp/pretrain_smoke/d0/cache-63766c6e12303287.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f0c983d99a2e4ccb2aa4c449c81064dcaf6d20ed8861015f50cc77a9e074a69
3
+ size 6024
tmp/pretrain_smoke/d0/cache-eaeaa8c5a0330819.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bcf2a452f4a0ce1e14ee116102daed98599af117e43ef94f3f7b9fe8ebc599
3
+ size 464
tmp/pretrain_smoke/d0/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0169024be346e99f73223f3d0c7617a5af9d812919aca101b35014f6246faeb8
3
+ size 536
tmp/pretrain_smoke/d0/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/d0/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "5ea2e3a2a464e2f0",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
tmp/pretrain_smoke/dk/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85a4022c9df546f7f33e72f441b48f8c3526d8e8d5f86ce08a2d5e001b4a1276
3
+ size 544
tmp/pretrain_smoke/dk/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/dk/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "4c79c0f45a298d32",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
tmp/pretrain_smoke/run/scaling_law.png ADDED
tmp/pretrain_smoke/run/size-2/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 2,
16
+ "n_head": 2,
17
+ "n_inner": null,
18
+ "n_layer": 2,
19
+ "n_positions": 1024,
20
+ "pad_token_id": 50256,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "transformers_version": "4.56.2",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
tmp/pretrain_smoke/run/size-2/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.56.2"
9
+ }
tmp/pretrain_smoke/run/size-2/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/size-2/metrics.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "loss": 10.838460922241211,
3
+ "perplexity": 50942.912511932205,
4
+ "f1": 0.0,
5
+ "exact_match": 0.0
6
+ }
tmp/pretrain_smoke/run/size-2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00390c8bb65009158e974f07d1e724ab360971ba5b2284abcdd95613b73e5dc5
3
+ size 413296
tmp/pretrain_smoke/run/size-2/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tmp/pretrain_smoke/run/size-2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/size-2/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ }
13
+ },
14
+ "bos_token": "<|endoftext|>",
15
+ "clean_up_tokenization_spaces": false,
16
+ "eos_token": "<|endoftext|>",
17
+ "errors": "replace",
18
+ "extra_special_tokens": {},
19
+ "model_max_length": 1024,
20
+ "pad_token": "<|endoftext|>",
21
+ "tokenizer_class": "GPT2Tokenizer",
22
+ "unk_token": "<|endoftext|>"
23
+ }
tmp/pretrain_smoke/run/size-2/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95e4a00d19ecbebac404296e923cbcab5be31e1fa8f533746523d5d511c3c187
3
+ size 5841
tmp/pretrain_smoke/run/size-2/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tmp/pretrain_smoke/run/summary.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "sshleifer/tiny-gpt2",
3
+ "task": "pretraining",
4
+ "d0": "tmp/pretrain_smoke/d0",
5
+ "dk": "tmp/pretrain_smoke/dk",
6
+ "test_dataset": "tmp/pretrain_smoke/test",
7
+ "metrics": [
8
+ "loss",
9
+ "perplexity"
10
+ ],
11
+ "primary_metric": "loss",
12
+ "runs": [
13
+ {
14
+ "size": 2,
15
+ "metrics": {
16
+ "loss": 10.838460922241211,
17
+ "perplexity": 50942.912511932205,
18
+ "f1": 0.0,
19
+ "exact_match": 0.0
20
+ }
21
+ }
22
+ ],
23
+ "scaling": {
24
+ "coefficients": [
25
+ 10.838460922241211
26
+ ],
27
+ "prediction": 10.838460922241211
28
+ },
29
+ "generated_at": "2025-09-28T02:52:17.333083Z",
30
+ "plot": "scaling_law.png"
31
+ }
tmp/pretrain_smoke/test/cache-854da9a57549cb9d.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e641b083c9ea8644f488623b79f2bf3cdc72212689a28424e46bdf559ef2c81
3
+ size 6024
tmp/pretrain_smoke/test/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b88e0765399e913bd461b478c80014fe2e21d1e3d32c640c376eca9510d14e1
3
+ size 512
tmp/pretrain_smoke/test/dataset_info.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "text": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ }
9
+ },
10
+ "homepage": "",
11
+ "license": ""
12
+ }
tmp/pretrain_smoke/test/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "98df13e564c273aa",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }