Andrew DalPino committed · Commit ab12a97 · Parent(s): 3325763

Broad improvements

Files changed:
- README.md +15 -10
- beam_search.py +10 -3
- data.py +10 -16
- generate.py +10 -3
- instruction-tune.py +24 -10
- pre-train.py +17 -5

README.md CHANGED

@@ -1,7 +1,7 @@
 ---
 license: apache-2.0
 datasets:
--
+- HuggingFaceFW/fineweb
 - tatsu-lab/alpaca
 language:
 - en
@@ -10,7 +10,6 @@ metrics:
 pipeline_tag: text-generation
 tags:
 - LightGPT
-- Open-source
 ---
 # LightGPT

@@ -28,7 +27,7 @@ LightGPT is a lightweight generative pre-trained Transformer (GPT) model for the

 Below is a table of recommended default model training configurations but feel free to experiment with settings on your own. See the `model_sizing.ipynb` notebook to estimate the memory and compute requirements for your model configuration.

-| Name | Vocab. Size | Block Size | Embedding Dim. | Attn. Heads | Layers |
+| Name | Vocab. Size | Block Size | Embedding Dim. | Attn. Heads | Layers | Parameters | Training Tokens |
 |---|---|---|---|---|---|---|---|
 | Small | 50,257 | 1024 | 1024 | 16 | 32 | 454M | 10B |
 | Medium | 50,257 | 1024 | 2048 | 32 | 32 | 1.7B | 20B |
@@ -57,9 +56,9 @@ For the pre-training corpus we use the Fineweb dataset which consists of about 1
 python pre-train.py
 ```

-
+**Note** that it will take a while to download and pre-process the dataset the first time that the training script is run.

-To customize the default "
+To customize the default "Small" architecture you can adjust the `block_size`, `embedding_dimensions`, `num_hidden_layers`, and `num_attention_heads` arguments of the pre-training script.

 ```
 python pre-train.py --block_size=2048 --embedding_dimensions=4096 --num_hidden_layers=64 --num_attention_heads=64
@@ -71,13 +70,13 @@ You can also adjust the `batch_size`, `learning_rate`, and `gradient_accumulatio
 python pre-train.py --batch_size=32 --learning_rate=0.01 --gradient_accumulation_steps=128
 ```

-For distributed training, use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel session. The example below is for executing the training script on a single node with individual
+For distributed training, use PyTorch's [torchrun](https://pytorch.org/docs/stable/elastic/run.html) extension to launch a distributed data parallel (DDP) session. The example below is for executing the training script on a single node with 8 individual GPUs.

 ```
 torchrun --standalone --nnodes=1 --nproc-per-node=8 pre-train.py --batch_size=16 --gradient_accumulation_steps=128
 ```

-
+**Note** that when training in data-parallel mode it's important that the world size divides evenly into `gradient_accumulation_steps` for maximum performance. For example, if we have an 8 GPU cluster, we could perform 32 gradient accumulation steps in exactly 4 passes over the network.

 ## Text Generation

@@ -108,7 +107,9 @@ Soon ...
 | --batch_size | 1 | int | The number of samples to pass through the network at a time. |
 | --gradient_accumulation_steps | 128 | int | The number of batches to pass through the network before updating the weights. |
 | --samples_per_epoch | 4096 | int | The number of training samples to pass through the network every epoch. |
-| --learning_rate | 5e-4 | float | The
+| --learning_rate | 5e-4 | float | The learning rate of the Adafactor optimizer. |
+| --rms_decay | -0.8 | float | The decay rate of the RMS coefficient of the Adafactor optimizer. |
+| --optimizer_low_memory | True | bool | Should the optimizer reduce its memory consumption in exchange for a slightly slower runtime? |
 | --max_gradient_norm | 1.0 | float | Clip gradients above this threshold before stepping. |
 | --num_epochs | 2384 | int | The number of epochs to train for. |
 | --eval_interval | 10 | int | Evaluate the model after this many epochs on the testing set. |
@@ -117,7 +118,7 @@ Soon ...
 | --num_attention_heads | 16 | int | The number of attention heads within every block. |
 | --num_hidden_layers | 32 | int | The number of attention/MLP blocks within the hidden layer of the network. |
 | --dropout | 0.1 | float | The proportion of signals to send to zero during training as regularization. |
-| --activation_checkpointing | False | bool | Should we use activation checkpointing? This will drastically
+| --activation_checkpointing | False | bool | Should we use activation checkpointing? This will drastically reduce memory utilization during training at the cost of needing to recompute the forward pass. |
 | --ddp_sharding_level | 2 | int | The level of sharding to use for DDP training. Options are 2 or 3 for partial and full sharding respectively, or 0 for no sharding. |
 | --checkpoint_interval | 20 | int | Save the model parameters to disk every this many epochs. |
 | --checkpoint_path | "./out/checkpoint.pt" | str | The path to the checkpoint file on disk. |
@@ -132,12 +133,15 @@ Soon ...
 | --base_model_path | "./out/checkpoint.pt" | string | The path to the pre-trained model. |
 | --batch_size | 1 | int | The number of samples to pass through the network at a time. |
 | --gradient_accumulation_steps | 128 | int | The number of batches to pass through the network before updating the weights. |
-| --learning_rate | 5e-4 | float | The
+| --learning_rate | 5e-4 | float | The learning rate of the Adafactor optimizer. |
+| --rms_decay | -0.8 | float | The decay rate of the RMS coefficient of the Adafactor optimizer. |
+| --optimizer_low_memory | True | bool | Should the optimizer reduce its memory consumption in exchange for a slightly slower runtime? |
 | --mask_input | False | bool | Should we mask the input part of the sample i.e. only train on the output? |
 | --rank | 8 | int | The rank of the LoRA decomposition matrices. |
 | --alpha | 1.0 | float | The strength of the LoRA signal. |
 | --dropout | 0.05 | float | The proportion of signals to send to zero during training as regularization. |
 | --num_epochs | 4 | int | The number of epochs to train for. |
+| --activation_checkpointing | False | bool | Should we use activation checkpointing? This will drastically reduce memory utilization during training at the cost of needing to recompute the forward pass. |
 | --eval_interval | 1 | int | Evaluate the model after this many epochs on the testing set. |
 | --checkpoint_interval | 1 | int | Save the model parameters to disk every this many epochs. |
 | --checkpoint_path | "./out/lora_instruction.pt" | string | The path to the checkpoint file on disk. |
@@ -171,6 +175,7 @@ Soon ...
 | --seed | None | int | The seed for the random number generator. |

 ## References:
+>- G. Penedo, et al. The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale, 38th Conference on Neural Information Processing Systems (NeurIPS 2024) Track on Datasets and Benchmarks.
 >- A. Radford, et al. Language Models are Unsupervised Multitask Learners, OpenAI, 2019.
 >- T. Brown, et al. Language Models are Few-Shot Learners. OpenAI, 2020.
 >- A. Kazemnejad, et al. The Impact of Positional Encoding on Length Generalization in Transformers, 37th Conference on Neural Information Processing Systems (NeurIPS 2023).
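
The note about matching `gradient_accumulation_steps` to the world size comes down to simple arithmetic: each optimizer step consumes `gradient_accumulation_steps` micro-batches, and with `world_size` GPUs running in parallel each rank only needs `gradient_accumulation_steps / world_size` forward/backward passes per step, which is an even split only when the world size divides the accumulation steps. A minimal sketch of that check (variable names are illustrative, not taken from the repository):

```python
# Illustrative only: verify that gradient accumulation splits evenly across ranks.
world_size = 8                    # e.g. a single node with 8 GPUs
gradient_accumulation_steps = 32  # micro-batches per optimizer step

passes_per_gpu, remainder = divmod(gradient_accumulation_steps, world_size)

if remainder != 0:
    print("Warning: accumulation steps do not divide evenly across the ranks.")
else:
    # 32 accumulation steps on 8 GPUs -> 4 passes per GPU per optimizer step.
    print(f"Each GPU runs {passes_per_gpu} passes per optimizer step.")
```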

beam_search.py CHANGED

@@ -37,12 +37,12 @@ def main():
     torch.manual_seed(args.seed)
     random.seed(args.seed)

-    tokenizer = tiktoken.get_encoding(Alpaca.ENCODING)
-
     checkpoint = torch.load(
         args.checkpoint_path, map_location=args.device, weights_only=True
     )

+    tokenizer = tiktoken.get_encoding(checkpoint["token_encoding"])
+
     model = GPT(**checkpoint["model_args"])

     model = torch.compile(model)
@@ -74,7 +74,14 @@ def main():
     prompt = input("Enter a prompt: ")

     if args.lora_path:
-
+        context = input("Additional context (leave blank for none): ")
+
+        if len(context) > 0:
+            prompt = Alpaca.PROMPT_TEMPLATE_WITH_INPUT.format(
+                input=context, instruction=prompt
+            )
+        else:
+            prompt = Alpaca.PROMPT_TEMPLATE.format(instruction=prompt)

     prompt = tokenizer.encode_ordinary(prompt)


data.py CHANGED

@@ -5,7 +5,7 @@ from copy import deepcopy

 from datasets import load_dataset

-import
+from tiktoken import Encoding

 import numpy as np

@@ -28,12 +28,12 @@ class Fineweb(IterableDataset):

     def __init__(
         self,
+        tokenizer: Encoding,
         root_path: str = "./dataset",
         subset: str | None = "sample-10BT",
         split: str = "train",
         tokens_per_sample: int = 1024,
         samples_per_epoch: int = 4096,
-        token_encoding: str = "r50k_base",
         num_processes: int = 8,
     ):
         super().__init__()
@@ -51,15 +51,12 @@ class Fineweb(IterableDataset):
         if samples_per_epoch < 1:
             raise ValueError(f"Samples per epoch must be greater than 0.")

-        if token_encoding not in ("r50k_base", "cl100k_base", "o200k_base"):
-            raise ValueError(f"Invalid token encoding, {token_encoding} given.")
-
-        self.tokenizer = tiktoken.get_encoding(token_encoding)
-
         dataset_name = f"fineweb-{subset}" if subset != None else "fineweb"

-        train_path = path.join(root_path, f"{dataset_name}-train-{
-        test_path = path.join(root_path, f"{dataset_name}-test-{
+        train_path = path.join(root_path, f"{dataset_name}-train-{tokenizer.name}.bin")
+        test_path = path.join(root_path, f"{dataset_name}-test-{tokenizer.name}.bin")
+
+        self.tokenizer = tokenizer

         if not path.exists(train_path) or not path.exists(test_path):
             dataset = load_dataset(
@@ -70,7 +67,7 @@ class Fineweb(IterableDataset):
             ).map(
                 self.tokenize,
                 desc="Tokenizing",
-                remove_columns=["text"],
+                remove_columns=["text", "token_count"],
                 num_proc=num_processes,
             )

@@ -172,9 +169,9 @@ class Alpaca(Dataset):

     def __init__(
         self,
+        tokenizer: Encoding,
         max_tokens_per_sample: int = 1024,
-
-        mask_input: bool = True,
+        mask_input: bool = False,
     ):
         super().__init__()

@@ -183,10 +180,7 @@ class Alpaca(Dataset):
                 f"Max tokens per sample must be greater than 0, {max_tokens_per_sample} given."
             )

-
-            raise ValueError(f"Invalid token encoding, {token_encoding} given.")
-
-        self.tokenizer = tiktoken.get_encoding(token_encoding)
+        self.tokenizer = tokenizer

         self.dataset = load_dataset(self.DATASET_NAME, split="train")

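
After this change both datasets take a ready-made `tiktoken` `Encoding` object instead of an encoding name, so one tokenizer instance can be shared between the dataset and the rest of a training script, and the Fineweb cache files are keyed by `tokenizer.name`. A rough usage sketch based only on the signatures visible in the diff (the argument values are examples, and instantiating `Fineweb` will trigger the download and tokenization pass):

```python
import tiktoken

from data import Fineweb, Alpaca

tokenizer = tiktoken.get_encoding("r50k_base")

# Pre-training corpus; the on-disk cache is now named after tokenizer.name.
training = Fineweb(tokenizer, split="train", tokens_per_sample=1024)

# Instruction-tuning corpus; note that mask_input now defaults to False.
dataset = Alpaca(tokenizer, max_tokens_per_sample=1024, mask_input=False)
```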

generate.py CHANGED

@@ -38,12 +38,12 @@ def main():
     torch.manual_seed(args.seed)
     random.seed(args.seed)

-    tokenizer = tiktoken.get_encoding(Alpaca.ENCODING)
-
     checkpoint = torch.load(
         args.checkpoint_path, map_location=args.device, weights_only=True
     )

+    tokenizer = tiktoken.get_encoding(checkpoint["token_encoding"])
+
     model = GPT(**checkpoint["model_args"])

     model = torch.compile(model)
@@ -75,7 +75,14 @@ def main():
     prompt = input("Enter a prompt: ")

     if args.lora_path:
-
+        context = input("Additional context (leave blank for none): ")
+
+        if len(context) > 0:
+            prompt = Alpaca.PROMPT_TEMPLATE_WITH_INPUT.format(
+                input=context, instruction=prompt
+            )
+        else:
+            prompt = Alpaca.PROMPT_TEMPLATE.format(instruction=prompt)

     prompt = tokenizer.encode_ordinary(prompt)

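
Both `generate.py` and `beam_search.py` now wrap the user's instruction, plus optional context, in the Alpaca prompt template before tokenizing when a LoRA checkpoint is supplied. The real template strings live on the `Alpaca` class in `data.py` and are not shown in this diff; the sketch below only mirrors the branching logic, with placeholder templates:

```python
# Placeholder templates; the actual strings are defined on the Alpaca class.
PROMPT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Response:\n"
PROMPT_TEMPLATE_WITH_INPUT = (
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)


def build_prompt(instruction: str, context: str = "") -> str:
    """Mirror the branching added to generate.py and beam_search.py."""
    if len(context) > 0:
        return PROMPT_TEMPLATE_WITH_INPUT.format(input=context, instruction=instruction)

    return PROMPT_TEMPLATE.format(instruction=instruction)


print(build_prompt("Summarize the plot of Hamlet."))
```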

instruction-tune.py CHANGED

@@ -21,17 +21,20 @@ from tqdm import tqdm


 def main():
-    parser = ArgumentParser(description="Instruction-tune the
+    parser = ArgumentParser(description="Instruction-tune the GPT.")

     parser.add_argument("--base_model_path", default="./out/checkpoint.pt", type=str)
     parser.add_argument("--batch_size", default=1, type=int)
     parser.add_argument("--gradient_accumulation_steps", default=128, type=int)
     parser.add_argument("--learning_rate", default=5e-4, type=float)
-    parser.add_argument("--
+    parser.add_argument("--rms_decay", default=-0.8, type=float)
+    parser.add_argument("--optimizer_low_memory", default=True, type=bool)
+    parser.add_argument("--mask_input", default=False, type=bool)
+    parser.add_argument("--num_epochs", default=4, type=int)
     parser.add_argument("--rank", default=8, type=int)
     parser.add_argument("--alpha", default=1.0, type=float)
     parser.add_argument("--dropout", default=0.05, type=float)
-    parser.add_argument("--
+    parser.add_argument("--activation_checkpointing", action="store_true")
     parser.add_argument("--eval_interval", default=1, type=int)
     parser.add_argument("--checkpoint_interval", default=1, type=int)
     parser.add_argument(
@@ -66,7 +69,13 @@ def main():

     model_args = checkpoint["model_args"]

-
+    tokenizer = tiktoken.get_encoding(checkpoint["token_encoding"])
+
+    dataset = Alpaca(
+        tokenizer,
+        max_tokens_per_sample=model_args["block_size"],
+        mask_input=args.mask_input,
+    )

     training, testing = random_split(dataset, (0.9, 0.1))

@@ -85,7 +94,7 @@ def main():
         shuffle=False,
     )

-    model = GPT(**model_args)
+    model = GPT(**model_args, activation_checkpointing=args.activation_checkpointing)

     model = torch.compile(model)

@@ -104,11 +113,12 @@ def main():
         print("Compiling model")
         model.compile()

-
-
-
-
-
+    optimizer = Adafactor(
+        model.parameters(),
+        lr=args.learning_rate,
+        beta2_decay=args.rms_decay,
+        foreach=not args.optimizer_low_memory,
+    )

     starting_epoch = 1

@@ -125,6 +135,10 @@ def main():

     model.train()

+    print(f"Model has {model.num_trainable_params:,} trainable parameters")
+
+    perplexity_metric = Perplexity(ignore_index=dataset.PADDING_INDEX).to(args.device)
+
     print("Instruction-tuning ...")

     for epoch in range(starting_epoch, args.num_epochs + 1):
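
The new optimizer block maps the command-line flags directly onto PyTorch's `Adafactor`: `--learning_rate` becomes `lr`, `--rms_decay` becomes `beta2_decay`, and `--optimizer_low_memory` turns off the faster but more memory-hungry `foreach` implementation. A standalone sketch of the same construction (the linear layer is just a stand-in for the GPT model, and `torch.optim.Adafactor` requires a recent PyTorch release):

```python
import torch
from torch.optim import Adafactor

model = torch.nn.Linear(16, 16)  # stand-in for the GPT model

optimizer = Adafactor(
    model.parameters(),
    lr=5e-4,           # --learning_rate
    beta2_decay=-0.8,  # --rms_decay
    foreach=False,     # --optimizer_low_memory=True -> foreach=not True
)
```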

pre-train.py CHANGED

@@ -19,8 +19,10 @@ from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy

 from torchmetrics.text import Perplexity

-
+import tiktoken
+
 from data import Fineweb
+from model import GPT

 from tqdm import tqdm

@@ -41,7 +43,7 @@ def main():
     parser.add_argument(
         "--dataset_subset",
         default="sample-10BT",
-        choices=("sample-10BT", "sample-100BT", "sample-350BT"
+        choices=(None, "sample-10BT", "sample-100BT", "sample-350BT"),
     )
     parser.add_argument(
         "--token_encoding",
@@ -54,6 +56,8 @@ def main():
     parser.add_argument("--gradient_accumulation_steps", default=128, type=int)
     parser.add_argument("--samples_per_epoch", default=4096, type=int)
     parser.add_argument("--learning_rate", default=1e-2, type=float)
+    parser.add_argument("--rms_decay", default=-0.8, type=float)
+    parser.add_argument("--optimizer_low_memory", default=True, type=bool)
     parser.add_argument("--max_gradient_norm", default=1.0, type=float)
     parser.add_argument("--dropout", default=0.1, type=float)
     parser.add_argument("--num_epochs", default=2384, type=int)
@@ -149,22 +153,24 @@ def main():
     torch.manual_seed(args.seed)
     random.seed(args.seed)

+    tokenizer = tiktoken.get_encoding(args.token_encoding)
+
     training = Fineweb(
+        tokenizer,
         root_path=args.dataset_path,
         subset=args.dataset_subset,
         split="train",
         tokens_per_sample=args.block_size,
         samples_per_epoch=args.samples_per_epoch,
-        token_encoding=args.token_encoding,
         num_processes=args.num_dataset_processes,
     )
     testing = Fineweb(
+        tokenizer,
         root_path=args.dataset_path,
         subset=args.dataset_subset,
         split="test",
         tokens_per_sample=args.block_size,
         samples_per_epoch=args.samples_per_epoch,
-        token_encoding=args.token_encoding,
         num_processes=args.num_dataset_processes,
     )

@@ -209,7 +215,12 @@ def main():

     model = model.to(args.device)

-    optimizer = Adafactor(
+    optimizer = Adafactor(
+        model.parameters(),
+        lr=args.learning_rate,
+        beta2_decay=args.rms_decay,
+        foreach=not args.optimizer_low_memory,
+    )

     starting_epoch = 1

@@ -309,6 +320,7 @@ def main():
         "model_args": model_args,
         "model": model.state_dict(),
         "optimizer": optimizer.state_dict(),
+        "token_encoding": args.token_encoding,
     }

     torch.save(checkpoint, args.checkpoint_path)
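
With `token_encoding` stored in the checkpoint, the downstream scripts no longer hard-code a tokenizer; they rebuild whichever `tiktoken` encoding the pre-training run used. A minimal round-trip sketch of that mechanism (the paths and dictionary keys mirror the diff, everything else is illustrative):

```python
import tiktoken
import torch

# Saving side (pre-train.py): record which tiktoken encoding was used.
checkpoint = {
    "model_args": {"block_size": 1024},  # illustrative subset of the real dict
    "token_encoding": "r50k_base",
}
torch.save(checkpoint, "./out/checkpoint.pt")

# Loading side (generate.py, beam_search.py, instruction-tune.py).
checkpoint = torch.load("./out/checkpoint.pt", map_location="cpu", weights_only=True)

tokenizer = tiktoken.get_encoding(checkpoint["token_encoding"])

print(tokenizer.name)  # r50k_base
```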