attn_pdrop: 0.1
b1: 0.9
b2: 0.95
batch_size: 4096
blocks: 6
d_model: 128
embd_pdrop: 0.1
epochs: 50000
heads: 4
lr: 0.0003
model_type: gpt
num_final_chars_in_dataset: 2
num_workers: 4
resid_pdrop: 0.1
stoi:
  ' ': 1
  '''': 2
  '-': 3
  .: 0
  a: 4
  b: 5
  c: 6
  d: 7
  e: 8
  f: 9
  g: 10
  h: 11
  i: 12
  j: 13
  k: 14
  l: 15
  m: 16
  n: 17
  o: 18
  p: 19
  q: 20
  r: 21
  s: 22
  t: 23
  u: 24
  v: 25
  w: 26
  x: 27
  y: 28
  z: 29
vocab: 30
weight_decay: 0.1
window: 32