attn_pdrop: 0.1
b1: 0.9
b2: 0.95
batch_size: 4096
blocks: 6
d_model: 128
embd_pdrop: 0.1
epochs: 50000
heads: 4
lr: 0.0003
model_type: gpt
num_final_chars_in_dataset: 2
num_workers: 4
resid_pdrop: 0.1
stoi:
  .: 0
  a: 1
  b: 2
  c: 3
  d: 4
  e: 5
  f: 6
  g: 7
  h: 8
  i: 9
  j: 10
  k: 11
  l: 12
  m: 13
  n: 14
  o: 15
  p: 16
  q: 17
  r: 18
  s: 19
  t: 20
  u: 21
  v: 22
  w: 23
  x: 24
  y: 25
  z: 26
vocab: 27
weight_decay: 0.1
window: 32
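
For reference, here is a minimal sketch of how a config like this might be consumed. It loads the YAML with PyYAML and uses the `stoi` table for character-level tokenization, inverting it to get the decoding table; the `vocab: 27` entry is consistent with the 26 lowercase letters plus the `.` token. The file name `config.yaml` and the `encode`/`decode` helper names are assumptions for illustration, not part of the original project.

```python
# A minimal sketch, assuming the config above is saved as config.yaml.
# PyYAML is the only dependency; encode/decode are illustrative helper
# names, not taken from any original codebase.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

stoi = cfg["stoi"]                        # char -> id, e.g. {'.': 0, 'a': 1, ...}
itos = {i: ch for ch, i in stoi.items()}  # inverse table for decoding

# Sanity check: the mapping should cover the declared vocabulary size.
assert len(stoi) == cfg["vocab"]  # 26 letters + '.' = 27

def encode(text: str) -> list[int]:
    """Map a string to token ids using the stoi table."""
    return [stoi[ch] for ch in text]

def decode(ids: list[int]) -> str:
    """Map token ids back to a string using the inverse table."""
    return "".join(itos[i] for i in ids)

print(encode("cab."))        # [3, 1, 2, 0]
print(decode([3, 1, 2, 0]))  # cab.
```

Note that `b1`/`b2` read as Adam-style beta coefficients and `window: 32` as the model's context length; under those assumptions the remaining keys map directly onto optimizer and transformer hyperparameters (`blocks`, `heads`, `d_model`, the three dropout rates).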