nicholasKluge
committed on
Commit
•
eb32620
1
Parent(s):
36f7a59
Update README.md
Browse files
README.md
CHANGED
@@ -66,37 +66,33 @@ This repository has the [source code](https://github.com/Nkluge-correa/Aira) use
|
|
66 |
|
67 |
## Training Set-up
|
68 |
|
69 |
-
|
|
70 |
-
|
71 |
-
|
|
72 |
-
|
|
73 |
-
|
|
74 |
-
| |
|
75 |
-
|
|
76 |
-
|
|
77 |
-
|
|
78 |
-
|
|
79 |
-
|
|
80 |
-
|
|
81 |
-
|
|
82 |
-
|
|
83 |
-
|
|
84 |
-
|
|
85 |
-
|
|
86 |
-
|
|
87 |
-
|
|
88 |
-
|
|
89 |
-
|
|
90 |
-
|
|
91 |
-
|
|
92 |
-
|
|
93 |
-
|
|
94 |
-
|
|
95 |
-
|
96 |
-
| | seed | 42 |
|
97 |
-
| | mixed_precision | 'no' |
|
98 |
-
| | checkpointing_steps | 22000 |
|
99 |
-
| | tf32 | true |
|
100 |
|
101 |
## Basic usage
|
102 |
|
|
|
66 |
|
67 |
## Training Set-up
|
68 |
|
69 |
+
| Arguments | Value |
|
70 |
+
|-------------------------------|--------------------------------------|
|
71 |
+
| vocabulary size | 32000 |
|
72 |
+
| hidden dimension size | 768 |
|
73 |
+
| intermediate dimension size | 3072 |
|
74 |
+
| context length | 2048 |
|
75 |
+
| nº attention heads | 12 |
|
76 |
+
| nº hidden layers | 12 |
|
77 |
+
| nº key value heads | 12 |
|
78 |
+
| nº training samples | 1831873 |
|
79 |
+
| nº validation samples | 18000 |
|
80 |
+
| nº epochs | 1 |
|
81 |
+
| evaluation steps | 100000 |
|
82 |
+
| train batch size | 4 |
|
83 |
+
| eval batch size | 4 |
|
84 |
+
| gradient accumulation steps | 1 |
|
85 |
+
| learning rate | 0.0006 |
|
86 |
+
| adam epsilon | 0.00000001 |
|
87 |
+
| weight decay | 0.01 |
|
88 |
+
| scheduler type | "cosine" |
|
89 |
+
| warmup ratio | 0.01 |
|
90 |
+
| gradient checkpointing | false |
|
91 |
+
| seed | 42 |
|
92 |
+
| mixed precision | "no" |
|
93 |
+
| torch dtype | "float32" |
|
94 |
+
| tf32 | true |
|
95 |
+
|
|
|
|
|
|
|
|
|
96 |
|
97 |
## Basic usage
|
98 |
|