Spaces:
Sleeping
Sleeping
Commit
·
ddb0136
1
Parent(s):
84f0b80
initial calculator
Browse files
- calculator.py +0 -0
- defaults.py +5 -5
- state.py +17 -3
calculator.py
ADDED
|
File without changes
|
defaults.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
from state import ModelState
|
| 2 |
|
| 3 |
-
GEMMA3_270M = ModelState(vocab_size=256000, num_layers=9,
|
| 4 |
-
GEMMA3_1B = ModelState(vocab_size=262208, num_layers=26,
|
| 5 |
-
GEMMA3_4B = ModelState(vocab_size=262208, num_layers=28,
|
| 6 |
-
GEMMA3_12B = ModelState(vocab_size=262208, num_layers=42,
|
| 7 |
-
GEMMA3_27B = ModelState(vocab_size=262208, num_layers=46,
|
| 8 |
|
| 9 |
DEFAULTS = {
|
| 10 |
"Gemma3 270M": GEMMA3_270M,
|
|
|
|
| 1 |
from state import ModelState
|
| 2 |
|
| 3 |
+
GEMMA3_270M = ModelState(vocab_size=256000, num_layers=9, hidden_dim=1152, intermediate_size=4608)
|
| 4 |
+
GEMMA3_1B = ModelState(vocab_size=262208, num_layers=26, hidden_dim=2304, intermediate_size=9216)
|
| 5 |
+
GEMMA3_4B = ModelState(vocab_size=262208, num_layers=28, hidden_dim=3072, intermediate_size=12288)
|
| 6 |
+
GEMMA3_12B = ModelState(vocab_size=262208, num_layers=42, hidden_dim=4608, intermediate_size=18432)
|
| 7 |
+
GEMMA3_27B = ModelState(vocab_size=262208, num_layers=46, hidden_dim=6144, intermediate_size=24576)
|
| 8 |
|
| 9 |
DEFAULTS = {
|
| 10 |
"Gemma3 270M": GEMMA3_270M,
|
state.py
CHANGED
|
@@ -1,8 +1,22 @@
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
|
| 3 |
@dataclass
|
| 4 |
-
class
|
| 5 |
vocab_size: int
|
| 6 |
num_layers: int
|
| 7 |
-
|
| 8 |
-
intermediate_size: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from dataclasses import dataclass
|
| 2 |
|
| 3 |
@dataclass
|
| 4 |
+
class Model:
|
| 5 |
vocab_size: int
|
| 6 |
num_layers: int
|
| 7 |
+
hidden_dim: int
|
| 8 |
+
intermediate_size: int
|
| 9 |
+
weight_tied_embeddings: bool
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class Parallelism:
|
| 14 |
+
tensor_parallelism: int
|
| 15 |
+
pipeline_parallelism: int
|
| 16 |
+
context_parallelism: int
|
| 17 |
+
expert_parallelism: int
|
| 18 |
+
|
| 19 |
+
@dataclass
|
| 20 |
+
class Training:
|
| 21 |
+
sequence_length: int
|
| 22 |
+
batch_size: int
|