Coobyk committed on
Commit 4a88de0 · verified · 1 Parent(s): d02959a

Upload EpsteinGPT V1

Files changed (7)
  1. .gitattributes +1 -0
  2. EpsteinGPT.pt +3 -0
  3. EpsteinGPT.ptl +3 -0
  4. README.md +57 -3
  5. config.json +10 -0
  6. epsteingpt_tokenizer.json +0 -0
  7. model.py +123 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ EpsteinGPT.ptl filter=lfs diff=lfs merge=lfs -text
EpsteinGPT.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2760dea55d021be0ec5243890e9911eaa2bb88c7ea0a376a9eac023ad4f29aa3
+ size 336554701
EpsteinGPT.ptl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c668c8c6153d5ee229f9531304692b229b491ff6e2a13d8a565cb2e029995d18
+ size 113733862
README.md CHANGED
@@ -1,3 +1,57 @@
- ---
- license: mit
- ---
+ # EpsteinGPT - Minimal GPT Model
+
+ This repository contains a Minimal GPT (MVT) model trained on the Epstein email threads dataset.
+
+ ## Model Details
+
+ This is a custom-built causal Transformer model (`MinimalGPT`) inspired by the nanoGPT/minGPT architectures. It was trained from scratch using a custom Byte-Pair Encoding (BPE) tokenizer.
+
+ ### Configuration (`config.json`)
+ ```json
+ {
+ "vocab_size": 5000,
+ "block_size": 256,
+ "n_layer": 8,
+ "n_head": 8,
+ "n_embd": 512,
+ "batch_size": 16,
+ "dropout": 0.1,
+ "bias": false
+ }
+ ```
+
+ ## Files Included
+
+ * `epsteingpt_tokenizer.json`: The custom BPE tokenizer used for encoding and decoding text.
+ * `EpsteinGPT.pt`: The PyTorch checkpoint containing the trained model weights.
+ * `EpsteinGPT.ptl`: The TorchScript Lite version of the trained model, optimized for deployment.
+ * `model.py`: Defines the `MVTConfig` class and the `MinimalGPT` model architecture.
+ * `config.json`: Model configuration in JSON format.
+ * `README.md`: This file.
+
+ ## How to Use
+
+ To use this model, you would typically:
+
+ 1. Load the tokenizer:
+    ```python
+    from tokenizers import Tokenizer
+    tokenizer = Tokenizer.from_file("epsteingpt_tokenizer.json")
+    ```
+ 2. Load the model architecture and configuration (from `model.py` and `config.json`).
+ 3. Load the trained weights from `EpsteinGPT.pt` into the model.
+ 4. Use the model for text generation or other tasks.
+
+ For generation, you can refer to the `generate.py` script used during development (not included in this repository).
+
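+ A minimal end-to-end sketch of steps 2-4 (this assumes `EpsteinGPT.pt` is a plain `state_dict` and glues `config.json` onto `MVTConfig` by hand; the sampling loop is illustrative and is not the `generate.py` script mentioned above):
+
+ ```python
+ import json
+ import torch
+ from tokenizers import Tokenizer
+ from model import MVTConfig, MinimalGPT
+
+ # 1. Tokenizer
+ tokenizer = Tokenizer.from_file("epsteingpt_tokenizer.json")
+
+ # 2. Architecture + configuration (copy config.json values onto MVTConfig)
+ config = MVTConfig()
+ with open("config.json") as f:
+     for key, value in json.load(f).items():
+         setattr(config, key, value)
+ model = MinimalGPT(config)
+
+ # 3. Trained weights (assumes the checkpoint is a plain state_dict)
+ model.load_state_dict(torch.load("EpsteinGPT.pt", map_location="cpu"))
+ model.eval()
+
+ # 4. Simple temperature sampling ("Subject:" is just a placeholder prompt)
+ ids = torch.tensor([tokenizer.encode("Subject:").ids])
+ with torch.no_grad():
+     for _ in range(100):
+         logits, _ = model(ids[:, -config.block_size:])
+         probs = torch.softmax(logits[:, -1, :] / 0.8, dim=-1)
+         ids = torch.cat([ids, torch.multinomial(probs, num_samples=1)], dim=1)
+ print(tokenizer.decode(ids[0].tolist()))
+ ```
+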
+ ## Training
+
+ The model was trained on a dataset of Epstein email threads. The training process involved:
+
+ 1. **Tokenizer Training:** A BPE tokenizer was trained on the raw text data (a sketch follows this list).
+ 2. **Data Preparation:** The text data was tokenized and converted into token-ID tensors.
+ 3. **Model Training:** The `MinimalGPT` model was trained using a custom training loop.
+
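+ A rough sketch of step 1 using the Hugging Face `tokenizers` library (the exact trainer settings behind `epsteingpt_tokenizer.json` are assumptions, and `emails.txt` is a placeholder for the raw email corpus):
+
+ ```python
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers
+
+ # Byte-level BPE with the vocab size from config.json
+ tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+ trainer = trainers.BpeTrainer(vocab_size=5000, special_tokens=["[UNK]"])
+
+ tokenizer.train(["emails.txt"], trainer)
+ tokenizer.save("epsteingpt_tokenizer.json")
+ ```
+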
+ ## Further Information
+
+ For more details on the model architecture and training process, refer to `model.py` and the `train.py` training script used during development (the latter is not included in this repository).
config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "vocab_size": 5000,
+ "block_size": 256,
+ "n_layer": 8,
+ "n_head": 8,
+ "n_embd": 512,
+ "batch_size": 16,
+ "dropout": 0.1,
+ "bias": false
+ }
epsteingpt_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model.py ADDED
@@ -0,0 +1,123 @@
+ import math
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ # Configuration class (equivalent to GPTConfig in nanoGPT)
+ class MVTConfig:
+     vocab_size = 5000    # V: Set by custom tokenizer
+     block_size = 256     # T_ctx: Context length
+     n_layer = 8          # N_layer: Number of decoder blocks
+     n_head = 8           # N_head: Number of attention heads
+     n_embd = 512         # D_embd: Embedding dimension
+     batch_size = 16      # B: Batch size
+     dropout = 0.1
+     bias = False         # Optional bias for linear layers
+
+ # Device setup
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ # --- 1. Causal Self-Attention Mechanism ---
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.dropout = config.dropout
+         self.block_size = config.block_size
+         # Lower-triangular causal mask, shaped (1, 1, T, T) for broadcasting
+         self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
+                              .view(1, 1, config.block_size, config.block_size))
+         nn.init.normal_(self.c_proj.weight, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
+
+     def forward(self, x):
+         B, T, C = x.size()
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, n_head, T, head_dim)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))  # (B, n_head, T, T)
+         att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
+         att = F.softmax(att, dim=-1)
+         att = self.attn_dropout(att)
+         y = att @ v
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.resid_dropout(self.c_proj(y))
+         return y
+
+ # --- 2. Feed-Forward Network (MLP) ---
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+         nn.init.normal_(self.c_proj.weight, mean=0.0, std=0.02 / math.sqrt(2 * config.n_layer))
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+ # --- 3. Transformer Block ---
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+ # --- 4. The MinimalGPT Model ---
+ class MinimalGPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         # Store config parameters as instance attributes for TorchScript compatibility
+         self.vocab_size = config.vocab_size
+         self.block_size = config.block_size
+         self.n_layer = config.n_layer
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.dropout = config.dropout
+         self.bias = config.bias
+
+         self.transformer = nn.ModuleDict(dict(
+             wte=nn.Embedding(self.vocab_size, self.n_embd),
+             wpe=nn.Embedding(self.block_size, self.n_embd),
+             drop=nn.Dropout(self.dropout),
+             h=nn.ModuleList([Block(config) for _ in range(self.n_layer)]),
+             ln_f=nn.LayerNorm(self.n_embd, bias=self.bias),
+         ))
+         self.lm_head = nn.Linear(self.n_embd, self.vocab_size, bias=False)
+         # Weight tying between the token embedding and the LM head
+         self.transformer.wte.weight = self.lm_head.weight
+         print(f"Minimal GPT Model initialized: {sum(p.numel() for p in self.parameters())/1e6:.2f}M parameters")
+
+     def forward(self, idx, targets=None):
+         B, T = idx.size()
+         assert T <= self.block_size, f"Input sequence length {T} exceeds block size {self.block_size}"
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
+         tok_emb = self.transformer.wte(idx)
+         pos_emb = self.transformer.wpe(pos)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x)
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+         else:
+             # Return a dummy loss tensor if targets is None, for TorchScript compatibility
+             loss = torch.tensor(0.0, device=idx.device)
+         return logits, loss
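+
+ # Illustrative smoke test and Lite-interpreter export (a sketch, not part of the
+ # training code; the export path below is an assumption about how a file such as
+ # EpsteinGPT.ptl can be produced, and writes to a separate demo filename).
+ if __name__ == "__main__":
+     config = MVTConfig()
+     model = MinimalGPT(config).eval()
+
+     # Forward pass on a random batch: logits should be (B, T, vocab_size)
+     idx = torch.randint(0, config.vocab_size, (2, 32))
+     logits, loss = model(idx)
+     print(logits.shape)  # torch.Size([2, 32, 5000])
+
+     # Trace the model and save it for the PyTorch mobile/lite runtime.
+     from torch.utils.mobile_optimizer import optimize_for_mobile
+     traced = torch.jit.trace(model, idx)
+     optimize_for_mobile(traced)._save_for_lite_interpreter("EpsteinGPT_demo.ptl")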