Wonder-Griffin committed on
Commit 78f739d · verified · 1 Parent(s): 011915b

End of training

Files changed (6)
  1. README.md +48 -0
  2. config.json +102 -0
  3. model.safetensors +3 -0
  4. pyJudgeXL_model.py +122 -0
  5. tokenizer1.pickle +3 -0
  6. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,48 @@
+ ---
+ library_name: transformers
+ base_model: Wonder-Griffin/JudgeLLM2
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: The_Judge
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # The_Judge
+
+ This model is a fine-tuned version of [Wonder-Griffin/JudgeLLM2](https://huggingface.co/Wonder-Griffin/JudgeLLM2) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 3.0
+
+ ### Framework versions
+
+ - Transformers 4.45.0.dev0
+ - Pytorch 2.4.0+cu124
+ - Datasets 2.20.0
+ - Tokenizers 0.19.1
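The card stops at the framework versions and does not show a loading snippet. A minimal sketch is below; the repo id Wonder-Griffin/The_Judge is an assumption taken from the model-index name, and AutoModel/AutoTokenizer are used generically because the card does not state a downstream pipeline.

```python
# Hedged sketch: pull the fine-tuned checkpoint with transformers.
# "Wonder-Griffin/The_Judge" is an assumed repo id (from the model-index name);
# substitute the repository this commit actually belongs to.
from transformers import AutoModel, AutoTokenizer

repo_id = "Wonder-Griffin/The_Judge"  # assumption, not stated in the card
model = AutoModel.from_pretrained(repo_id)  # config.json declares a GPT2Model architecture
tokenizer = AutoTokenizer.from_pretrained("Wonder-Griffin/JudgeLLM2")  # base model's tokenizer; this commit only ships tokenizer1.pickle
```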
config.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "_name_": "Judge-GPT2",
+   "_name_or_path": "Wonder-Griffin/JudgeLLM2",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2Model"
+   ],
+   "attn_pdrop": 0.1,
+   "batch_size": 32,
+   "bias": true,
+   "block_size": 512,
+   "bos_token_id": 50256,
+   "dim_feedforward": 3072,
+   "dropout": 0.1,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "ff_expansion_factor": 4,
+   "hidden_act": "gelu",
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2",
+     "3": "LABEL_3",
+     "4": "LABEL_4"
+   },
+   "inference_mode": true,
+   "initializer_range": 0.02,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2,
+     "LABEL_3": 3,
+     "LABEL_4": 4
+   },
+   "label_smoothing": 0.1,
+   "layer_norm_epsilon": 1e-05,
+   "learning_rate": 0.0003,
+   "log_interval": 100,
+   "max_grad_norm": 1.0,
+   "model_type": "gpt2",
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 512,
+   "output_dir": "C:/Users/wonde/output",
+   "pretrained_weights": "Wonder-Griffin/JudgeLLM2",
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_heads": {
+     "classifier_head": {
+       "params": {
+         "num_labels": 5
+       },
+       "type": "JudgeClassifier"
+     },
+     "lm_head": {
+       "params": {
+         "vocab_size": 50257
+       },
+       "type": "JudgeCasualLMHead"
+     },
+     "qa_head": {
+       "params": {
+         "num_labels": 2
+       },
+       "type": "JudgeWithQA"
+     }
+   },
+   "task_specific_params": {
+     "question-answering": {
+       "max_answer_length": 100
+     },
+     "sequence-classification": {
+       "eval_steps": 500
+     },
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 100
+     }
+   },
+   "tokenizer": {
+     "params": {
+       "vocab_size": 50257
+     },
+     "type": "AutoTokenizer"
+   },
+   "torch_dtype": "float32",
+   "total_steps": 10000,
+   "transformers_version": "4.45.0.dev0",
+   "use_cache": true,
+   "vocab_size": 30522,
+   "warmup_steps": 1000,
+   "weight_decay": 0.01
+ }
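The config mixes standard GPT-2 fields (model_type, n_embd, n_head, n_layer) with custom training and head fields (batch_size, learning_rate, task_heads) that transformers stores but does not act on. A small sketch of reading those custom entries back, assuming the same repo id as above:

```python
# Hedged sketch: PretrainedConfig keeps unknown keys such as "task_heads",
# so the custom head definitions can be inspected after loading.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Wonder-Griffin/The_Judge")  # assumed repo id
print(cfg.model_type, cfg.n_embd, cfg.n_head, cfg.n_layer)  # gpt2 768 12 12
for name, head in cfg.task_heads.items():  # custom key carried through verbatim
    print(name, head["type"], head["params"])
```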
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7611a89d0d92c222df86ba901d724f356efb57ae7f96a425528464f3c3a410e
+ size 435573648
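What is committed here is a Git LFS pointer (version line, sha256 oid, byte size), not the weights themselves; the real file is roughly 435 MB. Below is a sketch of loading the downloaded weights with the safetensors package, an assumption about how you would inspect them rather than something this commit documents.

```python
# Hedged sketch: read the downloaded model.safetensors (not the LFS pointer)
# into a plain name -> tensor dict with the safetensors library.
from safetensors.torch import load_file

state_dict = load_file("model.safetensors")
total_params = sum(t.numel() for t in state_dict.values())
print(len(state_dict), "tensors,", total_params, "parameters")
```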
pyJudgeXL_model.py ADDED
@@ -0,0 +1,122 @@
+ # Imports required by the code below (torch, torch.nn, numpy)
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+ # Configuration
+ config = {
+     "learning_rate": 1e-4,
+     "batch_size": 32,
+     "vocab_size": 30522,
+     "max_len": 256,
+     "hidden_size": 768,
+     "dropout": 0.1,
+     "n_layer": 12,
+     "n_head": 12,
+     "ff_expansion_factor": 4,
+     "rnn_units": 768,
+     "num_labels": 5
+ }
+
+ class MyClass:
+     def __init__(self, value):
+         self.value = value
+
+ # Custom Initializer
+ def custom_initializer(shape):
+     return torch.normal(mean=0.0, std=0.02, size=shape)
+
+ class CustomEmbedding(nn.Module):
+     def __init__(self, vocab_size, hidden_size):
+         super(CustomEmbedding, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, hidden_size, _weight=custom_initializer((vocab_size, hidden_size)))
+
+     def forward(self, inputs):
+         return self.embedding(inputs)
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, n_embd, max_len=5000):
+         super(PositionalEncoding, self).__init__()
+         self.n_embd = n_embd
+         self.max_len = max_len
+
+         # Precompute the sinusoidal position table and register it as a buffer
+         pe = torch.zeros(max_len, n_embd)
+         position = torch.arange(0, max_len).unsqueeze(1).float()
+         div_term = torch.exp(torch.arange(0, n_embd, 2).float() * -(np.log(10000.0) / n_embd))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         pe = pe.unsqueeze(0).transpose(0, 1)
+         self.register_buffer('pe', pe)
+
+     def forward(self, x):
+         return x + self.pe[:x.size(0), :]
+
+ class MultiheadAttention(nn.Module):
+     def __init__(self, config):
+         super(MultiheadAttention, self).__init__()
+         self.attention = nn.MultiheadAttention(config['hidden_size'], config['n_head'], dropout=config['dropout'])
+
+     def forward(self, v, k, q, mask=None):
+         attn_output, attn_output_weights = self.attention(q, k, v, attn_mask=mask)
+         return attn_output
+
+ class FeedForward(nn.Module):
+     def __init__(self, config):
+         super(FeedForward, self).__init__()
+         self.dense1 = nn.Linear(config['hidden_size'], config['hidden_size'] * config['ff_expansion_factor'])
+         self.dense2 = nn.Linear(config['hidden_size'] * config['ff_expansion_factor'], config['hidden_size'])
+         self.dropout = nn.Dropout(config['dropout'])
+
+     def forward(self, x):
+         x = torch.nn.functional.gelu(self.dense1(x))
+         x = self.dropout(x)
+         return self.dense2(x)
+
+ class TransformerXLBlock(nn.Module):
+     def __init__(self, config):
+         super(TransformerXLBlock, self).__init__()
+         self.attn = MultiheadAttention(config)
+         self.ff = FeedForward(config)
+         self.ln1 = nn.LayerNorm(config['hidden_size'])
+         self.ln2 = nn.LayerNorm(config['hidden_size'])
+
+     def forward(self, x, mask=None):
+         # Post-norm residual block: self-attention, then feed-forward
+         attn_out = self.attn(v=x, k=x, q=x, mask=mask)
+         out1 = self.ln1(x + attn_out)
+         ff_out = self.ff(out1)
+         return self.ln2(out1 + ff_out)
+
+ class JudgeXL(nn.Module):
+     def __init__(self, config):
+         super(JudgeXL, self).__init__()
+         self.token_embedding = CustomEmbedding(config['vocab_size'], config['hidden_size'])
+         self.pos_encoding = PositionalEncoding(config['hidden_size'], config['max_len'])
+         self.transformer_blocks = nn.ModuleList([TransformerXLBlock(config) for _ in range(config['n_layer'])])
+         self.ln_f = nn.LayerNorm(config['hidden_size'])
+         self.rnn = nn.LSTM(config['hidden_size'], config['rnn_units'], num_layers=2, dropout=config['dropout'], bidirectional=True, batch_first=True)
+         self.fc = nn.Linear(config['rnn_units'] * 2, config['vocab_size'])  # rnn_units * 2 because the LSTM is bidirectional
+
+     def forward(self, x, mask=None):
+         x = self.token_embedding(x)
+         x = self.pos_encoding(x)
+         for block in self.transformer_blocks:
+             x = block(x, mask=mask)
+         x = self.ln_f(x)
+         x, _ = self.rnn(x)
+         x = self.fc(x)
+         return x
+
+     def generate(self, prompt, max_len=100):
+         # Greedy decoding; assumes a tokenizer has been attached as self.tokenizer
+         self.eval()
+         input_ids = self.tokenizer(prompt, return_tensors='pt').input_ids.to(device)
+         generated = input_ids
+         with torch.no_grad():
+             for _ in range(max_len):
+                 outputs = self.forward(generated)
+                 next_token_logits = outputs[:, -1, :]  # logits for the last position only
+                 next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
+                 generated = torch.cat((generated, next_token_id), dim=1)
+                 if next_token_id.item() == self.tokenizer.sep_token_id:
+                     break
+         generated_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
+         return generated_text
+
+ # Load the last saved model (torch.load of a full module replaces the fresh instance above)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = JudgeXL(config)
+ model = torch.load('C:/AIstuffing/Judge_XL-LLM/xl-llm_weights/judgeXL-LLm_wiki.pth', weights_only=False)
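The hard-coded checkpoint path above is machine-specific, so it will not load outside the author's environment. As a quick smoke test under that assumption, the sketch below instantiates JudgeXL with the config dict from the top of the file (random weights) and runs a batch of random token ids through a forward pass; it checks shapes, not output quality.

```python
# Hedged smoke test: fresh JudgeXL with random weights and random token ids,
# reusing the `config` dict and classes defined in pyJudgeXL_model.py above.
import torch

model = JudgeXL(config)
model.eval()

dummy_ids = torch.randint(0, config["vocab_size"], (2, 16))  # (batch, seq_len)
with torch.no_grad():
    logits = model(dummy_ids)

print(logits.shape)  # expected: torch.Size([2, 16, 30522]), per-token vocabulary logits
```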
tokenizer1.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a4bfa2daf9cb9275703fcadd2e7953704653c2a206b1ea0852fad26a5e76c80
+ size 82362540
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca93e44304ed6ec37809ad1da1d61576ecf6389b60e134a029f36fbbbf24ebec
+ size 5176
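training_args.bin is likewise an LFS pointer; by Trainer convention the underlying ~5 KB file is a torch-serialized TrainingArguments object. A hedged sketch of inspecting it after download (assumes transformers is installed so the pickled class can be resolved):

```python
# Hedged sketch: inspect the serialized TrainingArguments from training_args.bin.
# weights_only=False is needed because this is a pickled Python object, not tensors.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.num_train_epochs)
```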