MAKILINGDING committed on
Commit
a3b29ff
1 Parent(s): 0503492

Upload 5 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenized_datasets/c4_realnewslike.json filter=lfs diff=lfs merge=lfs -text
Inference.py ADDED
@@ -0,0 +1,66 @@
+ import torch
+ import Train
+ import Tokenizer
+
+
+ def load_model(model_path, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
+     # Rebuild the architecture defined in Train.py, then restore the trained weights.
+     model = Train.Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers)
+     model.load_state_dict(torch.load(model_path, map_location=Train.device))
+     model.eval()
+     return model
+
+
+ def prepare_input(input_text):
+     input_tokens = Tokenizer.tokenize_sequence(input_text)
+     # input_tokens = Tokenizer.pad_to_length(input_tokens, Train.max_sequence_length)
+     input_ids = torch.tensor(input_tokens).unsqueeze(0)  # Add batch dimension
+     return input_ids
+
+
+ # def generate_output(model, input_ids):
+ #     with torch.no_grad():
+ #         output_logits = model(input_ids)
+ #         predicted_token_ids = torch.argmax(output_logits, dim=-1)
+ #         output_text = Tokenizer.detokenize_sequence(predicted_token_ids[0].tolist())
+ #         return output_text
+
+
+ # def generate_output(model, input_ids, max_length, eos_token=Tokenizer.vocabulary.get('<EOS>')):
+ #     with torch.no_grad():  # No need to track gradients during inference
+ #         input_tensor = torch.tensor(input_ids)
+ #         output_seq = []
+ #
+ #         for i in range(50):
+ #             output = model.generate(input_tensor)
+ #             print(f'output.size(): {output.size()}')
+ #             next_token = torch.argmax(output[0, i, :], dim=-1).item()  # Take last token from sequence
+ #             output_seq.append(next_token)
+ #             if next_token == eos_token:
+ #                 break  # Stop if EOS token is generated
+ #             next_token_tensor = torch.tensor([[next_token]]).to(Train.device)  # Convert and move to device
+ #             input_tensor = torch.cat([input_tensor, next_token_tensor], dim=1)  # Concatenate
+ #         print(f'Generated tokens: {output_seq}')
+ #         AI_response = Tokenizer.detokenize_sequence(output_seq)
+ #         return AI_response
+
+
+ def generate_output(model, input_ids, max_length, eos_token=Tokenizer.vocabulary.get('<EOS>')):
+     # Single forward pass: take the argmax token at every position and decode.
+     # (max_length and eos_token are unused in this non-autoregressive variant.)
+     with torch.no_grad():
+         out = model.generate(input_ids)
+     preds = torch.argmax(out, dim=-1)
+     output_tokens = [token.item() for token in preds[0]]
+     AI_response = Tokenizer.detokenize_sequence(output_tokens)
+     return AI_response
+
+
+ # Example usage
+ model_path = 'models/my_model.pt'
+ input_text = ''
+
+ model = load_model(model_path, Train.d_model, Train.ffn_hidden, Train.num_heads, Train.drop_prob, Train.num_layers)
+ model.to(Train.device)
+ input_ids = prepare_input(input_text)
+ output_text = generate_output(model, input_ids.to(Train.device), Train.max_sequence_length)
+
+ print("Generated Output:", output_text)
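For reference, the second commented-out block above sketches token-by-token decoding. Below is a minimal runnable sketch of that idea, not part of the committed file: generate_autoregressive is a hypothetical helper that assumes Train and Tokenizer are importable as above, that model.generate returns per-position probabilities of shape (1, seq_len, vocab_size), and that '<EOS>' exists in the vocabulary.

import torch
import Train
import Tokenizer

def generate_autoregressive(model, input_ids, max_new_tokens, eos_token=Tokenizer.vocabulary.get('<EOS>')):
    # Hypothetical helper (not in the commit): append the argmax token at the last
    # position each step and stop at <EOS> or after max_new_tokens steps.
    input_tensor = input_ids.to(Train.device)
    generated = []
    with torch.no_grad():
        for _ in range(max_new_tokens):
            probs = model.generate(input_tensor)               # (1, seq_len, vocab_size)
            next_token = torch.argmax(probs[0, -1, :]).item()  # prediction at the last position
            generated.append(next_token)
            if next_token == eos_token:
                break
            next_token_tensor = torch.tensor([[next_token]], device=Train.device)
            input_tensor = torch.cat([input_tensor, next_token_tensor], dim=1)
    return Tokenizer.detokenize_sequence(generated)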
Tokenizer.py ADDED
@@ -0,0 +1,73 @@
+ import re
+ import torch
+
+ vocabulary = {}
+ token_vocabulary = {}
+ # vocabulary_length = ['<EOS>']
+
+ # Build the token -> id mapping from the vocabulary file; quoted entries are unquoted first.
+ with open('cl100k_base_vocab_list.txt', 'r', encoding='utf-8') as file:
+     for line_count, line in enumerate(file):
+         line = line.rstrip('\n')
+         if (line.startswith('\'') and line.endswith('\'')) or (line.startswith('\"') and line.endswith('\"')):
+             line = line[1:-1]
+             vocabulary[line] = line_count
+         else:
+             vocabulary[line] = line_count
+ token_vocabulary = {v: k for k, v in vocabulary.items()}  # id -> token (reverse mapping)
+
+
+ def get_vocabulary():
+     return vocabulary
+
+
+ def get_token_vocabulary():
+     return token_vocabulary
+
+
+ # def check_vocabulary_length(word):
+ #     append_length = True
+ #     for vocab in vocabulary_length:
+ #         if word == vocab:
+ #             append_length = False
+ #             break
+ #     if append_length == True:
+ #         vocabulary_length.append(word)
+ #
+ # def return_vocabulary_length():
+ #     return vocabulary_length
+
+
+ def tokenize_sequence(sentence):
+     # tokenized_seq = [vocabulary.get('<SOS>')]
+     tokenized_seq = []
+     regex = r'(\s+\w+|\S+)'
+     words = re.split(regex, sentence)
+     for word in words:
+         if word in vocabulary:
+             tokenized_seq.append(vocabulary.get(word, vocabulary.get('<UNK>')))
+         else:
+             # Greedy longest-match segmentation: try the longest remaining prefix first,
+             # fall back to <UNK> for a single character when nothing matches.
+             i = 0
+             while i < len(word):
+                 subword_len = 1
+                 for j in range(len(word), i - 1, -1):
+                     subword = word[i:j]
+                     if subword in vocabulary:
+                         tokenized_seq.append(vocabulary.get(subword, vocabulary.get('<UNK>')))
+                         subword_len = len(subword)
+                         break
+                     if j - i == 1:
+                         tokenized_seq.append(vocabulary.get('<UNK>'))
+                         break
+                 i += subword_len
+     tokenized_seq.append(vocabulary.get('<EOS>'))
+     return tokenized_seq
+
+
+ def detokenize_sequence(tokenized_seq):
+     decoded_sentence = ''
+     for token in tokenized_seq:
+         decoded_sentence += token_vocabulary[token]
+     return decoded_sentence
+
+
+ def pad_to_length(seq, length):
+     padded_seq = torch.full((length,), fill_value=0, dtype=torch.long)  # 0 is the padding id
+     padded_seq[:len(seq)] = torch.tensor(seq, dtype=torch.long)
+     return padded_seq
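A quick usage sketch of this module (not part of the commit), assuming '<EOS>' and '<UNK>' actually appear in cl100k_base_vocab_list.txt so the .get() lookups return real ids:

import Tokenizer

ids = Tokenizer.tokenize_sequence('Hello world')  # greedy longest-match subwords, ends with <EOS>
text = Tokenizer.detokenize_sequence(ids)         # concatenates the matched subwords back together
padded = Tokenizer.pad_to_length(ids, 256)        # right-pads with id 0 up to a fixed length
print(ids)
print(text)
print(padded.shape)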
Train.py ADDED
@@ -0,0 +1,290 @@
+ import torch
+ import math
+ from torch import nn
+ import torch.nn.functional as F
+ import Tokenizer
+ from datasets import load_dataset
+ import time
+ import json
+ from transformers import get_scheduler
+ from sklearn.model_selection import train_test_split
+ from torch.nn.utils.rnn import pad_sequence
+
+
+ ### TOKENIZER ##########################################################################################################
+ vocabulary = Tokenizer.get_vocabulary()
+ token_vocabulary = Tokenizer.get_token_vocabulary()
+
+
+ ### TRANSFORMER ########################################################################################################
+ d_model = 384
+ num_heads = 6
+ drop_prob = 0.1
+ batch_size = 38  # batch_size must be divisible by num_heads / len(train_input) must be divisible by batch_size
+ max_sequence_length = 256
+ ffn_hidden = d_model * 4
+ num_layers = 6
+ save_path = 'models/my_model.pt'
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+
+ def scaled_dot_product(q, k, v, mask=None):
+     # Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k) + mask) V
+     d_k = q.size()[-1]
+     scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
+     if mask is not None:
+         scaled += mask.to(device)
+     attention = F.softmax(scaled, dim=-1)
+     values = torch.matmul(attention, v)
+     return values, attention
+
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, d_model, num_heads):
+         super().__init__()
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.head_dim = d_model // num_heads
+         self.qkv_layer = nn.Linear(d_model, 3 * d_model)
+         self.linear_layer = nn.Linear(d_model, d_model)
+
+     def forward(self, x, mask=None):
+         batch_size, max_sequence_length, d_model = x.size()
+         qkv = self.qkv_layer(x)
+         # Split the joint QKV projection into heads: (batch, heads, seq_len, 3 * head_dim)
+         qkv = qkv.reshape(batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim)
+         qkv = qkv.permute(0, 2, 1, 3)
+         q, k, v = qkv.chunk(3, dim=-1)
+         values, attention = scaled_dot_product(q, k, v, mask)
+         values = values.reshape(batch_size, max_sequence_length, self.num_heads * self.head_dim)
+         out = self.linear_layer(values)
+         return out
+
+
+ class LayerNormalization(nn.Module):
+     def __init__(self, parameters_shape, eps=1e-5):
+         super().__init__()
+         self.parameters_shape = parameters_shape
+         self.eps = eps
+         self.gamma = nn.Parameter(torch.ones(parameters_shape))
+         self.beta = nn.Parameter(torch.zeros(parameters_shape))
+
+     def forward(self, inputs):
+         dims = [-(i + 1) for i in range(len(self.parameters_shape))]
+         mean = inputs.mean(dim=dims, keepdim=True)
+         var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
+         std = (var + self.eps).sqrt()
+         y = (inputs - mean) / std
+         out = self.gamma * y + self.beta
+         return out
+
+
+ class PositionwiseFeedForward(nn.Module):
+     def __init__(self, d_model, hidden, drop_prob=0.1):
+         super(PositionwiseFeedForward, self).__init__()
+         self.linear1 = nn.Linear(d_model, hidden)
+         self.linear2 = nn.Linear(hidden, d_model)
+         self.dropout = nn.Dropout(p=drop_prob)
+
+     def forward(self, x):
+         x = self.linear1(x)
+         x = F.gelu(x)
+         x = self.dropout(x)
+         x = self.linear2(x)
+         return x
+
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model):
+         super().__init__()
+         self.d_model = d_model
+
+     def forward(self, sequence_length):
+         # Sinusoidal encodings: sin for even dimensions, cos for odd, interleaved.
+         even_i = torch.arange(0, self.d_model, 2).float()
+         denominator = torch.pow(10000, even_i / self.d_model)
+         position = torch.arange(sequence_length).reshape(sequence_length, 1)
+         even_PE = torch.sin(position / denominator)
+         odd_PE = torch.cos(position / denominator)
+         stacked = torch.stack([even_PE, odd_PE], dim=2)
+         PE = torch.flatten(stacked, start_dim=1, end_dim=2)
+         return PE
+
+
+ class TransformerLayer(nn.Module):
+     def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
+         super(TransformerLayer, self).__init__()
+         self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
+         self.norm1 = LayerNormalization(parameters_shape=[d_model])
+         self.dropout1 = nn.Dropout(p=drop_prob)
+         self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
+         self.norm2 = LayerNormalization(parameters_shape=[d_model])
+         self.dropout2 = nn.Dropout(p=drop_prob)
+
+     def forward(self, x, original_inputs):
+         # Find the longest non-padded length in the batch (padding id is 0).
+         input_pad_mask = (original_inputs != 0)
+         index = torch.argmax(input_pad_mask.sum(dim=1))
+         max_length = 0
+         for element in original_inputs[index]:
+             if element != 0:
+                 max_length += 1
+             else:
+                 break
+         # Causal (lower-triangular) mask; positions beyond that length are additionally
+         # blocked from attending to each other.
+         seq_len = x.size()[1]
+         causal_mask = torch.tril(torch.ones(seq_len, seq_len))
+         mask = torch.where(causal_mask == 0, torch.tensor(float('-inf')), causal_mask)
+         mask[mask == 1] = 0
+         mask[max_length:, max_length:] = float('-inf')
+
+         residual_x = x
+         x = self.attention(x, mask=mask)
+         # x = self.dropout1(x)
+         x = self.norm1(x + residual_x)
+         residual_x = x
+         x = self.ffn(x)
+         # x = self.dropout2(x)
+         x = self.norm2(x + residual_x)
+         return x
+
+
+ class SequentialTransformer(nn.Sequential):
+     def forward(self, *inputs):
+         x, original_inputs = inputs
+         # Chain the layers so each one consumes the previous layer's output.
+         for module in self._modules.values():
+             x = module(x, original_inputs)
+         return x
+
+
+ class Transformer(nn.Module):
+     def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
+         super().__init__()
+         self.d_model = d_model
+         self.token_embedding = nn.Embedding(len(vocabulary), d_model)
+         # self.token_embedding = nn.Embedding(len(true_vocabulary), d_model)
+         self.positional_encoding = PositionalEncoding(d_model)
+         self.layers = SequentialTransformer(*[TransformerLayer(d_model, ffn_hidden, num_heads, drop_prob)
+                                               for _ in range(num_layers)])
+         self.output_layers = nn.Linear(d_model, len(vocabulary))
+         # self.output_layers = nn.Linear(d_model, len(true_vocabulary))
+
+     def forward(self, x, targets):
+         original_inputs = x
+         # Scaled token embeddings plus sinusoidal positional encodings.
+         token_embeddings = self.token_embedding(x) * math.sqrt(self.d_model)
+         pos_encoding = self.positional_encoding(x.size()[1]).to(device).unsqueeze(0).repeat(x.size(0), 1, 1)
+         x = token_embeddings + pos_encoding
+         x = self.layers(x, original_inputs)
+         logits = self.output_layers(x)
+         loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+         return logits, loss
+
+     def generate(self, x):
+         original_inputs = x
+         token_embeddings = self.token_embedding(x) * math.sqrt(self.d_model)
+         pos_encoding = self.positional_encoding(x.size()[1]).to(device).unsqueeze(0).repeat(x.size(0), 1, 1)
+         x = token_embeddings + pos_encoding
+         x = self.layers(x, original_inputs)
+         x = self.output_layers(x)
+         return F.softmax(x, dim=-1)
+
+
+ ### DATA PREPROCESSING #################################################################################################
+ print('Data Preprocessing...')
+ start_time = time.time()
+
+ def save_tokenized_data(name, tokenized_dataset):
+     with open(name, 'w') as file:
+         json.dump(tokenized_dataset, file)
+
+ def load_tokenized_data(name):
+     with open(f'tokenized_datasets/{name}', 'r') as file:
+         loaded_tokenized_data = json.load(file)
+     return loaded_tokenized_data
+
+ # raw_dataset = load_dataset('c4', 'realnewslike') #***********************************#
+ # raw_dataset = raw_dataset['train'].select(range(round(len(raw_dataset['train']) / 1000)))
+ # raw_dataset = [Tokenizer.tokenize_sequence(raw_dataset['text'][i]) for i in range(len(raw_dataset['text']))]
+ # save_tokenized_data('tokenized_datasets/c4_realnewslike.json', raw_dataset)
+
+ # Flatten the pre-tokenized documents into one long token stream.
+ raw_dataset = load_tokenized_data('c4_realnewslike.json')  #***********************************#
+ token_dataset = []
+ for i in range(len(raw_dataset)):
+     for j in range(len(raw_dataset[i])):
+         token_dataset.append(raw_dataset[i][j])
+ token_dataset = token_dataset[:round(max_sequence_length * math.floor(len(token_dataset) / max_sequence_length))]
+ # Each training pair is two consecutive max_sequence_length windows:
+ # the first window is the input, the window that follows it is the target.
+ train_input = [[] for i in range(math.floor(len(token_dataset) / (max_sequence_length * 2)))]
+ train_output = [[] for i in range(math.floor(len(token_dataset) / (max_sequence_length * 2)))]
+ for i in range(0, len(token_dataset) - max_sequence_length, max_sequence_length * 2):
+     for j in range(max_sequence_length):
+         train_input[round(i / (max_sequence_length * 2))].append(token_dataset[i + j])
+         train_output[round(i / (max_sequence_length * 2))].append(token_dataset[i + j + max_sequence_length])
+ print(f'len(train_input) = {len(train_input)}')
+
+ # raw_train_dataset, raw_eval_dataset = train_test_split(raw_dataset['train'].select(range(round(len(raw_dataset['train']) / 25))), test_size=0.2)
+ train_input = [seq[:max_sequence_length] if len(seq) > max_sequence_length else seq for seq in train_input]
+ train_output = [seq[:max_sequence_length] if len(seq) > max_sequence_length else seq for seq in train_output]
+ train_input = [torch.tensor(seq, dtype=torch.long) for seq in train_input]
+ train_output = [torch.tensor(seq, dtype=torch.long) for seq in train_output]
+ # train_input = [Tokenizer.pad_to_length(seq, max_sequence_length) for seq in train_input]
+ # train_output = [Tokenizer.pad_to_length(seq, max_sequence_length) for seq in train_output]
+ train_dataset = [(train_input[i], train_output[i]) for i in range(len(train_input))]
+ # train_dataset = [pad_sequence(train_dataset[i], batch_first=True, padding_value=0) for i in range(len(train_dataset))]
+ # Group the (input, target) pairs into batches of batch_size sequences each.
+ train_batch = [[[] for i in range(round(len(train_dataset) / batch_size))] for j in range(2)]
+ train_batch_count = 0
+ for i in range(0, len(train_dataset), batch_size):
+     for j in range(batch_size):
+         train_batch[0][train_batch_count].append(train_dataset[i + j][0])
+         train_batch[1][train_batch_count].append(train_dataset[i + j][1])
+     train_batch_count += 1
+
+
+ ### TRAINING ###########################################################################################################
+ print('Training...')
+ model = Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers)
+ print(f'model parameters: {sum(p.numel() for p in model.parameters())}')
+ model.to(device)
+ epochs = 5
+ optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)
+
+ num_training_steps = epochs * len(train_dataset)
+ lr_scheduler = get_scheduler(
+     name='linear',
+     optimizer=optimizer,
+     num_warmup_steps=0,
+     num_training_steps=num_training_steps
+ )
+ train_epoch_average_loss = []
+ train_loss_total = 0
+ for epoch in range(epochs):
+     model.train()
+     train_loss = 0
+     for i in range(len(train_batch[0])):
+         inputs = torch.stack(train_batch[0][i]).to(device)
+         labels = torch.stack(train_batch[1][i]).to(device)
+         logits, loss = model(inputs, labels)
+         optimizer.zero_grad()
+         loss.backward()
+         optimizer.step()
+         lr_scheduler.step()
+         train_loss_total += loss.item()  # .item() keeps a float so the graph is not retained across batches
+         if i % 10 == 0:
+             print('TRAINING...')
+             print(f'EPOCH {epoch}, batch {i}/{len(train_batch[0])}')
+             print(f'loss: {loss}')
+     train_epoch_average_loss.append(train_loss_total / len(train_batch[0]))
+     train_loss_total = 0
+     # model.eval()
+     # eval_loss = 0
+     # with torch.no_grad():
+     #     for i, batch in enumerate(eval_dataset):
+     #         inputs = batch[0].unsqueeze(0).to(device)
+     #         labels = batch[1].unsqueeze(0).to(device)
+     #         logits, loss = model(inputs, labels)
+     #         if i % 10 == 0:
+     #             print('EVALUATING...')
+     #             print(f'EPOCH {epoch}, batch {i}/{len(eval_dataset)}')
+     #             print(f'loss: {loss}')
+ for i in range(len(train_epoch_average_loss)):
+     print(f'EPOCH {i} AVERAGE LOSS: {train_epoch_average_loss[i]}')
+ torch.save(model.state_dict(), save_path)
+
+
+ end_time = time.time()
+ total_time = end_time - start_time
+ print(f'{total_time} seconds')
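As a side note on the masking logic in TransformerLayer.forward above, the following standalone snippet (not part of the commit) reproduces the mask for a hypothetical batch whose longest non-padded sequence fills 3 of 5 positions, which makes it easy to inspect what each position is allowed to attend to.

import torch

seq_len, max_length = 5, 3  # hypothetical values: 5 positions, 3 real tokens
causal_mask = torch.tril(torch.ones(seq_len, seq_len))
mask = torch.where(causal_mask == 0, torch.tensor(float('-inf')), causal_mask)
mask[mask == 1] = 0
mask[max_length:, max_length:] = float('-inf')
print(mask)
# 0 entries are attendable, -inf entries are blocked: each row i can attend to
# positions <= i, and padded positions (index >= 3) cannot attend to one another.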
cl100k_base_vocab_list.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenized_datasets/c4_realnewslike.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6fcbb4ebe2e58cf94a59ab469390d801b808a68ae7564ff1d98fe537997e4ab
+ size 43787042