abhaskumarsinha committed
Commit
ceed47a
1 Parent(s): 841a1f4

Added the alpha version of Corpus2GPT

inference/__pycache__/inference.cpython-310.pyc ADDED
Binary file (3.13 kB).
 
inference/inference.py ADDED
@@ -0,0 +1,99 @@
+ import keras
+ import numpy as np
+ from .sampling_strategies.sample_random import *
+
+ class Generative_inference:
+     """
+     This class facilitates text generation by utilizing a provided Keras model,
+     tokenizer, and search strategy. It allows for the generation of text based
+     on an initial prompt.
+
+     Example:
+     ```
+     >>> inference = Generative_inference(model=model,
+     >>>                                  tokenizer=tokenizer,
+     >>>                                  search_strategy=random_sampling_strategy)
+     >>> inference.generate("Hello World")
+     ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ Hello WorldAr things sayingWhen ruby...
+     ```
+     """
+     def __init__(self,
+                  model,
+                  tokenizer,
+                  search_strategy=random_sampling_strategy,
+                  prompt="Hello World",
+                  input_len=64,
+                  padding_token=0,
+                  **kwargs
+                  ):
+         """
+         Constructor for the Generative_inference class.
+
+         Args:
+             model: A Keras model used for text generation.
+             tokenizer: Tokenizer used to encode and decode text.
+             search_strategy: Strategy used for sampling tokens during generation. Default is `random_sampling_strategy`.
+             prompt (str): The initial prompt for text generation. Default is "Hello World".
+             input_len (int): Length of the input tokens. Default is 64.
+             padding_token (int): Token used for padding. Default is 0.
+             **kwargs: Additional keyword arguments forwarded to the search_strategy on every step.
+         """
+         self.search_strategy = search_strategy
+         self.kwargs = kwargs
+         self.model = model
+         self.tokenizer = tokenizer
+         self.prompt = prompt
+         self.padding_token = padding_token
+         self.input_len = input_len
+
+     def generate(self,
+                  prompt=None,
+                  generate_limit=50,
+                  **kwargs):
+         """
+         Generate text based on the provided prompt.
+
+         Args:
+             prompt (str): The prompt for text generation. If not provided, uses the default prompt.
+             generate_limit (int): Maximum number of tokens to generate. Default is 50.
+             **kwargs: Additional keyword arguments passed to the search_strategy.
+
+         Returns:
+             str: Generated text.
+         """
+         if prompt is None:
+             prompt = self.prompt
+
+         prompt_tokens = self.tokenizer.tokenizer.encode_as_ids(prompt)
+
+         # Clip or left-pad the prompt so it matches the model's context width.
+         if len(prompt_tokens) > self.input_len:
+             prompt_tokens = prompt_tokens[:self.input_len]
+         elif len(prompt_tokens) < self.input_len:
+             prompt_tokens = [self.padding_token] * (self.input_len - len(prompt_tokens)) + prompt_tokens
+
+         model_input = keras.ops.convert_to_tensor(prompt_tokens)
+         model_input = keras.ops.reshape(model_input, (1, self.input_len))
+
+         gen_len = 0
+         while gen_len < generate_limit:
+
+             gen_len += 1
+
+             # Sample the next token and slide the context window one step forward.
+             model_output = self.model(model_input)
+             output_token = self.search_strategy(outputs=model_output, pos_num=-1, **{**self.kwargs, **kwargs})
+             model_input = keras.ops.convert_to_numpy(model_input)
+             model_input = np.concatenate((model_input, [[output_token]]), -1)
+             model_input = model_input[:, 1:]
+
+         model_input = keras.ops.reshape(model_input, (self.input_len,))
+         model_input = keras.ops.convert_to_numpy(model_input)
+
+         model_output = self.tokenizer.tokenizer.decode_ids(model_input.tolist())
+
+         return model_output
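A usage sketch (not part of the commit) for `Generative_inference`: it assumes the repository root is on the Python path, that a SentencePiece model trained with `SPM_Tokenizer` already exists at `./tokenizer_.model`, and that the GPT weights have been trained; all hyper-parameters below are arbitrary, and with untrained weights the output is noise.

```
from models.GPT import build_GPT
from tokenizer.tokenizer import SPM_Tokenizer
from inference.inference import Generative_inference
from inference.sampling_strategies.sample_random import random_sampling_strategy

# Hypothetical artifacts: a previously trained tokenizer and matching hyper-parameters.
tokenizer = SPM_Tokenizer(vocab_model_file='./tokenizer_.model', input_size=65)
GPT, _ = build_GPT(input_len=64, vocab_size=5000, embed_dim=128,
                   num_decoders=2, dropout_rate=0.1, num_heads=4,
                   head_dims=32, fc_dim_factor=4)
# GPT.load_weights(...) would go here once a trained checkpoint exists.

inference = Generative_inference(model=GPT, tokenizer=tokenizer,
                                 search_strategy=random_sampling_strategy,
                                 input_len=64, k_value=3)
print(inference.generate("Once upon a time", generate_limit=20))
```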
inference/sampling_strategies/__pycache__/sample_random.cpython-310.pyc ADDED
Binary file (601 Bytes).
 
inference/sampling_strategies/sample_random.py ADDED
@@ -0,0 +1,16 @@
+ import keras
+ import numpy as np
+
+ def random_sampling_strategy(outputs, pos_num=0, k_value=3):
+     """
+     Sample a token id from the top-`k_value` logits at position `pos_num`.
+     """
+     # Accept either (batch, seq_len, vocab) or (seq_len, vocab) logits.
+     if len(keras.ops.shape(outputs)) == 3:
+         outputs = outputs[0][pos_num]
+     else:
+         outputs = outputs[pos_num]
+
+     # Keep the k most likely tokens, renormalize them with a softmax, and sample one.
+     values, indices = keras.ops.top_k(outputs, k=k_value)
+     values = keras.ops.softmax(values, -1)
+     values = keras.ops.convert_to_numpy(values)
+     indices = keras.ops.convert_to_numpy(indices)
+
+     return np.random.choice(indices, p=values)
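A minimal, self-contained sketch of the sampling helper on dummy logits; the batch shape and the vocabulary size of 10 are made up.

```
import numpy as np
from inference.sampling_strategies.sample_random import random_sampling_strategy

# Fake logits: batch of 1, 4 positions, vocabulary of 10 tokens.
logits = np.random.randn(1, 4, 10).astype("float32")

# Sample a token id for the last position from the top-3 candidates.
token_id = random_sampling_strategy(logits, pos_num=-1, k_value=3)
print(token_id)  # an integer in [0, 10)
```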
inference/scale_utils.py ADDED
@@ -0,0 +1,150 @@
+ import numpy as np
+ from models.GPT import build_GPT
+
+ # Utils to work with estimation functions
+
+ def normalize_list(numbers):
+     """
+     Normalizes a list of numbers to the range [0, 1].
+
+     Args:
+         numbers (list of numeric): List of numbers to be normalized.
+
+     Returns:
+         list of float: Normalized list of numbers.
+     """
+     min_val = min(numbers)
+     max_val = max(numbers)
+     normalized = [(x - min_val) / (max_val - min_val) for x in numbers]
+     return normalized
+
+
+ def estimate_optimal_ratios_from_models(model_configs,
+                                         train_seq_len,
+                                         x_train,
+                                         y_train,
+                                         max_epochs,
+                                         batch_size):
+     """
+     Estimate the optimal ratios of model size and number of training tokens from FLOP counts.
+
+     Args:
+         model_configs (list): List of tuples representing model configurations.
+             Each tuple holds the positional arguments for `build_GPT`.
+         train_seq_len (list): List of integers representing different numbers of training sequences.
+         x_train (numpy array): Input data for training.
+         y_train (numpy array): Target data for training.
+         max_epochs (int): Maximum number of epochs for training.
+         batch_size (int): Batch size for training.
+
+     Returns:
+         flops (numpy array): Array of FLOP counts for each experiment.
+         loss_history (numpy array): Array of loss histories for each experiment.
+         model_params (numpy array): Array of total model parameters for each experiment.
+     """
+
+     total_models = len(model_configs)
+     total_seq_len = len(train_seq_len)
+
+     print('Total Number of Experiments: ' + str(total_models * total_seq_len))
+
+     experiment_number = 0
+     _flops = []
+     _loss_history = []
+     _model_params = []
+     for model_config in model_configs:
+         for seq_len in train_seq_len:
+             experiment_number += 1
+             print('Train Number: ' + str(experiment_number))
+
+             # Build the model and calculate FLOPs
+             GPT, flops = build_GPT(*model_config)
+
+             # Train the model
+             history = GPT.fit(x_train[:seq_len], y_train[:seq_len], batch_size=batch_size, epochs=max_epochs)
+
+             # Count model parameters
+             model_params = GPT.count_params()
+
+             # Extract loss history
+             loss_history = history.history['loss']
+
+             # Store results
+             _flops.append(flops * seq_len * max_epochs)
+             _loss_history.append(loss_history)
+             _model_params.append(model_params)
+
+     return (np.array(_flops), np.array(_loss_history), np.array(_model_params))
+
+
+ def estimate_optimal_ratios_from_flops(flop_list,
+                                        input_len,
+                                        num_heads,
+                                        head_dims,
+                                        num_decoders,
+                                        fc_dim_factor,
+                                        vocab_size,
+                                        dropout_rate,
+                                        x_train,
+                                        y_train,
+                                        trials_per_flop=2,
+                                        batch_size=32):
+     """
+     Estimates optimal ratios of various model parameters based on FLOP count.
+
+     Args:
+         flop_list (list): List of FLOP counts to estimate optimal ratios for.
+         input_len (int): Length of the input sequence.
+         num_heads (tuple): Tuple containing the minimum and maximum values for the number of attention heads.
+         head_dims (tuple): Tuple containing the minimum and maximum values for the dimensionality of attention heads.
+         num_decoders (int): Maximum number of decoder layers.
+         fc_dim_factor (int): Maximum factor used to determine the dimensionality of the fully connected layers.
+         vocab_size (int): Size of the vocabulary.
+         dropout_rate (float): Dropout rate.
+         x_train (numpy.ndarray): Training input data.
+         y_train (numpy.ndarray): Training target data.
+         trials_per_flop (int, optional): Number of trials per FLOP count. Defaults to 2.
+         batch_size (int, optional): Batch size for training. Defaults to 32.
+
+     Returns:
+         tuple: Tuple containing loss history, FLOP history, and number of parameters for each trial.
+     """
+
+     loss_history = []
+     flop_history = []
+     parameters = []
+
+     for flop in flop_list:
+         for _ in range(trials_per_flop):
+             # Draw a random architecture within the given bounds.
+             f_num_heads = np.random.randint(num_heads[0], num_heads[1])
+             f_head_dims = np.random.randint(head_dims[0], head_dims[1])
+             f_embed_dim = f_num_heads * f_head_dims
+             f_num_decoders = np.random.randint(1, num_decoders)
+             f_fc_dim_factor = np.random.randint(1, fc_dim_factor)
+
+             args = (input_len,
+                     vocab_size,
+                     f_embed_dim,
+                     f_num_decoders,
+                     dropout_rate,
+                     f_num_heads,
+                     f_head_dims,
+                     f_fc_dim_factor
+                     )
+
+             GPT, flop_per_inference = build_GPT(*args)
+             print(GPT.summary())
+
+             # Spend the FLOP budget on as many epochs as it allows.
+             epochs = flop // flop_per_inference
+             if epochs <= 0:
+                 raise Exception('The provided FLOP count is too small: ' + str(flop))
+
+             history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
+
+             loss_history.append(history.history['loss'])
+             flop_history.append(flop * batch_size * epochs)
+             parameters.append(GPT.count_params())
+
+     return loss_history, flop_history, parameters
+
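A sketch (not from the commit) of a toy scaling run with `estimate_optimal_ratios_from_models`; the token ids, vocabulary size and the two model configurations are made up, and each config tuple keeps `embed_dim == num_heads * head_dims` so the Decoder residual connections line up.

```
import numpy as np
from inference.scale_utils import estimate_optimal_ratios_from_models

# Toy data: 512 random sequences of 64 token ids drawn from a 1000-token vocabulary.
x_train = np.random.randint(0, 1000, size=(512, 64))
y_train = np.random.randint(0, 1000, size=(512, 64))

# Each tuple is passed straight to build_GPT:
# (input_len, vocab_size, embed_dim, num_decoders, dropout_rate, num_heads, head_dims, fc_dim_factor)
model_configs = [
    (64, 1000, 64, 1, 0.1, 2, 32, 2),
    (64, 1000, 128, 2, 0.1, 4, 32, 2),
]

flops, losses, params = estimate_optimal_ratios_from_models(
    model_configs, train_seq_len=[128, 512],
    x_train=x_train, y_train=y_train, max_epochs=1, batch_size=32)
print(flops.shape, losses.shape, params)
```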
models/GPT.py ADDED
@@ -0,0 +1,167 @@
+ import keras
+ from tokenizer.tokenizer import *
+ from models.attention import *
+ from models.decoder import *
+ from models.embeddings import *
+
+ def FLOP(input_len, vocab_size, embed_dim, num_heads, num_decoders, fc_dim_factor):
+     """
+     Calculate the total number of training FLOPs; see the Chinchilla
+     paper, Appendix F, as reference: https://arxiv.org/pdf/2203.15556.pdf
+
+     Copied from: https://github.com/karpathy/nanoGPT/blob/master/scaling_laws.ipynb
+     """
+     key_size = embed_dim // num_heads
+
+     # embeddings
+     embeddings = 2 * input_len * vocab_size * embed_dim
+
+     # attention
+     # key, query, value projections
+     attention = 2 * 3 * input_len * embed_dim * (key_size * num_heads)
+     # key @ query logits
+     attlogits = 2 * input_len * input_len * (key_size * num_heads)
+     # softmax
+     attsoftmax = 3 * num_heads * input_len * input_len  # 3* is for subtract (max), exp, divide (?)
+     # softmax @ value reductions
+     attvalue = 2 * input_len * input_len * (key_size * num_heads)
+     # final linear
+     attlinear = 2 * input_len * (key_size * num_heads) * embed_dim
+     att = attention + attlogits + attsoftmax + attvalue + attlinear
+     # feed forward
+     dense = 2 * input_len * (embed_dim * embed_dim * fc_dim_factor + embed_dim * embed_dim * fc_dim_factor)
+
+     # logits
+     logits = 2 * input_len * embed_dim * vocab_size
+
+     # this is what you'd expect:
+     # forward_flops = embeddings + num_decoders * (att + dense) + logits
+     # but:
+     # per author correspondence apparently there is a typo in the paper,
+     # they do not count embeddings and logits to reproduce table 4. So instead:
+     forward_flops = num_decoders * (att + dense)
+     backward_flops = 2 * forward_flops  # as in Kaplan et al. 2020
+     total_flops = forward_flops + backward_flops
+
+     return total_flops
+
+ class GPT(keras.layers.Layer):
+     """
+     GPT (Generative Pre-trained Transformer) layer.
+
+     This layer implements the architecture of the GPT model, which consists of multiple decoder layers followed
+     by a linear mapping head for language modeling.
+
+     Parameters:
+         decoder (class): Class representing the decoder layer of the Transformer model.
+         embeddings (class): Class representing the token embeddings.
+         pos_embeddings (class): Class representing the positional embeddings.
+         embedding_size (int): Size of the token embeddings. Default is 1280.
+         vocab_size (int): Size of the vocabulary. Default is 8008.
+         input_len (int): Length of the input sequence. Default is 64.
+         num_decoders (int): Number of decoder layers in the GPT model. Default is 5.
+         dropout_rate (float): Dropout rate used in each decoder layer. Default is 0.1.
+         num_heads (int): Number of attention heads per decoder layer. Default is 32.
+         head_dims (int): Dimensionality of each attention head. Default is 40.
+         fc_dim_factor (int): Width factor of the feed-forward sub-layers. Default is 5.
+
+     Attributes:
+         num_decoders (int): Number of decoder layers in the GPT model.
+         decoders (list): List of decoder layer instances.
+         embeddings (keras.layers.Layer): Token embeddings layer instance.
+         pos_embeddings (keras.layers.Layer): Positional embeddings layer instance.
+         lm_head (keras.layers.Dense): Dense layer for language modeling.
+     """
+
+     def __init__(self, decoder, embeddings, pos_embeddings=None, embedding_size=1280, vocab_size=8008, input_len=64, num_decoders=5, dropout_rate=0.1, num_heads=32, head_dims=40, fc_dim_factor=5):
+         """
+         Initializes the GPT layer.
+
+         Args:
+             decoder (class): Class representing the decoder layer of the Transformer model.
+             embeddings (class): Class representing the token embeddings.
+             pos_embeddings (class): Class representing the positional embeddings.
+             embedding_size (int): Size of the token embeddings. Default is 1280.
+             vocab_size (int): Size of the vocabulary. Default is 8008.
+             input_len (int): Length of the input sequence. Default is 64.
+             num_decoders (int): Number of decoder layers in the GPT model. Default is 5.
+         """
+         super().__init__()
+
+         self.num_decoders = num_decoders
+         self.decoders = []
+         for _ in range(self.num_decoders):
+             self.decoders.append(decoder(dropout_rate, num_heads, head_dims, fc_dim_factor, input_len=input_len))
+
+         self.embeddings = embeddings(input_len, vocab_size + 1, embed_dim=embedding_size)
+
+         self.lm_head = keras.layers.Dense(vocab_size)
+
+         self._config = {'decoder': decoder, 'embeddings': embeddings, 'pos_embeddings': pos_embeddings, 'embedding_size': embedding_size, 'vocab_size': vocab_size, 'input_len': input_len, 'num_decoders': num_decoders}
+
+
+     def call(self, inputs):
+         """
+         Executes the forward pass of the GPT layer.
+
+         Args:
+             inputs: Input tensor representing the token indices.
+
+         Returns:
+             Tensor: Output tensor representing the logits for language modeling.
+         """
+         x = inputs
+
+         x = self.embeddings(x)
+
+         for decoder in self.decoders:
+             x = decoder(x)
+
+         x = self.lm_head(x)
+
+         return x
+
+
+ def build_GPT(input_len,
+               vocab_size,
+               embed_dim,
+               num_decoders,
+               dropout_rate,
+               num_heads,
+               head_dims,
+               fc_dim_factor,
+               optimizer='adam'
+               ):
+     """
+     Builds a GPT (Generative Pre-trained Transformer) model.
+
+     Parameters:
+         input_len (int): The length of the input sequence.
+         vocab_size (int): The size of the vocabulary.
+         embed_dim (int): The dimensionality of the token embeddings.
+         num_decoders (int): The number of decoder layers.
+         dropout_rate (float): The dropout rate to apply within the model.
+         num_heads (int): The number of attention heads in each decoder layer.
+         head_dims (int): The dimensionality of each attention head.
+         fc_dim_factor (int): The factor to determine the dimensionality
+             of the feedforward network within each decoder layer.
+         optimizer (str, optional): The optimizer to use for training.
+             Defaults to 'adam'.
+
+     Returns:
+         tuple: A tuple containing the GPT model and the total number of floating-point operations (FLOPs).
+     """
+     GPT = keras.Sequential()
+     GPT.add(keras.Input(shape=(input_len,)))
+     GPT.add(TokenAndPositionEmbedding(input_len, vocab_size, embed_dim))
+     for _ in range(num_decoders):
+         GPT.add(Decoder(dropout_rate, num_heads, head_dims, fc_dim_factor, input_len))
+     GPT.add(keras.layers.Dense(vocab_size + 1))
+
+     loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+     GPT.compile(optimizer=optimizer, loss=[loss_fn])
+
+     # Calculate the total number of floating-point operations
+     flops = FLOP(input_len, vocab_size, embed_dim, num_heads, num_decoders, fc_dim_factor)
+     return GPT, flops
+
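A sketch that builds a tiny model with `build_GPT`, compares its parameter count with the FLOP estimate, and runs one dummy training step; the sizes and random data are arbitrary.

```
import numpy as np
from models.GPT import build_GPT

# embed_dim must equal num_heads * head_dims so the residuals inside Decoder line up.
GPT, train_flops = build_GPT(input_len=64, vocab_size=2000, embed_dim=128,
                             num_decoders=2, dropout_rate=0.1,
                             num_heads=4, head_dims=32, fc_dim_factor=4)
GPT.summary()
print('params:', GPT.count_params(), 'training FLOPs per sequence:', train_flops)

# One dummy step to confirm the model, loss and optimizer are wired together.
x = np.random.randint(0, 2000, size=(8, 64))
y = np.random.randint(0, 2000, size=(8, 64))
GPT.fit(x, y, batch_size=8, epochs=1)
```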
models/__pycache__/GPT.cpython-310.pyc ADDED
Binary file (5.73 kB).
 
models/__pycache__/attention.cpython-310.pyc ADDED
Binary file (5.8 kB).
 
models/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (2 kB).
 
models/__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (961 Bytes).
 
models/attention.py ADDED
@@ -0,0 +1,222 @@
+ import keras
+ import numpy as np
+
+
+ class Attention(keras.layers.Layer):
+     """
+     Multihead attention layer.
+
+     This layer performs multihead attention on input sequences
+     `(key, query, value)`. It splits the input into multiple heads,
+     applies the attention mechanism independently to each head,
+     and concatenates the outputs.
+
+     Parameters:
+         head_dims (int): Dimensionality of each head.
+         num_heads (int): Number of attention heads.
+         dropout (float): Dropout rate to apply within the attention mechanism.
+
+     Usage:
+     ```
+     attention = Attention(head_dims=40, num_heads=32, dropout=0.2)
+     output, cache = attention([key, query, value])
+     ```
+     """
+
+     def __init__(self, head_dims=40, num_heads=32, dropout=0.2):
+         """
+         Initializes the multihead attention layer.
+
+         Args:
+             head_dims (int): Dimensionality of each head.
+             num_heads (int): Number of attention heads.
+             dropout (float): Dropout rate to apply within the attention mechanism.
+         """
+         super().__init__()
+
+         self.head_dims = head_dims
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.dense_units = self.head_dims * self.num_heads
+
+         self.key = keras.layers.Dense(self.dense_units)
+         self.query = keras.layers.Dense(self.dense_units)
+         self.value = keras.layers.Dense(self.dense_units)
+         self.out = keras.layers.Dense(self.dense_units)
+         self.norm = keras.layers.LayerNormalization(-1)
+         self.dropout = keras.layers.Dropout(self.dropout)
+
+         self.q_norm_factor = 1 / np.sqrt(self.num_heads * self.head_dims)
+
+         self._config = {'head_dims': head_dims, 'num_heads': num_heads, 'dropout': dropout}
+
+     def generate_mask(self, num_words):
+         """
+         Generates a triangular mask to be applied to attention scores
+         to prevent attending to future positions.
+
+         Args:
+             num_words (int): Number of words in the sequence.
+
+         Returns:
+             Tensor: Triangular mask tensor.
+         """
+         tensor = np.full((num_words, num_words), np.inf)  # Initialize tensor with infinity
+         for i in range(num_words):
+             tensor[i, :i + 1] = 0
+         return keras.ops.convert_to_tensor(tensor, dtype="float32")
+
+     def _shape(self, tensor):
+         """
+         Reshapes the input tensor for multihead attention computations.
+
+         Args:
+             tensor (Tensor): Input tensor.
+
+         Returns:
+             Tensor: Reshaped tensor.
+         """
+         bsz = keras.ops.shape(tensor)[0]
+         tensor = keras.ops.reshape(tensor, (bsz, -1, self.num_heads, self.head_dims))
+         tensor = keras.ops.transpose(tensor, (0, 2, 1, 3))
+         return tensor
+
+     def call(self, inputs, use_cache=None):
+         """
+         Forward pass of the multihead attention layer.
+
+         Args:
+             inputs (list): List containing key, query, and value tensors.
+             use_cache (tuple): Cache from a previous attention operation (unsupported).
+
+         Returns:
+             Tensor: Output tensor.
+             tuple: Cache for subsequent attention operations.
+         """
+         k, q, v = inputs
+
+         if use_cache is None:
+             k = self.key(k)
+             q = self.query(q)
+             v = self.value(v)
+
+             k, q, v = self._shape(k), self._shape(q), self._shape(v)
+
+         else:
+             raise NotImplementedError("`use_cache` argument is not supported yet!")
+
+         cache = (k, q, v)
+
+         kq = keras.ops.einsum('bijk, bilk -> bij', k, q)
+         kq *= self.q_norm_factor
+
+         num_words = keras.ops.shape(kq)[-1]
+         bsz = keras.ops.shape(kq)[0]
+
+         kq = keras.ops.reshape(kq, (bsz, 1, -1, num_words))
+
+         kq_copy = keras.ops.copy(kq)
+
+         for counter in range(num_words - 1):
+             kq = keras.ops.append(kq, kq_copy, 1)
+
+         mask = self.generate_mask(num_words)
+         kq = keras.ops.transpose(kq, (0, 2, 1, 3))
+         kq = kq - mask
+         kq = keras.ops.transpose(kq, (0, 2, 1, 3))
+         kq = keras.ops.softmax(kq, -1)
+         kqv = keras.ops.einsum('bijk, bjkl -> bijl', kq, v)
+         kqv = keras.ops.reshape(kqv, (bsz, num_words, -1))
+         kqv = self.norm(kqv)
+         kqv = self.dropout(kqv)
+         kqv = self.out(kqv)
+
+         return kqv, cache
+
+
+ class AttentionTrain(keras.layers.Layer):
+     """
+     Causal multihead self-attention used by the Decoder during training.
+     """
+
+     def _shape(self, tensor):
+         """
+         Reshapes the input tensor for multihead attention computations.
+
+         Args:
+             tensor (Tensor): Input tensor.
+
+         Returns:
+             Tensor: Reshaped tensor.
+         """
+         bsz = keras.ops.shape(tensor)[0]
+         tensor = keras.ops.reshape(tensor, (bsz, -1, self.num_heads, self.head_dims))
+         tensor = keras.ops.transpose(tensor, (0, 2, 1, 3))
+         return tensor
+
+     def generate_mask(self, num_words):
+         """
+         Generates a triangular mask to be applied to attention scores
+         to prevent attending to future positions.
+
+         Args:
+             num_words (int): Number of words in the sequence.
+
+         Returns:
+             Tensor: Triangular mask tensor.
+         """
+         tensor = np.full((num_words, num_words), np.inf)  # Initialize tensor with infinity
+         for i in range(num_words):
+             tensor[i, :i + 1] = 0
+         return keras.ops.convert_to_tensor(tensor, dtype="float32")
+
+     def __init__(self, num_heads, head_dims, dropout=0.2, input_len=64):
+         super().__init__()
+         self.num_heads = num_heads
+         self.head_dims = head_dims
+
+         self.k = keras.layers.Dense(self.num_heads * self.head_dims)
+         self.q = keras.layers.Dense(self.num_heads * self.head_dims)
+         self.v = keras.layers.Dense(self.num_heads * self.head_dims)
+         self.out = keras.layers.Dense(self.num_heads * self.head_dims)
+
+         self.q_norm = 1 / np.sqrt(self.num_heads * self.head_dims)
+         self.mask = self.generate_mask(input_len)
+
+         self.dropout = keras.layers.Dropout(dropout)
+
+     def call(self, inputs):
+
+         k = self.k(inputs)
+         q = self.q(inputs)
+         v = self.v(inputs)
+
+         k, q, v = self._shape(k), self._shape(q), self._shape(v)
+
+         # (b, head, k_token, dims), (b, head, q_token, dims) -> (b, head, q_token, k_token)
+         kq = keras.ops.einsum('bijk, bilk -> bilj', k, q)
+         kq *= self.q_norm
+         kq -= self.mask
+         kq = self.dropout(kq)
+         kq = keras.ops.softmax(kq, -1)
+
+         # (b, head, q_token, k_token), (b, head, k_token, dims) -> (b, head, q_token, dims)
+         kqv = keras.ops.einsum('bilj, bijk -> bilk', kq, v)
+
+         kqv = keras.ops.transpose(kqv, (0, 2, 1, 3))
+
+         bsz = keras.ops.shape(v)[0]
+         kqv = keras.ops.reshape(kqv, (bsz, -1, self.num_heads * self.head_dims))
+         kqv = self.out(kqv)
+
+         return kqv
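A shape-level sketch of `AttentionTrain`, the variant the `Decoder` uses; the batch size, sequence length, and head layout are arbitrary.

```
import numpy as np
from models.attention import AttentionTrain

# Causal self-attention: 4 heads of 32 dims each, i.e. a model width of 128.
attn = AttentionTrain(num_heads=4, head_dims=32, dropout=0.1, input_len=64)
x = np.random.randn(2, 64, 128).astype("float32")
print(attn(x).shape)  # (2, 64, 128)
```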
models/decoder.py ADDED
@@ -0,0 +1,56 @@
+ import keras
+ from models.attention import Attention, AttentionTrain
+
+ class Decoder(keras.layers.Layer):
+     """
+     Decoder layer in a Transformer model architecture.
+
+     This layer implements the decoder component of the Transformer model, which is responsible for generating
+     the output sequence based on the encoded input sequence and previously generated output tokens.
+
+     Parameters:
+         dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
+         num_heads (int): Number of attention heads. Default is 32.
+         head_dims (int): Dimensionality of each attention head. Default is 40.
+         fc_dim_factor (int): Width factor of the feed-forward sub-layer. Default is 5.
+         input_len (int): Length of the input sequence. Default is 64.
+     """
+
+     def __init__(self, dropout_rate=0.2, num_heads=32, head_dims=40, fc_dim_factor=5, input_len=64):
+         """
+         Initializes the Decoder layer.
+
+         Args:
+             dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
+             num_heads (int): Number of attention heads. Default is 32.
+             head_dims (int): Dimensionality of each attention head. Default is 40.
+             fc_dim_factor (int): Width factor of the feed-forward sub-layer. Default is 5.
+             input_len (int): Length of the input sequence. Default is 64.
+         """
+         super().__init__()
+
+         # Layer Normalization for the first sub-layer
+         self.norm1 = keras.layers.LayerNormalization(epsilon=1e-9)
+
+         # Layer Normalization for the second sub-layer
+         self.norm2 = keras.layers.LayerNormalization(epsilon=1e-9)
+
+         # Attention mechanism
+         self.attn = AttentionTrain(head_dims=head_dims, num_heads=num_heads, dropout=dropout_rate, input_len=input_len)
+
+         # Dense layer for the first feed-forward sub-layer
+         self.fc1 = keras.layers.Dense(num_heads * head_dims * fc_dim_factor, activation='gelu')
+
+         # Dense layer for the second feed-forward sub-layer
+         self.fc2 = keras.layers.Dense(num_heads * head_dims, activation='gelu')
+
+         # Dropout layers
+         self.dropout1 = keras.layers.Dropout(dropout_rate)
+         self.dropout2 = keras.layers.Dropout(dropout_rate)
+
+         self._config = {'dropout_rate': dropout_rate}
+
+     def call(self, inputs):
+         # Self-attention sub-layer with residual connection and normalization
+         x = inputs
+         x = self.attn(x)
+         x = self.dropout1(x)
+         out1 = self.norm1(x + inputs)
+
+         # Feed-forward sub-layer with residual connection and normalization
+         x = out1
+         out1 = self.fc2(self.fc1(out1))
+         out1 = self.dropout2(out1)
+         return self.norm2(out1 + x)
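A shape-level sketch of a single `Decoder` block with arbitrary sizes; the input width must equal `num_heads * head_dims` for the residual connections to broadcast.

```
import numpy as np
from models.decoder import Decoder

# 4 heads x 32 dims -> width 128; feed-forward width is 128 * fc_dim_factor.
block = Decoder(dropout_rate=0.1, num_heads=4, head_dims=32,
                fc_dim_factor=4, input_len=64)
x = np.random.randn(2, 64, 128).astype("float32")
print(block(x).shape)  # (2, 64, 128)
```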
models/embeddings.py ADDED
@@ -0,0 +1,17 @@
+ import keras
+
+
+ class TokenAndPositionEmbedding(keras.layers.Layer):
+     """
+     Sums learned token embeddings and learned position embeddings.
+     """
+     def __init__(self, maxlen, vocab_size, embed_dim):
+         super().__init__()
+         self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
+         self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
+
+     def call(self, x):
+         maxlen = keras.ops.shape(x)[-1]
+         positions = keras.ops.arange(0, maxlen, 1)
+         positions = self.pos_emb(positions)
+         x = self.token_emb(x)
+         return x + positions
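A quick sketch of `TokenAndPositionEmbedding` on random token ids; the sizes are arbitrary.

```
import numpy as np
from models.embeddings import TokenAndPositionEmbedding

emb = TokenAndPositionEmbedding(maxlen=64, vocab_size=5000, embed_dim=128)
token_ids = np.random.randint(0, 5000, size=(2, 64))
print(emb(token_ids).shape)  # (2, 64, 128)
```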
tokenizer/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (5.58 kB).
 
tokenizer/tokenizer.py ADDED
@@ -0,0 +1,102 @@
+ import sentencepiece as spm
+ import os
+ import numpy as np
+
+ class SPM_Tokenizer:
+     """
+     A class for tokenizing text data in multiple languages using SentencePiece.
+
+     Attributes:
+         vocab_model_file (str): The file path to the pre-trained SentencePiece vocabulary model file.
+         vocab_size (int): The size of the vocabulary for training the tokenizer.
+         corpus (str): The file path to the corpus used for training the tokenizer if no pre-trained vocabulary model is provided.
+         model_prefix (str): The prefix for the output files generated during training if no pre-trained vocabulary model is provided.
+         input_size (int): The maximum sequence length for tokenized sequences.
+         model_type (str): The type of SentencePiece model to train; default is "unigram".
+         tokenizer (spm.SentencePieceProcessor): The SentencePiece tokenizer object.
+
+     Methods:
+         load_file(file_path): Loads and tokenizes text data from a file.
+         load_dataset(list_files): Loads and tokenizes text data from a list of files, yielding input-output pairs for training.
+
+     Examples:
+     ```
+     >>> # Create a new tokenizer
+     >>> SPM_Tokenizer(vocab_size=5000, corpus='./stories.txt', input_size=100+1)  # context width of the GPT + 1
+     >>> tokenizer = SPM_Tokenizer(vocab_model_file='./tokenizer_.model')
+     Success!
+     >>> tokenizer.tokenizer.encode_as_ids(['Hello World', 'How are you?'])
+     [[3063, 215, 920, 129, 1323], [654, 54, 217, 78]]
+     >>> dataset = tokenizer.load_dataset(['./stories.txt'])
+     >>> for (X, Y) in dataset:
+     >>>     X = np.array(X)[0]
+     >>>     Y = np.array(Y)[0]
+     >>>     tokenizer.tokenizer.decode_ids(X.tolist()), tokenizer.tokenizer.decode_ids(Y.tolist())
+     ('The Project Gutenberg EBook of The Thousand and One Nights, Vol. I., by Anonymous This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the',
+      'Project Gutenberg EBook of The Thousand and One Nights, Vol. I., by Anonymous This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms')
+     ```
+     """
+
+     def __init__(self, vocab_model_file=None, vocab_size=5000, corpus=None, model_prefix='tokenizer_', input_size=65, model_type="unigram"):
+         """
+         Initializes the SPM_Tokenizer object.
+
+         Parameters:
+             vocab_model_file (str): The file path to the pre-trained SentencePiece vocabulary model file.
+             vocab_size (int): The size of the vocabulary for training the tokenizer.
+             corpus (str): The file path to the corpus used for training the tokenizer if no pre-trained vocabulary model is provided.
+             model_prefix (str): The prefix for the output files generated during training if no pre-trained vocabulary model is provided.
+             input_size (int): The maximum sequence length for tokenized sequences.
+             model_type (str): The type of SentencePiece model to train; default is "unigram".
+         """
+         self.input_size = input_size
+         if vocab_model_file is not None and os.path.exists(vocab_model_file):
+             self.tokenizer = spm.SentencePieceProcessor()
+             self.tokenizer.load(vocab_model_file)
+         else:
+             if corpus is None:
+                 raise Exception('A corpus to train the tokenizer must be provided!')
+
+             self.tokenizer = spm.SentencePieceTrainer.train(input=corpus, model_prefix=model_prefix, vocab_size=vocab_size, model_type=model_type)
+
+         if self.tokenizer is not None:
+             print('Success!')
+
+     def load_file(self, file_path):
+         """
+         Loads and tokenizes text data from a file.
+
+         Parameters:
+             file_path (str): The file path to the text file.
+
+         Returns:
+             list: A list of tokenized sequences, each of length `input_size` (the last one is left-padded with zeros).
+         """
+         with open(file_path, 'r', encoding='utf-8') as file:
+             content = file.readlines()
+             content = " ".join(content).replace('\n', ' ')
+             content = self.tokenizer.encode_as_ids(content)
+             # Split the token stream into fixed-size chunks and left-pad the last one.
+             new_content = [content[i:i + self.input_size] for i in range(0, len(content), self.input_size)]
+             num_zeros = self.input_size - len(new_content[-1])
+             padded_list = [0] * num_zeros + new_content[-1]
+             new_content[-1] = padded_list
+             return new_content
+
+     def load_dataset(self, list_files):
+         """
+         Loads and tokenizes text data from a list of files, yielding input-output pairs for training.
+
+         Parameters:
+             list_files (list): A list of file paths to text files.
+
+         Yields:
+             tuple: A tuple containing input and output sequences.
+         """
+         for file in list_files:
+             content = self.load_file(file)
+             X = [line[:-1] for line in content]
+             Y = [line[1:] for line in content]
+             yield X, Y
+
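A sketch of the intended tokenizer workflow, reusing the `./stories.txt` corpus path from the docstring example (hypothetical on your machine): train a SentencePiece model, reload it, and stream next-token training pairs.

```
import numpy as np
from tokenizer.tokenizer import SPM_Tokenizer

# Train a small SentencePiece model (writes tokenizer_.model / tokenizer_.vocab),
# then reload it; input_size = context width of the GPT + 1.
SPM_Tokenizer(vocab_size=2000, corpus='./stories.txt', input_size=65)
tokenizer = SPM_Tokenizer(vocab_model_file='./tokenizer_.model', input_size=65)

for X, Y in tokenizer.load_dataset(['./stories.txt']):
    X, Y = np.array(X), np.array(Y)   # inputs and their one-step-shifted targets
    print(X.shape, Y.shape)           # (num_chunks, 64) each
```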
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .config import *
+ from .utils import *
utils/config.py ADDED
@@ -0,0 +1,78 @@
+ class _C2G_CONFIG:
+     """
+     This class provides functionalities for managing configurations and modules within the C2G system.
+
+     Methods:
+         clean_config(): Resets the configuration dictionary.
+         clean_dir(): Clears the module directory and module list.
+         display_available_modules(): Prints the names of available modules.
+         update_inverse_list(): Updates the inverse dictionary of module names.
+
+     Attributes:
+         _C: A dictionary storing configuration settings.
+         _dir: A dictionary mapping module names to their corresponding objects.
+         _module_list: A list containing names of registered modules.
+
+     Example:
+     ```
+     >>> @register_module('add')
+     >>> def add(a, b):
+     >>>     return a+b
+     >>> add(-10, 12)
+     2
+     >>> cfg.display_available_modules()
+     Available Modules:
+     ADD
+     >>> cfg.clean_dir()
+     >>> cfg.display_available_modules()
+     Available Modules:
+     ```
+     """
+     _C = {}
+     _dir = {}
+     _module_list = []
+
+     def clean_config(self):
+         self._C = {}
+
+     def clean_dir(self):
+         self._dir = {}
+         self._module_list = []
+
+     def display_available_modules(self):
+         print('Available Modules:')
+         for module_name, _ in self._dir.items():
+             print(module_name)
+
+     def update_inverse_list(self):
+         self._inv_dir = {value: key for key, value in self._dir.items()}
+
+
+ # NOTE: `register_module` below and the C2G base classes reference a module-level
+ # `cfg` registry; an instance is created here for that purpose.
+ cfg = _C2G_CONFIG()
+
+
+ class register_module:
+     """
+     This class is a decorator used for registering modules within the C2G system.
+
+     Methods:
+         __init__(self, name): Initializes the decorator with a given module name.
+         __call__(self, module): Registers the module under that name and updates the module list.
+
+     Examples:
+     ```
+     >>> @register_module('add')
+     >>> def add(a, b):
+     >>>     return a+b
+     >>> add(-10, 12)
+     2
+     >>> cfg.display_available_modules()
+     Available Modules:
+     ADD
+     ```
+     """
+     def __init__(self, name):
+         self.name = name.upper()
+
+     def __call__(self, module):
+         cfg._dir[self.name] = module
+         cfg.update_inverse_list()
+         cfg._module_list.append(self.name)
+         return cfg._dir[self.name]
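A registry sketch that relies on the module-level `cfg = _C2G_CONFIG()` instance added above (an assumption about the intended wiring); the `scale` function itself is a made-up example module.

```
from utils.config import cfg, register_module

@register_module('scale')          # registered under the upper-cased name 'SCALE'
def scale(x, factor=2):
    return x * factor

print(cfg._module_list)            # ['SCALE']
print(cfg._dir['SCALE'](3))        # 6
cfg.display_available_modules()    # Available Modules: / SCALE
```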
utils/utils.py ADDED
@@ -0,0 +1,70 @@
+ # `cfg` is the shared registry instance defined in utils/config.py.
+ from .config import cfg
+
+ class BASE_C2G:
+     """
+     This is the base class for C2G models.
+
+     Class Methods:
+         serialize_c2g(cls, config): Serializes the configuration into a dictionary.
+         deserialize_c2g(self, config): Deserializes the configuration from a dictionary.
+         construct_model(self, configs): Abstract method for constructing a model. This needs to be implemented by subclasses.
+
+     Examples:
+     ```
+     >>> @register_module('test_module')
+     >>> class test_module(BASE_C2G):
+     >>>     def __init__(self, a, b):
+     >>>         self._config = {'a':a, 'b':b}
+     >>>         self.a = a
+     >>>         self.b = b
+     >>>     def __call__(self):
+     >>>         return self.a+self.b
+
+     >>> sub_module = test_module(1, 2)
+     >>> sub_module()
+     3
+
+     >>> @register_module('master_module')
+     >>> class master_module(BASE_C2G):
+     >>>     def __init__(self, sub_module):
+     >>>         self._config = {'sub_module': sub_module}
+     >>>         self.sub_module = sub_module
+
+     >>>     def __call__(self):
+     >>>         print(self.sub_module())
+
+     >>> master_mod = master_module(sub_module)
+     >>> master_mod()
+     3
+
+     >>> save_master_module = BASE_C2G()
+     >>> save_master_module.serialize_c2g(master_mod._config)
+     {'BASE_C2G': {'SUB_MODULE': {'TEST_MODULE': {'a': 1, 'b': 2}}}}
+     >>> cfg._module_list
+     ['TEST_MODULE', 'MASTER_MODULE']
+     ```
+
+     """
+     @classmethod
+     def serialize_c2g(cls, config):
+         cls_config = {}
+         for keyword, value in config.items():
+             # Plain values are stored as-is; registered C2G modules are serialized recursively.
+             if value.__class__.__name__.upper() not in cfg._module_list:
+                 cls_config[keyword] = value
+             else:
+                 cls_config[keyword.upper()] = value.serialize_c2g(value._config)
+         cfg._C = {cls.__name__.upper(): cls_config}
+         return cfg._C
+
+     def has_sub_dicts(self, config):
+         for key, val in config.items():
+             if isinstance(val, dict):
+                 return False
+         return True
+
+     def deserialize_c2g(self, config):
+         # `config` is expected to hold a single top-level module name mapped to its parameters.
+         for model, params in config.items():
+             pass
+         return cfg._dir[model](**params)
+
+     def construct_model(self, configs):
+         raise NotImplementedError('`construct_model` method is not implemented. To implement it, define it on the subclasses of BASE_C2G Models.')