abhaskumarsinha committed · Commit ceed47a
1 Parent(s): 841a1f4
Added the alpha version of Corpus2GPT
Files changed:
- inference/__pycache__/inference.cpython-310.pyc +0 -0
- inference/inference.py +99 -0
- inference/sampling_strategies/__pycache__/sample_random.cpython-310.pyc +0 -0
- inference/sampling_strategies/sample_random.py +16 -0
- inference/scale_utils.py +150 -0
- models/GPT.py +167 -0
- models/__pycache__/GPT.cpython-310.pyc +0 -0
- models/__pycache__/attention.cpython-310.pyc +0 -0
- models/__pycache__/decoder.cpython-310.pyc +0 -0
- models/__pycache__/embeddings.cpython-310.pyc +0 -0
- models/attention.py +222 -0
- models/decoder.py +56 -0
- models/embeddings.py +17 -0
- tokenizer/__pycache__/tokenizer.cpython-310.pyc +0 -0
- tokenizer/tokenizer.py +102 -0
- utils/__init__.py +2 -0
- utils/config.py +78 -0
- utils/utils.py +70 -0
inference/__pycache__/inference.cpython-310.pyc
ADDED
Binary file (3.13 kB)
inference/inference.py
ADDED
@@ -0,0 +1,99 @@
import keras
import numpy as np
from .sampling_strategies.sample_random import *


class Generative_inference:
    """
    This class facilitates text generation by utilizing a provided Keras model,
    tokenizer, and search strategy. It allows for the generation of text based
    on an initial prompt.

    Example:
    ```
    >>> inference = Generative_inference(model=model,
    >>>                                  tokenizer=tokenizer,
    >>>                                  search_strategy=random_sampling_strategy)
    >>> inference.generate("Hello World")
    ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ ⁇ Hello WorldAr things sayingWhen ruby...
    ```
    """

    def __init__(self,
                 model,
                 tokenizer,
                 search_strategy=random_sampling_strategy,
                 prompt="Hello World",
                 input_len=64,
                 padding_token=0,
                 **kwargs
                 ):
        """
        Constructor for the Generative_inference class.

        Args:
            model: A Keras model used for text generation.
            tokenizer: Tokenizer used to encode and decode text.
            search_strategy: Strategy used for selecting tokens during generation. Default is `random_sampling_strategy`.
            prompt (str): The initial prompt for text generation. Default is "Hello World".
            input_len (int): Length of the input tokens. Default is 64.
            padding_token (int): Token used for padding. Default is 0.
        """
        self.search_strategy = search_strategy
        self.kwargs = kwargs
        self.model = model
        self.tokenizer = tokenizer
        self.prompt = prompt
        self.padding_token = padding_token
        self.input_len = input_len

    def generate(self,
                 prompt=None,
                 generate_limit=50,
                 **kwargs):
        """
        Generate text based on the provided prompt.

        Args:
            prompt (str): The prompt for text generation. If not provided, uses the default prompt.
            generate_limit (int): Maximum number of tokens to generate. Default is 50.
            **kwargs: Additional keyword arguments to be passed to the search_strategy.

        Returns:
            str: Generated text.
        """
        if prompt is None:
            prompt = self.prompt

        prompt_tokens = self.tokenizer.tokenizer.encode_as_ids(prompt)

        input_prompt_token_len = len(prompt_tokens)

        # Truncate or left-pad the prompt to exactly `input_len` tokens.
        if len(prompt_tokens) > self.input_len:
            prompt_tokens = prompt_tokens[:self.input_len]
        elif len(prompt_tokens) < self.input_len:
            prompt_tokens = [self.padding_token] * (self.input_len - len(prompt_tokens)) + prompt_tokens

        model_input = keras.ops.convert_to_tensor(prompt_tokens)
        model_input = keras.ops.reshape(model_input, (1, self.input_len))

        gen_len = 0
        while gen_len < generate_limit:
            gen_len += 1

            model_output = self.model(model_input)
            output_token = self.search_strategy(outputs=model_output, pos_num=-1, **self.kwargs)
            model_input = keras.ops.convert_to_numpy(model_input)
            # Append the sampled token and slide the context window by one.
            model_input = np.concatenate((model_input, [[output_token]]), -1)
            model_input = model_input[:, 1:]
            # model_input = keras.ops.convert_to_tensor(model_input)

        model_input = keras.ops.reshape(model_input, (self.input_len,))
        model_input = keras.ops.convert_to_numpy(model_input)

        model_output = self.tokenizer.tokenizer.decode_ids(model_input.tolist())

        return model_output
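For context, a minimal end-to-end sketch of how this class is meant to be wired up (not part of this commit; the tokenizer model path and the layer sizes below are made-up illustration values, and the model here is untrained, so the output will be gibberish):

```
from tokenizer.tokenizer import SPM_Tokenizer
from models.GPT import build_GPT
from inference.inference import Generative_inference
from inference.sampling_strategies.sample_random import random_sampling_strategy

# Load a previously trained SentencePiece model (hypothetical path).
tokenizer = SPM_Tokenizer(vocab_model_file='./tokenizer_.model')

# Build a small GPT; embed_dim must equal num_heads * head_dims.
GPT, flops = build_GPT(input_len=64, vocab_size=5000, embed_dim=128,
                       num_decoders=2, dropout_rate=0.1, num_heads=4,
                       head_dims=32, fc_dim_factor=4)

inference = Generative_inference(model=GPT,
                                 tokenizer=tokenizer,
                                 search_strategy=random_sampling_strategy,
                                 input_len=64)
print(inference.generate("Once upon a time", generate_limit=20))
```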
inference/sampling_strategies/__pycache__/sample_random.cpython-310.pyc
ADDED
Binary file (601 Bytes)
inference/sampling_strategies/sample_random.py
ADDED
@@ -0,0 +1,16 @@
import keras
import numpy as np


def random_sampling_strategy(outputs, pos_num=0, k_value=3):
    """
    Samples the next token id from the top-`k_value` logits at position
    `pos_num`, with probabilities given by a softmax over those logits.
    """
    # Accept either (batch, seq_len, vocab) or (seq_len, vocab) outputs.
    if len(keras.ops.shape(outputs)) == 3:
        outputs = outputs[0][pos_num]
    else:
        outputs = outputs[pos_num]

    values, indices = keras.ops.top_k(outputs, k=k_value)
    values = keras.ops.softmax(values, -1)
    values = keras.ops.convert_to_numpy(values)
    indices = keras.ops.convert_to_numpy(indices)

    return np.random.choice(indices, p=values)
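A quick shape sketch of the sampler on stand-in logits (the sizes are arbitrary; any array shaped like the model output works):

```
import numpy as np
from inference.sampling_strategies.sample_random import random_sampling_strategy

fake_logits = np.random.randn(1, 64, 5001)  # stand-in for model(inputs): (batch, seq_len, vocab)
token_id = random_sampling_strategy(fake_logits, pos_num=-1, k_value=5)
print(token_id)  # one of the top-5 token ids at the last position
```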
inference/scale_utils.py
ADDED
@@ -0,0 +1,150 @@
import numpy as np
from models.GPT import build_GPT

# Utils to work with estimation functions

def normalize_list(numbers):
    """
    Normalizes a list of numbers to the range [0, 1].

    Args:
        numbers (list of numeric): List of numbers to be normalized.

    Returns:
        list of float: Normalized list of numbers.
    """
    min_val = min(numbers)
    max_val = max(numbers)
    normalized = [(x - min_val) / (max_val - min_val) for x in numbers]
    return normalized


def estimate_optimal_ratios_from_models(model_configs,
                                        train_seq_len,
                                        x_train,
                                        y_train,
                                        max_epochs,
                                        batch_size):
    """
    Estimate the optimal ratios of model size and number of training tokens from FLOP counts.

    Args:
        model_configs (list): List of tuples representing model configurations.
            Each tuple contains parameters for building the model.
        train_seq_len (list): List of integers representing different numbers of training sequences.
        x_train (numpy array): Input data for training.
        y_train (numpy array): Target data for training.
        max_epochs (int): Maximum number of epochs for training.
        batch_size (int): Batch size for training.

    Returns:
        flops (numpy array): Array of FLOP counts for each experiment.
        loss_history (numpy array): Array of loss histories for each experiment.
        model_params (numpy array): Array of total model parameters for each experiment.
    """
    total_models = len(model_configs)
    total_seq_len = len(train_seq_len)

    print('Total Number of Experiments: ' + str(total_models * total_seq_len))

    experiment_number = 0
    _flops = []
    _loss_history = []
    _model_params = []
    for model_config in model_configs:
        for seq_len in train_seq_len:
            experiment_number += 1
            print('Train Number: ' + str(experiment_number))

            # Build the model and calculate FLOPs
            GPT, flops = build_GPT(*model_config)

            # Train the model
            history = GPT.fit(x_train[:seq_len], y_train[:seq_len], batch_size=batch_size, epochs=max_epochs)

            # Count model parameters
            model_params = GPT.count_params()

            # Extract loss history
            loss_history = history.history['loss']

            # Store results
            _flops.append(flops * seq_len * max_epochs)
            _loss_history.append(loss_history)
            _model_params.append(model_params)

    return (np.array(_flops), np.array(_loss_history), np.array(_model_params))


def estimate_optimal_ratios_from_flops(flop_list,
                                       input_len,
                                       num_heads,
                                       head_dims,
                                       num_decoders,
                                       fc_dim_factor,
                                       vocab_size,
                                       dropout_rate,
                                       x_train,
                                       y_train,
                                       trials_per_flop=2,
                                       batch_size=32):
    """
    Estimates optimal ratios of various model parameters based on FLOP count.

    Args:
        flop_list (list): List of FLOP counts to estimate optimal ratios for.
        input_len (int): Length of the input sequence.
        num_heads (tuple): Tuple containing the minimum and maximum values for the number of attention heads.
        head_dims (tuple): Tuple containing the minimum and maximum values for the dimensionality of attention heads.
        num_decoders (int): Upper bound for the randomly sampled number of decoder layers.
        fc_dim_factor (int): Upper bound for the randomly sampled factor that determines the dimensionality of the fully connected layers.
        vocab_size (int): Size of the vocabulary.
        dropout_rate (float): Dropout rate.
        x_train (numpy.ndarray): Training input data.
        y_train (numpy.ndarray): Training target data.
        trials_per_flop (int, optional): Number of trials per FLOP count. Defaults to 2.
        batch_size (int, optional): Batch size for training. Defaults to 32.

    Returns:
        tuple: Tuple containing loss history, FLOP history, and number of parameters for each trial.
    """
    loss_history = []
    flop_history = []
    parameters = []

    for flop in flop_list:
        for _ in range(trials_per_flop):
            # Sample a random architecture within the given bounds.
            f_num_heads = np.random.randint(num_heads[0], num_heads[1])
            f_head_dims = np.random.randint(head_dims[0], head_dims[1])
            f_embed_dim = f_num_heads * f_head_dims
            f_num_decoders = np.random.randint(1, num_decoders)
            f_fc_dim_factor = np.random.randint(1, fc_dim_factor)

            args = (input_len,
                    vocab_size,
                    f_embed_dim,
                    f_num_decoders,
                    dropout_rate,
                    f_num_heads,
                    f_head_dims,
                    f_fc_dim_factor
                    )

            GPT, flop_per_inference = build_GPT(*args)
            print(GPT.summary())

            epochs = flop // flop_per_inference
            if epochs <= 0:
                raise Exception('The provided FLOP count is too small: ' + str(flop))

            history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)

            loss_history.append(history.history['loss'])
            flop_history.append(flop * batch_size * epochs)
            parameters.append(GPT.count_params())

    return loss_history, flop_history, parameters
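A hedged sketch of how the first estimator might be driven (illustrative only: the toy data and configurations are invented, and each tuple follows the build_GPT argument order input_len, vocab_size, embed_dim, num_decoders, dropout_rate, num_heads, head_dims, fc_dim_factor):

```
import numpy as np
from inference.scale_utils import estimate_optimal_ratios_from_models

model_configs = [
    (64, 1000, 64, 1, 0.1, 2, 32, 2),   # tiny model
    (64, 1000, 128, 2, 0.1, 4, 32, 2),  # slightly larger model
]
x_train = np.random.randint(0, 1000, size=(512, 64))  # toy token ids
y_train = np.random.randint(0, 1000, size=(512, 64))

flops, losses, params = estimate_optimal_ratios_from_models(
    model_configs, train_seq_len=[128, 512],
    x_train=x_train, y_train=y_train, max_epochs=1, batch_size=32)
print(flops.shape, params)  # one entry per (config, seq_len) experiment
```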
models/GPT.py
ADDED
@@ -0,0 +1,167 @@
import keras
from tokenizer.tokenizer import *
from models.attention import *
from models.decoder import *
from models.embeddings import *

def FLOP(input_len, vocab_size, embed_dim, num_heads, num_decoders, fc_dim_factor):
    """
    Calculate the total number of FLOPs; see Appendix F of the Chinchilla
    paper for reference: https://arxiv.org/pdf/2203.15556.pdf

    Copied from: https://github.com/karpathy/nanoGPT/blob/master/scaling_laws.ipynb
    """
    key_size = embed_dim // num_heads

    # embeddings
    embeddings = 2 * input_len * vocab_size * embed_dim

    # attention
    # key, query, value projections
    attention = 2 * 3 * input_len * embed_dim * (key_size * num_heads)
    # key @ query logits
    attlogits = 2 * input_len * input_len * (key_size * num_heads)
    # softmax
    attsoftmax = 3 * num_heads * input_len * input_len  # 3* is for subtract (max), exp, divide (?)
    # softmax @ value reductions
    attvalue = 2 * input_len * input_len * (key_size * num_heads)
    # final linear
    attlinear = 2 * input_len * (key_size * num_heads) * embed_dim
    att = attention + attlogits + attsoftmax + attvalue + attlinear
    # feed forward
    dense = 2 * input_len * (embed_dim * embed_dim * fc_dim_factor + embed_dim * embed_dim * fc_dim_factor)

    # logits
    logits = 2 * input_len * embed_dim * vocab_size

    # this is what you'd expect:
    # forward_flops = embeddings + num_decoders * (att + dense) + logits
    # but:
    # per author correspondence apparently there is a typo in the paper;
    # they do not count embeddings and logits to reproduce Table 4. So instead:
    forward_flops = num_decoders * (att + dense)
    backward_flops = 2 * forward_flops  # as in Kaplan et al. 2020
    total_flops = forward_flops + backward_flops

    return total_flops


class GPT(keras.layers.Layer):
    """
    GPT (Generative Pre-trained Transformer) layer.

    This layer implements the architecture of the GPT model, which consists of multiple decoder layers followed
    by a linear mapping head for language modeling.

    Parameters:
        decoder (class): Class representing the decoder layer of the Transformer model.
        embeddings (class): Class representing the token embeddings.
        pos_embeddings (class): Class representing the positional embeddings.
        embedding_size (int): Size of the token embeddings. Default is 1280.
        vocab_size (int): Size of the vocabulary. Default is 8008.
        input_len (int): Length of the input sequence. Default is 64.
        num_decoders (int): Number of decoder layers in the GPT model. Default is 5.
        dropout_rate (float): Dropout rate used inside each decoder layer. Default is 0.1.
        num_heads (int): Number of attention heads per decoder layer. Default is 32.
        head_dims (int): Dimensionality of each attention head. Default is 40.
        fc_dim_factor (int): Factor that determines the dimensionality of the feed-forward sub-layer. Default is 5.

    Attributes:
        num_decoders (int): Number of decoder layers in the GPT model.
        decoders (list): List of decoder layer instances.
        embeddings (keras.layers.Layer): Token embeddings layer instance.
        pos_embeddings (keras.layers.Layer): Positional embeddings layer instance.
        lm_head (keras.layers.Dense): Dense layer for language modeling.
    """

    def __init__(self, decoder, embeddings, pos_embeddings=None, embedding_size=1280, vocab_size=8008, input_len=64, num_decoders=5, dropout_rate=0.1, num_heads=32, head_dims=40, fc_dim_factor=5):
        """
        Initializes the GPT layer.

        Args:
            decoder (class): Class representing the decoder layer of the Transformer model.
            embeddings (class): Class representing the token embeddings.
            pos_embeddings (class): Class representing the positional embeddings.
            embedding_size (int): Size of the token embeddings. Default is 1280.
            vocab_size (int): Size of the vocabulary. Default is 8008.
            input_len (int): Length of the input sequence. Default is 64.
            num_decoders (int): Number of decoder layers in the GPT model. Default is 5.
            dropout_rate (float): Dropout rate used inside each decoder layer. Default is 0.1.
            num_heads (int): Number of attention heads per decoder layer. Default is 32.
            head_dims (int): Dimensionality of each attention head. Default is 40.
            fc_dim_factor (int): Factor that determines the dimensionality of the feed-forward sub-layer. Default is 5.
        """
        super().__init__()

        self.num_decoders = num_decoders
        self.decoders = []
        for _ in range(self.num_decoders):
            self.decoders.append(decoder(dropout_rate, num_heads, head_dims, fc_dim_factor, input_len=input_len))

        self.embeddings = embeddings(input_len, vocab_size + 1, embed_dim=embedding_size)

        self.lm_head = keras.layers.Dense(vocab_size)

        self._config = {'decoder': decoder, 'embeddings': embeddings, 'pos_embeddings': pos_embeddings, 'embedding_size': embedding_size, 'vocab_size': vocab_size, 'input_len': input_len, 'num_decoders': num_decoders}

    def call(self, inputs):
        """
        Executes the forward pass of the GPT layer.

        Args:
            inputs: Input tensor representing the token indices.

        Returns:
            Tensor: Output tensor representing the logits for language modeling.
        """
        x = inputs

        x = self.embeddings(x)

        for decoder in self.decoders:
            x = decoder(x)

        x = self.lm_head(x)

        return x


def build_GPT(input_len,
              vocab_size,
              embed_dim,
              num_decoders,
              dropout_rate,
              num_heads,
              head_dims,
              fc_dim_factor,
              optimizer='adam'
              ):
    """
    Builds a GPT (Generative Pre-trained Transformer) model.

    Parameters:
        input_len (int): The length of the input sequence.
        vocab_size (int): The size of the vocabulary.
        embed_dim (int): The dimensionality of the token embeddings.
        num_decoders (int): The number of decoder layers.
        dropout_rate (float): The dropout rate to apply within the model.
        num_heads (int): The number of attention heads in each decoder layer.
        head_dims (int): The dimensionality of each attention head.
        fc_dim_factor (int): The factor that determines the dimensionality
            of the feedforward network within each decoder layer.
        optimizer (str, optional): The optimizer to use for training.
            Defaults to 'adam'.

    Returns:
        tuple: A tuple containing the GPT model and the total number of floating-point operations (FLOPs).
    """
    GPT = keras.Sequential()
    GPT.add(keras.Input(shape=(input_len,)))
    GPT.add(TokenAndPositionEmbedding(input_len, vocab_size, embed_dim))
    for _ in range(num_decoders):
        GPT.add(Decoder(dropout_rate, num_heads, head_dims, fc_dim_factor, input_len))
    GPT.add(keras.layers.Dense(vocab_size + 1))

    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    GPT.compile(optimizer=optimizer, loss=[loss_fn])

    # Calculate the total number of floating-point operations
    flops = FLOP(input_len, vocab_size, embed_dim, num_heads, num_decoders, fc_dim_factor)
    return GPT, flops
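A small usage sketch of build_GPT (the sizes are arbitrary; note that embed_dim has to equal num_heads * head_dims so the residual additions inside Decoder line up):

```
from models.GPT import build_GPT

GPT, flops = build_GPT(input_len=64, vocab_size=1000, embed_dim=128,
                       num_decoders=2, dropout_rate=0.1, num_heads=4,
                       head_dims=32, fc_dim_factor=4)
GPT.summary()
print('Parameters:', GPT.count_params())
print('Estimated training FLOPs per sequence:', flops)
```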
models/__pycache__/GPT.cpython-310.pyc
ADDED
Binary file (5.73 kB)
models/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (5.8 kB)
models/__pycache__/decoder.cpython-310.pyc
ADDED
Binary file (2 kB)
models/__pycache__/embeddings.cpython-310.pyc
ADDED
Binary file (961 Bytes)
models/attention.py
ADDED
@@ -0,0 +1,222 @@
import keras
import numpy as np


class Attention(keras.layers.Layer):
    """
    Multihead attention layer.

    This layer performs multihead attention on input sequences
    `(key, query, value)`. It splits the input into multiple heads,
    applies the attention mechanism independently to each head,
    and concatenates the outputs.

    Parameters:
        head_dims (int): Dimensionality of each head.
        num_heads (int): Number of attention heads.
        dropout (float): Dropout rate to apply within the attention mechanism.

    Usage:
    ```
    attention = Attention(head_dims=40, num_heads=32, dropout=0.2)
    output, cache = attention([key, query, value])
    ```
    """

    def __init__(self, head_dims=40, num_heads=32, dropout=0.2):
        """
        Initializes the multihead attention layer.

        Args:
            head_dims (int): Dimensionality of each head.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout rate to apply within the attention mechanism.
        """
        super().__init__()

        self.head_dims = head_dims
        self.num_heads = num_heads
        self.dropout = dropout
        self.dense_units = self.head_dims * self.num_heads

        self.key = keras.layers.Dense(self.dense_units)
        self.query = keras.layers.Dense(self.dense_units)
        self.value = keras.layers.Dense(self.dense_units)
        self.out = keras.layers.Dense(self.dense_units)
        self.norm = keras.layers.LayerNormalization(-1)
        self.dropout = keras.layers.Dropout(self.dropout)

        self.q_norm_factor = 1 / np.sqrt(self.num_heads * self.head_dims)

        self._config = {'head_dims': head_dims, 'num_heads': num_heads, 'dropout': dropout}

    def generate_mask(self, num_words):
        """
        Generates a triangular mask to be applied to attention scores
        to prevent attending to future positions.

        Args:
            num_words (int): Number of words in the sequence.

        Returns:
            Tensor: Triangular mask tensor.
        """
        tensor = np.full((num_words, num_words), np.inf)  # Initialize tensor with infinity
        for i in range(num_words):
            tensor[i, :i + 1] = 0
        return keras.ops.convert_to_tensor(tensor, dtype="float32")

    def _shape(self, tensor):
        """
        Reshapes the input tensor for multihead attention computations.

        Args:
            tensor (Tensor): Input tensor.

        Returns:
            Tensor: Reshaped tensor.
        """
        bsz = keras.ops.shape(tensor)[0]
        tensor = keras.ops.reshape(tensor, (bsz, -1, self.num_heads, self.head_dims))
        tensor = keras.ops.transpose(tensor, (0, 2, 1, 3))
        return tensor

    def call(self, inputs, use_cache=None):
        """
        Forward pass of the multihead attention layer.

        Args:
            inputs (list): List containing key, query, and value tensors.
            use_cache (tuple): Cache from a previous attention operation (unsupported).

        Returns:
            Tensor: Output tensor.
            tuple: Cache for subsequent attention operations.
        """
        k, q, v = inputs

        if use_cache is None:
            k = self.key(k)
            q = self.query(q)
            v = self.value(v)

            k, q, v = self._shape(k), self._shape(q), self._shape(v)

        else:
            raise NotImplementedError("`use_cache` argument is not supported yet!")

        cache = (k, q, v)

        kq = keras.ops.einsum('bijk, bilk -> bij', k, q)
        kq *= self.q_norm_factor

        num_words = keras.ops.shape(kq)[-1]
        bsz = keras.ops.shape(kq)[0]

        kq = keras.ops.reshape(kq, (bsz, 1, -1, num_words))

        kq_copy = keras.ops.copy(kq)

        for counter in range(num_words - 1):
            kq = keras.ops.append(kq, kq_copy, 1)

        mask = self.generate_mask(num_words)
        kq = keras.ops.transpose(kq, (0, 2, 1, 3))
        kq = kq - mask
        kq = keras.ops.transpose(kq, (0, 2, 1, 3))
        kq = keras.ops.softmax(kq, -1)
        kqv = keras.ops.einsum('bijk, bjkl -> bijl', kq, v)
        kqv = keras.ops.reshape(kqv, (bsz, num_words, -1))
        kqv = self.norm(kqv)
        kqv = self.dropout(kqv)
        kqv = self.out(kqv)

        return kqv, cache


class AttentionTrain(keras.layers.Layer):
    """
    Multihead self-attention layer with a fixed causal mask of length
    `input_len`, used by the decoder blocks during training.
    """

    def _shape(self, tensor):
        """
        Reshapes the input tensor for multihead attention computations.

        Args:
            tensor (Tensor): Input tensor.

        Returns:
            Tensor: Reshaped tensor.
        """
        bsz = keras.ops.shape(tensor)[0]
        tensor = keras.ops.reshape(tensor, (bsz, -1, self.num_heads, self.head_dims))
        tensor = keras.ops.transpose(tensor, (0, 2, 1, 3))
        return tensor

    def generate_mask(self, num_words):
        """
        Generates a triangular mask to be applied to attention scores
        to prevent attending to future positions.

        Args:
            num_words (int): Number of words in the sequence.

        Returns:
            Tensor: Triangular mask tensor.
        """
        tensor = np.full((num_words, num_words), np.inf)  # Initialize tensor with infinity
        for i in range(num_words):
            tensor[i, :i + 1] = 0
        return keras.ops.convert_to_tensor(tensor, dtype="float32")

    def __init__(self, num_heads, head_dims, dropout=0.2, input_len=64):
        super().__init__()
        self.num_heads = num_heads
        self.head_dims = head_dims

        self.k = keras.layers.Dense(self.num_heads * self.head_dims)
        self.q = keras.layers.Dense(self.num_heads * self.head_dims)
        self.v = keras.layers.Dense(self.num_heads * self.head_dims)
        self.out = keras.layers.Dense(self.num_heads * self.head_dims)

        self.q_norm = 1 / keras.ops.sqrt(self.num_heads * self.head_dims)
        self.mask = self.generate_mask(input_len)

        self.dropout = keras.layers.Dropout(dropout)

    def call(self, inputs):
        k = self.k(inputs)
        q = self.q(inputs)
        v = self.v(inputs)

        k, q, v = self._shape(k), self._shape(q), self._shape(v)

        # (b, head, k_token, dims), (b, head, q_token, dims) -> (b, head, q_token, k_token)
        kq = keras.ops.einsum('bijk, bilk -> bilj', k, q)
        kq *= self.q_norm
        kq -= self.mask
        kq = self.dropout(kq)
        kq = keras.ops.softmax(kq, -1)

        # (b, head, q_token, k_token), (b, head, k_token, dims) -> (b, head, q_token, dims)
        kqv = keras.ops.einsum('bilj, bijk -> bilk', kq, v)

        kqv = keras.ops.transpose(kqv, (0, 2, 1, 3))

        bsz = keras.ops.shape(v)[0]
        kqv = keras.ops.reshape(kqv, (bsz, -1, self.num_heads * self.head_dims))
        kqv = self.out(kqv)

        return kqv
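A shape sketch for AttentionTrain on random data (toy sizes; the last input dimension must be num_heads * head_dims):

```
import keras
import numpy as np
from models.attention import AttentionTrain

attn = AttentionTrain(num_heads=4, head_dims=16, dropout=0.1, input_len=10)
x = np.random.randn(2, 10, 64).astype('float32')  # (batch, tokens, num_heads * head_dims)
y = attn(x)
print(keras.ops.shape(y))  # (2, 10, 64)
```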
models/decoder.py
ADDED
@@ -0,0 +1,56 @@
import keras
from models.attention import Attention, AttentionTrain

class Decoder(keras.layers.Layer):
    """
    Decoder layer in a Transformer model architecture.

    This layer implements the decoder component of the Transformer model, which is responsible for generating
    the output sequence based on the encoded input sequence and previously generated output tokens.

    Parameters:
        dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
        num_heads (int): Number of attention heads. Default is 32.
        head_dims (int): Dimensionality of each attention head. Default is 40.
        fc_dim_factor (int): Factor that determines the dimensionality of the feed-forward sub-layer. Default is 5.
        input_len (int): Length of the input sequence. Default is 64.
    """

    def __init__(self, dropout_rate=0.2, num_heads=32, head_dims=40, fc_dim_factor=5, input_len=64):
        """
        Initializes the Decoder layer.

        Args:
            dropout_rate (float): Dropout rate applied to the outputs of each sub-layer. Default is 0.2.
            num_heads (int): Number of attention heads. Default is 32.
            head_dims (int): Dimensionality of each attention head. Default is 40.
            fc_dim_factor (int): Factor that determines the dimensionality of the feed-forward sub-layer. Default is 5.
            input_len (int): Length of the input sequence. Default is 64.
        """
        super().__init__()

        # Layer Normalization for the first sub-layer
        self.norm1 = keras.layers.LayerNormalization(epsilon=1e-9)

        # Layer Normalization for the second sub-layer
        self.norm2 = keras.layers.LayerNormalization(epsilon=1e-9)

        # Attention mechanism
        self.attn = AttentionTrain(head_dims=head_dims, num_heads=num_heads, dropout=dropout_rate, input_len=input_len)

        # Dense layer for the first feed-forward sub-layer
        self.fc1 = keras.layers.Dense(num_heads * head_dims * fc_dim_factor, activation='gelu')

        # Dense layer for the second feed-forward sub-layer
        self.fc2 = keras.layers.Dense(num_heads * head_dims, activation='gelu')

        # Dropout layers
        self.dropout1 = keras.layers.Dropout(dropout_rate)
        self.dropout2 = keras.layers.Dropout(dropout_rate)

        self._config = {'dropout_rate': dropout_rate}

    def call(self, inputs):
        x = inputs
        x = self.attn(x)
        x = self.dropout1(x)
        out1 = self.norm1(x + inputs)

        x = out1
        out1 = self.fc2(self.fc1(out1))
        out1 = self.dropout2(out1)
        return self.norm2(out1 + x)
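The block keeps the feature dimension at num_heads * head_dims, so decoders can be stacked freely; a quick sketch with random data (toy sizes):

```
import keras
import numpy as np
from models.decoder import Decoder

block = Decoder(dropout_rate=0.1, num_heads=4, head_dims=16, fc_dim_factor=2, input_len=12)
x = np.random.randn(2, 12, 64).astype('float32')  # 64 == num_heads * head_dims
print(keras.ops.shape(block(x)))  # (2, 12, 64)
```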
models/embeddings.py
ADDED
@@ -0,0 +1,17 @@
import math
import keras
import numpy as np


class TokenAndPositionEmbedding(keras.layers.Layer):
    """
    Sums learned token embeddings and learned positional embeddings for an
    input sequence of token ids.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = keras.ops.shape(x)[-1]
        positions = keras.ops.arange(0, maxlen, 1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
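A minimal shape check (toy sizes): token ids of shape (batch, maxlen) come back as (batch, maxlen, embed_dim) with positional information added:

```
import keras
import numpy as np
from models.embeddings import TokenAndPositionEmbedding

emb = TokenAndPositionEmbedding(maxlen=8, vocab_size=100, embed_dim=32)
ids = np.random.randint(0, 100, size=(2, 8))
print(keras.ops.shape(emb(ids)))  # (2, 8, 32)
```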
tokenizer/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (5.58 kB)
tokenizer/tokenizer.py
ADDED
@@ -0,0 +1,102 @@
import sentencepiece as spm
import os
import re
import numpy as np


class SPM_Tokenizer:
    """
    A class for tokenizing text data in multiple languages using SentencePiece.

    Attributes:
        vocab_model_file (str): The file path to the pre-trained SentencePiece vocabulary model file.
        vocab_size (int): The size of the vocabulary for training the tokenizer.
        corpus (str): The file path to the corpus used for training the tokenizer if no pre-trained vocabulary model is provided.
        model_prefix (str): The prefix for the output files generated during training if no pre-trained vocabulary model is provided.
        input_size (int): The maximum sequence length for tokenized sequences.
        model_type (str): The type of SentencePiece model to train, default is "unigram".
        tokenizer (spm.SentencePieceProcessor): The SentencePiece tokenizer object.

    Methods:
        load_file(file_path): Loads and tokenizes text data from a file.
        load_dataset(list_files): Loads and tokenizes text data from a list of files, yielding input-output pairs for training.

    Examples:
    ```
    >>> # Create a new Tokenizer
    >>> SPM_Tokenizer(vocab_size=5000, corpus='./stories.txt', input_size=100+1)  # Context-width of GPT + 1
    >>> tokenizer = SPM_Tokenizer(vocab_model_file='./tokenizer_.model')
    Success!
    >>> tokenizer.tokenizer.encode_as_ids(['Hello World', 'How are you?'])
    [[3063, 215, 920, 129, 1323], [654, 54, 217, 78]]
    >>> dataset = tokenizer.load_dataset(['./stories.txt'])
    >>> for (X, Y) in dataset:
    >>>     X = np.array(X)[0]
    >>>     Y = np.array(Y)[0]
    >>> tokenizer.tokenizer.decode_ids(X.tolist()), tokenizer.tokenizer.decode_ids(Y.tolist())
    ('The Project Gutenberg EBook of The Thousand and One Nights, Vol. I., by Anonymous This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the',
     'Project Gutenberg EBook of The Thousand and One Nights, Vol. I., by Anonymous This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms')
    ```
    """

    def __init__(self, vocab_model_file=None, vocab_size=5000, corpus=None, model_prefix='tokenizer_', input_size=65, model_type="unigram"):
        """
        Initializes the SPM_Tokenizer object.

        Parameters:
            vocab_model_file (str): The file path to the pre-trained SentencePiece vocabulary model file.
            vocab_size (int): The size of the vocabulary for training the tokenizer.
            corpus (str): The file path to the corpus used for training the tokenizer if no pre-trained vocabulary model is provided.
            model_prefix (str): The prefix for the output files generated during training if no pre-trained vocabulary model is provided.
            input_size (int): The maximum sequence length for tokenized sequences.
            model_type (str): The type of SentencePiece model to train, default is "unigram".
        """
        self.input_size = input_size
        if vocab_model_file is not None and os.path.exists(vocab_model_file):
            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.load(vocab_model_file)
        else:
            if corpus is None:
                raise Exception('A corpus to train the tokenizer must be provided!')

            self.tokenizer = spm.SentencePieceTrainer.train(input=corpus, model_prefix=model_prefix, vocab_size=vocab_size, model_type=model_type)

        if self.tokenizer is not None:
            print('Success!')

    def load_file(self, file_path):
        """
        Loads and tokenizes text data from a file.

        Parameters:
            file_path (str): The file path to the text file.

        Returns:
            list: A list of tokenized sequences.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.readlines()
        content = " ".join(content).replace('\n', ' ')
        content = self.tokenizer.encode_as_ids(content)
        # Split the token stream into fixed-size chunks and left-pad the last one.
        new_content = [content[i:i + self.input_size] for i in range(0, len(content), self.input_size)]
        num_zeros = self.input_size - len(new_content[-1])
        padded_list = [0] * num_zeros + new_content[-1]
        new_content[-1] = padded_list
        return new_content

    def load_dataset(self, list_files):
        """
        Loads and tokenizes text data from a list of files, yielding input-output pairs for training.

        Parameters:
            list_files (list): A list of file paths to text files.

        Yields:
            tuple: A tuple containing input and output sequences.
        """
        for file in list_files:
            content = self.load_file(file)
            # Next-token prediction pairs: Y is X shifted one token to the right.
            X = [line[:-1] for line in content]
            Y = [line[1:] for line in content]
            yield X, Y
utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .config import *
from .utils import *
utils/config.py
ADDED
@@ -0,0 +1,78 @@
class _C2G_CONFIG:
    """
    This class provides functionalities for managing configurations and modules within the C2G system.

    Methods:
        clean_config(): Resets the configuration dictionary.
        clean_dir(): Clears the module directory and module list.
        display_available_modules(): Prints the names of available modules.
        update_inverse_list(): Updates the inverse dictionary of module names.

    Attributes:
        _C: A dictionary storing configuration settings.
        _dir: A dictionary mapping module names to their corresponding objects.
        _module_list: A list containing names of registered modules.

    Example:
    ```
    >>> @register_module('add')
    >>> def add(a, b):
    >>>     return a + b
    >>> add(-10, 12)
    2
    >>> cfg.display_available_modules()
    Available Modules:
    ADD
    >>> cfg.clean_dir()
    >>> cfg.display_available_modules()
    Available Modules:
    ```
    """
    _C = {}
    _dir = {}
    _module_list = []

    def clean_config(self):
        self._C = {}

    def clean_dir(self):
        self._dir = {}
        self._module_list = []

    def display_available_modules(self):
        print('Available Modules:')
        for module_name, _ in self._dir.items():
            print(module_name)

    def update_inverse_list(self):
        self._inv_dir = {value: key for key, value in self._dir.items()}


class register_module:
    """
    This class is a decorator used for registering modules within the C2G system.

    Methods:
        __init__(self, name): Initializes the module with a given name.
        __call__(self, module): Registers the module with the provided name and updates the module list.

    Examples:
    ```
    >>> @register_module('add')
    >>> def add(a, b):
    >>>     return a + b
    >>> add(-10, 12)
    2
    >>> cfg.display_available_modules()
    Available Modules:
    ADD
    ```
    """
    def __init__(self, name):
        self.name = name.upper()

    def __call__(self, module):
        cfg._dir[self.name] = module
        cfg.update_inverse_list()
        cfg._module_list.append(self.name)
        return cfg._dir[self.name]
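A hedged sketch of the registry flow. register_module refers to a module-level cfg instance that is not created in this file; the sketch below assumes it is meant to be a shared _C2G_CONFIG instance and wires one up explicitly:

```
import utils.config as config
from utils.config import _C2G_CONFIG, register_module

config.cfg = cfg = _C2G_CONFIG()  # assumption: cfg is a shared instance defined elsewhere in the package

@register_module('add')
def add(a, b):
    return a + b

print(add(-10, 12))              # 2 -- the decorator hands the function back unchanged
cfg.display_available_modules()  # Available Modules: ADD
```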
utils/utils.py
ADDED
@@ -0,0 +1,70 @@
class BASE_C2G:
    """
    This is the base class for C2G models.

    Class Methods:
        serialize_c2g(cls, config): Serializes the configuration into a dictionary.
        deserialize_c2g(self, config): Deserializes the configuration from a dictionary.
        construct_model(self, configs): Abstract method for constructing a model. This needs to be implemented by subclasses.

    Examples:
    ```
    >>> @register_module('test_module')
    >>> class test_module(BASE_C2G):
    >>>     def __init__(self, a, b):
    >>>         self._config = {'a': a, 'b': b}
    >>>         self.a = a
    >>>         self.b = b
    >>>     def __call__(self):
    >>>         return self.a + self.b

    >>> sub_module = test_module(1, 2)
    >>> sub_module()
    3

    >>> @register_module('master_module')
    >>> class master_module(BASE_C2G):
    >>>     def __init__(self, sub_module):
    >>>         self._config = {'sub_module': sub_module}
    >>>         self.sub_module = sub_module
    >>>     def __call__(self):
    >>>         print(self.sub_module())

    >>> master_mod = master_module(sub_module)
    >>> master_mod()
    3

    >>> save_master_module = BASE_C2G()
    >>> save_master_module.serialize_c2g(master_mod._config)
    {'BASE_C2G': {'SUB_MODULE': {'TEST_MODULE': {'a': 1, 'b': 2}}}}
    >>> cfg._module_list
    ['TEST_MODULE', 'MASTER_MODULE']
    ```
    """
    @classmethod
    def serialize_c2g(cls, config):
        cls_config = {}
        for keyword, value in config.items():
            if value.__class__.__name__.upper() not in cfg._module_list:
                # if value.__name__.upper() not in cfg._module_list:
                cls_config[keyword] = value
            else:
                cls_config[keyword.upper()] = value.serialize_c2g(value._config)
        cfg._C = {cls.__name__.upper(): cls_config}
        return cfg._C

    def has_sub_dicts(self, config):
        for key, val in config.items():
            if isinstance(val, dict):
                return False
        return True

    def deserialize_c2g(self, config):
        for model, params in config.items():
            pass
        return cfg._dir[model](**params)

    def construct_model(self, configs):
        raise NotImplementedError('`construct_model` method is not implemented. To implement it, define it on the subclasses of BASE_C2G Models.')