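"""MinimalGPT: train, save/load, and run a small GPT-style language model
with TensorFlow.

The script reads a whitespace-tokenized corpus in chunks, fits a
decoder-only model, and can persist the model weights (pickle) and the
TextVectorization vocabulary (JSON). It can be imported for its helpers or
driven from the command line (see the argparse flags under __main__).
"""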
import os
import json
import pickle
import argparse
import sys

import tensorflow as tf
from tqdm import tqdm

from GPT import *

def save_module(save_weights, model, vectorizer, save_tokenizer):
    # Save the GPT model weights. get_weights() returns plain NumPy arrays,
    # which pickle reliably (unlike tf.Variable objects) and pair with
    # model.set_weights() on load.
    with open(save_weights, 'wb') as file:
        pickle.dump(model.get_weights(), file)

    # Save the vectorizer vocabulary
    vocabulary = vectorizer.get_vocabulary()
    # Encode the vocabulary as JSON-compatible strings
    encoded_vocabulary = [word.encode('unicode_escape').decode('utf-8') for word in vocabulary]
    # Drop the two special tokens ('' and '[UNK]') that TextVectorization
    # prepends; set_vocabulary() re-adds them on load
    encoded_vocabulary = encoded_vocabulary[2:]
    # Save the encoded vocabulary to a JSON file
    with open(save_tokenizer, 'w') as f:
        json.dump(encoded_vocabulary, f)
    print("Vocabulary size saved: " + str(len(encoded_vocabulary)))

def read_file(f, vectorizer, chunk_size=1024, starting_chunk=0, ending_chunk=5, gpt_input=10):
    i = 0
    while True:
        data = f.read(chunk_size)
        if not data or i > ending_chunk:
            break
        if starting_chunk <= i <= ending_chunk:
            file_contents = data.split()
            input_tokens, output_tokens = [], []
            # Slide a window of gpt_input tokens over the chunk; the token
            # that follows each window is the training target
            for j in range(len(file_contents) - gpt_input - 1):
                input_tokens += [file_contents[j : j + gpt_input]]
                output_tokens += [file_contents[j + gpt_input]]
            X = [' '.join(input_tokens[j]) for j in range(len(input_tokens))]
            Y = output_tokens
            X = vectorizer(X)
            Y = vectorizer(Y)
            # Pack inputs and targets side by side: (n_examples, gpt_input + 1)
            output = tf.concat([X, Y], 1)
            yield output
        i += 1
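# Each tensor yielded by read_file() has shape (n_examples, gpt_input + 1):
# the first gpt_input columns are the input window and the last column is the
# target token. E.g. with gpt_input=3, the text "a b c d e f" yields the rows
# [a b c | d] and [b c d | e] (vectorized to token ids).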

def get_model(gpt_input, d_model, h, vocab_size, decoder_stacks, GPT_attention):
    # Keras expects the shape as a tuple, hence (gpt_input,)
    input_words = tf.keras.layers.Input((gpt_input,))
    # +2 accounts for the vectorizer's special tokens ('' and '[UNK]')
    embedding = tf.keras.layers.Embedding(vocab_size + 2, d_model)(input_words)
    positional_enc = PositionalEmbedding(words=gpt_input, embedding_size=d_model)(embedding)
    decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(positional_enc)
    for _ in range(decoder_stacks - 1):
        decoder = Decoder(num_heads=h, key_dim=gpt_input, key_embedding=d_model, GPT_attention=GPT_attention)(decoder)
    decoder = tf.keras.layers.Flatten()(decoder)
    linear_layer = tf.keras.layers.Dense(vocab_size + 3)(decoder)
    softmax = tf.nn.softmax(linear_layer)
    GPT = tf.keras.Model(inputs=input_words, outputs=softmax)
    return GPT
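# Quick shape check (illustrative values; PositionalEmbedding and Decoder are
# assumed to come from the GPT module and to preserve the sequence shape):
#   m = get_model(gpt_input=10, d_model=128, h=8, vocab_size=1000,
#                 decoder_stacks=1, GPT_attention=False)
#   m(tf.zeros((1, 10))).shape  # -> (1, vocab_size + 3) == (1, 1003)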

def MinimalGPT(data_path='.',
               learning_rate=0,
               output_length=0,
               epochs=1,
               batch_size=1,
               gpt_input=10,
               d_model=128,
               h=8,
               decoder_stacks=1,
               starting_chunk=0,
               ending_chunk=5,
               chunk_size=10,
               token_end=40000,
               vocabulary_start=0,
               vocabulary_end=40000,
               save=False,
               load_tokenizer=None,
               load_weights=None,
               save_tokenizer=None,
               save_weights=None,
               optimizer=None,
               inference_only=False,
               return_model_and_vectorizer=False,
               return_model_and_vectorizer_and_output=False,
               GPT_attention=False,
               TPU=False):
    # chunk_size is given in KB; convert to bytes
    if chunk_size:
        chunk_size *= 1024

    if not inference_only:
        with open(data_path, 'r', encoding='utf-8') as file:
            corpus = file.read()
    if load_tokenizer:
        with open(load_tokenizer, 'r') as f:
            encoded_vocabulary = json.load(f)
        # Decode the encoded vocabulary back to the original strings
        vocabulary = [word.encode('utf-8').decode('unicode_escape') for word in encoded_vocabulary]
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace')
        vectorizer.set_vocabulary(vocabulary)
        vocab_size = vectorizer.vocabulary_size()
    else:
        # Build the vocabulary from a slice of the corpus
        vocab = list(set(corpus.split()[vocabulary_start : vocabulary_end]))
        vocab_size = len(vocab)
        vectorizer = tf.keras.layers.TextVectorization(standardize=None, split='whitespace', vocabulary=vocab)
        print('New Vectorizer created successfully...')
        print("Vocabulary Size: " + str(vocab_size))

    if not inference_only:
        del corpus  # the corpus is only needed for vocabulary construction
    if load_weights:
        # vocabulary_size() counts the two special tokens ('' and '[UNK]'),
        # so subtract them to rebuild the model with the dimensions it had at
        # save time (assumes the tokenizer was loaded alongside the weights)
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size - 2, GPT_attention=GPT_attention)
        with open(load_weights, 'rb') as file:
            W = pickle.load(file)
        model.set_weights(W)
    else:
        model = get_model(gpt_input=gpt_input, d_model=d_model, h=h, decoder_stacks=decoder_stacks, vocab_size=vocab_size, GPT_attention=GPT_attention)

    print(model.summary())
    if not inference_only:
        # Compile the model
        if not optimizer:
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy')
        else:
            model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

        # Train the model, one corpus chunk at a time
        if learning_rate > 0:
            for _ in tqdm(range(epochs)):
                with open(data_path, 'r', encoding='utf-8') as f:
                    chunk_number = 1
                    for chunk in read_file(f,
                                           vectorizer,
                                           chunk_size,
                                           starting_chunk,
                                           ending_chunk,
                                           gpt_input):
                        # The first gpt_input columns are inputs; the last is the target
                        print('Training examples in chunk: ' + str(chunk.shape[0]))
                        model.fit(chunk[:, :gpt_input], tf.reshape(chunk[:, -1], (-1, 1)), batch_size=batch_size, epochs=1)
                        print("Chunk Number " + str(chunk_number) + "/" + str(ending_chunk - starting_chunk + 1) + " processed!")
                        chunk_number += 1
    # Generate output text from the trained/loaded model
    output_seq = generate_output(gpt_input=gpt_input, model=model, vectorizer=vectorizer, text_size=output_length, input_sequence=[])

    if save and not TPU:
        print('Saving model and tokenizer...')
        save_module(save_weights, model, vectorizer, save_tokenizer)
    if save and TPU:
        # In TPU mode, saving is deferred: hand the artifacts back so the
        # TPU branch under __main__ can save them outside the strategy scope
        return save_weights, model, vectorizer, save_tokenizer, output_seq
    if return_model_and_vectorizer:
        return model, vectorizer
    elif return_model_and_vectorizer_and_output:
        return model, vectorizer, output_seq.replace('@@ ', '')
    else:
        return output_seq.replace('@@ ', '')
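# Illustrative programmatic call (hypothetical file names, not from the
# original script): train for one epoch on a local corpus and get the fitted
# model and its vectorizer back instead of generated text.
#   model, vectorizer = MinimalGPT(data_path='corpus.txt', learning_rate=1e-4,
#                                  output_length=50, epochs=1, gpt_input=10,
#                                  d_model=128, h=8, decoder_stacks=1,
#                                  starting_chunk=0, ending_chunk=5,
#                                  chunk_size=10,
#                                  return_model_and_vectorizer=True)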

# Command-line entry point: runs when the script file is called directly
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data-path', help='File: corpus or training text [String]')
    parser.add_argument('-l', '--learning-rate', help='Float: learning rate. The model trains ONLY IF the rate is > 0 and skips training otherwise [Float]', type=float)
    parser.add_argument('-ol', '--output-length', help='Length of the output sequence to be generated [Int]', type=int)
    parser.add_argument('-e', '--epochs', help='Number of training epochs [Int]', type=int)
    parser.add_argument('-b', '--batch-size', help='Size of each batch [Int]', type=int)
    parser.add_argument('-s', '--gpt-input', help='Number of tokens of text the model takes as input at a time [Int]', type=int)
    parser.add_argument('-dm', '--d-model', help='Embedding layer output dimensions [Int]', type=int)
    parser.add_argument('-p', '--multi-head', help='Number of parallel multi-head attention heads [Int]', type=int)
    parser.add_argument('-ds', '--decoder-stacks', help='Number of stacked decoder layers [Int]', type=int)
    parser.add_argument('-sc', '--chunk-start', help='Chunk number in the corpus that marks the starting point of training [Int]', type=int)
    parser.add_argument('-ec', '--chunk-end', help='Chunk number in the corpus that marks the end point of training [Int]', type=int)
    parser.add_argument('-csz', '--chunk-size', help='Size of each chunk in KB [Int]', type=int)
    parser.add_argument('-vs', '--vocabulary-start', help='Token number in the corpus that marks the starting point of the vocabulary data [Int]', type=int)
    parser.add_argument('-ve', '--vocabulary-end', help='Token number in the corpus that marks the end point of the vocabulary data [Int]', type=int)
    parser.add_argument('-sd', '--save', help='Save the model and vectorizer data to disk [True/False]', action='store_true')
    parser.add_argument('-lt', '--load-tokenizer', help='File: vectorization layer vocabulary [File]')
    parser.add_argument('-lw', '--load-weights', help='File: model weights [File]')
    parser.add_argument('-st', '--save-tokenizer', help='File: where to save the vectorizer vocabulary [File]')
    parser.add_argument('-sw', '--save-weights', help='File: where to save the model weights [File]')
    parser.add_argument('-ot', '--optimizer', help='Optimizer consistent with the TensorFlow optimizer class [tf.keras.optimizers]')
    parser.add_argument('-i', '--inference-only', help='Only print the output of the model in inference mode [True/False]', action='store_true')
    parser.add_argument('-mv', '--model-vectorizer', help='Return a (model, vectorizer) tuple [True/False]', action='store_true')
    parser.add_argument('-mvo', '--model-vectorizer-output', help='Return a (model, vectorizer, output) tuple [True/False]', action='store_true')
    parser.add_argument('-ga', '--gpt-style-attention', help='Use GPT-style attention. Note: (d-model) must be divisible by (multi-head), otherwise the program will throw an error! [True/False]', action='store_true')
    parser.add_argument('-tpu', '--TPU', help='Use Tensor Processing Units (distributed learning)', action='store_true')

    args = parser.parse_args()
    data_path = args.data_path
    learning_rate = args.learning_rate
    output_length = args.output_length
    epochs = args.epochs
    batch_size = args.batch_size
    gpt_input = args.gpt_input
    d_model = args.d_model
    h = args.multi_head
    stacks = args.decoder_stacks
    chunk_start = args.chunk_start
    chunk_end = args.chunk_end
    chunk_size = args.chunk_size
    vocabulary_start = args.vocabulary_start
    vocabulary_end = args.vocabulary_end
    save = args.save
    load_tokenizer = args.load_tokenizer
    load_weights = args.load_weights
    save_tokenizer = args.save_tokenizer
    save_weights = args.save_weights
    optimizer = args.optimizer
    inference_only = args.inference_only
    model_and_vectorizer = args.model_vectorizer
    GPT_attention = args.gpt_style_attention
    model_vectorizer_output = args.model_vectorizer_output
    # Record the run configuration
    configuration = {
        'data_path': args.data_path,
        'learning_rate': args.learning_rate,
        'output_length': args.output_length,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'gpt_input': args.gpt_input,
        'd_model': args.d_model,
        'h': args.multi_head,
        'stacks': args.decoder_stacks,
        'chunk_start': args.chunk_start,
        'chunk_end': args.chunk_end,
        'chunk_size': args.chunk_size,
        'vocabulary_start': args.vocabulary_start,
        'vocabulary_end': args.vocabulary_end,
        'save': args.save,
        'load_tokenizer': args.load_tokenizer,
        'load_weights': args.load_weights,
        'save_tokenizer': args.save_tokenizer,
        'save_weights': args.save_weights,
        'optimizer': args.optimizer,
        'inference_only': args.inference_only,
        'model_and_vectorizer': args.model_vectorizer,
        'model_vectorizer_output': args.model_vectorizer_output,
        'GPT_Attention': args.gpt_style_attention
    }

    # Save the configuration to a JSON file
    with open('last-configuration.json', 'w') as file:
        json.dump(configuration, file)
    if args.TPU:
        # TPU initialization code; this has to run before other TF operations
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        print("All devices: ", tf.config.list_logical_devices('TPU'))

        strategy = tf.distribute.TPUStrategy(resolver)
        with strategy.scope():
            output = MinimalGPT(data_path=data_path,
                                learning_rate=learning_rate,
                                output_length=output_length,
                                epochs=epochs,
                                batch_size=batch_size,
                                gpt_input=gpt_input,
                                d_model=d_model,
                                h=h,
                                decoder_stacks=stacks,
                                starting_chunk=chunk_start,
                                ending_chunk=chunk_end,
                                chunk_size=chunk_size,
                                vocabulary_start=vocabulary_start,
                                vocabulary_end=vocabulary_end,
                                save=save,
                                load_tokenizer=load_tokenizer,
                                load_weights=load_weights,
                                save_tokenizer=save_tokenizer,
                                save_weights=save_weights,
                                optimizer=optimizer,
                                inference_only=inference_only,
                                return_model_and_vectorizer=model_and_vectorizer,
                                return_model_and_vectorizer_and_output=model_vectorizer_output,
                                GPT_attention=GPT_attention,
                                TPU=True)

        # In TPU mode MinimalGPT returns (save_weights, model, vectorizer,
        # save_tokenizer, output_seq); save outside the strategy scope,
        # print the generated text, and exit
        save_module(output[0], output[1], output[2], output[3])
        print(output[4])
        sys.exit(0)
    output = MinimalGPT(data_path=data_path,
                        learning_rate=learning_rate,
                        output_length=output_length,
                        epochs=epochs,
                        batch_size=batch_size,
                        gpt_input=gpt_input,
                        d_model=d_model,
                        h=h,
                        decoder_stacks=stacks,
                        starting_chunk=chunk_start,
                        ending_chunk=chunk_end,
                        chunk_size=chunk_size,
                        vocabulary_start=vocabulary_start,
                        vocabulary_end=vocabulary_end,
                        save=save,
                        load_tokenizer=load_tokenizer,
                        load_weights=load_weights,
                        save_tokenizer=save_tokenizer,
                        save_weights=save_weights,
                        optimizer=optimizer,
                        inference_only=inference_only,
                        return_model_and_vectorizer=model_and_vectorizer,
                        return_model_and_vectorizer_and_output=model_vectorizer_output,
                        GPT_attention=GPT_attention,
                        TPU=False)
    print(output)
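# Example invocation (illustrative flag values and file names, including the
# script name, not taken from the original source):
#   python MinimalGPT.py -d corpus.txt -l 0.0001 -ol 50 -e 1 -b 32 -s 10 \
#       -dm 128 -p 8 -ds 1 -sc 0 -ec 5 -csz 10 -vs 0 -ve 40000 \
#       -sd -st tokenizer.json -sw weights.pkl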