In [1]:
import os
import random

#os.environ['CUDA_VISIBLE_DEVICES'] = '-1' #disble gpu

def get_text_data(file_name="cleaned_assamese_text.txt"):
    """Load the comma-separated corpus and return its non-empty entries.

    Args:
        file_name: Path of the text file to read. Defaults to the corpus file
            this notebook was written against.

    Returns:
        A list of strings obtained by splitting the file content on ','.
        Empty strings (e.g. from consecutive or trailing commas) are dropped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The corpus is non-ASCII text, so decode explicitly as UTF-8 instead of
    # relying on the platform default; `with` closes the handle even if
    # read() raises.
    with open(file_name, 'r', encoding='utf-8') as corpus_file:
        raw_entries = corpus_file.read().split(',')
    return [entry for entry in raw_entries if entry]

# Seed Python's RNG so the shuffle below (and therefore the train/test/valid
# membership) is the same on every Restart & Run All.
random.seed(42)

sentences=get_text_data()
random.shuffle(sentences)
no_of_sentences=len(sentences)

# 70% train / 15% test / 15% validation. Each boundary is computed once so the
# three slices are guaranteed to be contiguous and non-overlapping.
train_end=int(0.7*no_of_sentences)
test_end=int(0.85*no_of_sentences)
text_train=sentences[:train_end]
text_test=sentences[train_end:test_end]
text_valid=sentences[test_end:]

# maxlen is a fixed cap on the number of tokens per model input, not a value
# measured from the corpus, so it is labelled as such when printed.
maxlen=10
print("max tokens per sequence (maxlen): ",maxlen)
print("no_of_sentences: ",no_of_sentences)

length of the longest sentence: 10
no_of_sentences: 127946


In [2]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

def custom_standardization(input_string):
    """Standardize raw text for the vectorizer by lowercasing it.

    Only `tf.strings.lower` is applied; punctuation and newlines are passed
    through unchanged.

    Args:
        input_string: String tensor handed over by the TextVectorization layer.

    Returns:
        The lowercased string tensor.
    """
    return tf.strings.lower(input_string)

# Tokenizer layer: turns each input string into a fixed-length sequence of
# integer token ids, using custom_standardization instead of the default
# standardization.
vectorize_layer = TextVectorization(
 standardize = custom_standardization,
 output_mode="int",
 # One position more than maxlen: preprocess_text slices this into an input
 # ([:-1]) and a target shifted by one ([1:]), each maxlen long.
 output_sequence_length=maxlen + 1,
)

# NOTE(review): the vocabulary is built from the full corpus (`sentences`),
# which includes the test and validation splits as well as the training split.
vectorize_layer.adapt(sentences)
vocab = vectorize_layer.get_vocabulary()

vocab_size = len(vocab)
print(vocab_size) # vocabulary size; sizes both the embedding table and the output Dense layer
# Smoke test: vectorize one sample phrase and display the resulting ids.
vectorize_layer(['এক অনন্য মাত্ৰা প্ৰদান কৰাৰ'])

2023-02-28 23:36:00.068548: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 23:36:01.115879: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.0/lib64:/usr/local/cuda-11.7/lib64::/home/yuvrajtalukdar/miniconda3/envs/miniproject/lib/
2023-02-28 23:36:01.116220: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.0/l

303475




In [3]:
# Reverse mapping from integer token id to the vocabulary string at that position.
index_lookup = dict(enumerate(vocab))

In [4]:
batch_size = 10 #64


def _shuffled_batches(texts):
    """Wrap a list of strings in a shuffled, batched tf.data pipeline.

    Args:
        texts: List of raw sentences for one split.

    Returns:
        A tf.data.Dataset yielding batches of `batch_size` strings, shuffled
        with a 256-element buffer.
    """
    return (
        tf.data.Dataset.from_tensor_slices(texts)
        .shuffle(buffer_size=256)
        .batch(batch_size)
    )


# The same pipeline is applied to each of the three splits.
train_dataset = _shuffled_batches(text_train)
test_dataset = _shuffled_batches(text_test)
valid_dataset = _shuffled_batches(text_valid)

In [5]:
def preprocess_text(text):
    """Turn a batch of raw strings into (input, target) token-id pairs.

    The batch is given a trailing axis, vectorized, and then split into two
    views of the same id matrix offset by one position, so that each input
    position is paired with the token that follows it.

    Args:
        text: Batch of strings from the dataset.

    Returns:
        Tuple `(x, y)`: `x` is every column except the last, `y` is every
        column except the first.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    model_inputs, next_tokens = token_ids[:, :-1], token_ids[:, 1:]
    return model_inputs, next_tokens


# Tokenize every split with preprocess_text and let tf.data prefetch batches;
# the three datasets are rebound in the same order they are listed.
train_dataset, test_dataset, valid_dataset = (
    split.map(preprocess_text).prefetch(tf.data.AUTOTUNE)
    for split in (train_dataset, test_dataset, valid_dataset)
)

In [6]:
# Peek at a single (inputs, targets) batch to sanity-check the pipeline output.
first_batch_only = train_dataset.take(1)
for batch in first_batch_only:
    print(batch)

(, )


In [7]:
import keras_nlp
from tensorflow import keras

# Width of the token/position embeddings; the decoder blocks use embed_dim*2
# as their intermediate_dim.
embed_dim = 128
# Number of attention heads passed to every TransformerDecoder layer.
num_heads = 4

def create_model2(no_of_decoder=1):
    """Build and compile the decoder-only next-token language model.

    Architecture: token + position embedding, a stack of `no_of_decoder`
    TransformerDecoder blocks, dropout, and a Dense layer with softmax over
    the vocabulary at every position.

    Args:
        no_of_decoder: Number of TransformerDecoder blocks to stack. The stack
            depth follows this argument rather than a fixed count.

    Returns:
        A compiled keras.Model mapping `(batch, maxlen)` int32 token ids to
        `(batch, maxlen, vocab_size)` probabilities.
    """
    inputs = keras.layers.Input(shape=(maxlen,), dtype=tf.int32)
    x = keras_nlp.layers.TokenAndPositionEmbedding(vocab_size, maxlen, embed_dim)(inputs)
    for _ in range(no_of_decoder):
        x = keras_nlp.layers.TransformerDecoder(
            intermediate_dim=embed_dim*2,
            num_heads=num_heads,
            dropout=0.5,
        )(x)
    do = keras.layers.Dropout(0.4)(x)
    # The output layer already applies softmax, so the loss and metrics below
    # receive probabilities rather than raw logits.
    outputs = keras.layers.Dense(vocab_size, activation='softmax')(do)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy']
    )
    return model

In [8]:
import numpy as np

class TextSampler(keras.callbacks.Callback):
    """Keras callback that prints a sampled continuation of a prompt after each epoch.

    Args:
        start_prompt: Text the generated sample starts from.
        max_tokens: The generation loop runs `max_tokens - 1` times, appending
            one sampled token per iteration.
    """

    def __init__(self, start_prompt, max_tokens):
        # Let the base Callback initialise its own attributes first.
        super().__init__()
        self.start_prompt = start_prompt
        self.max_tokens = max_tokens

    def sample_token(self, logits):
        """Draw one token id from the five most probable entries.

        Args:
            logits: 1-D array of per-token scores for one position. Despite
                the name, the model in this notebook ends in a softmax layer,
                so these are non-negative probabilities.

        Returns:
            The sampled token id as a Python int.
        """
        probs = np.asarray(logits, dtype="float64")
        k = min(5, probs.shape[-1])
        # Indices of the k largest entries (order inside the top-k is irrelevant
        # for sampling).
        top_indices = np.argpartition(probs, -k)[-k:]
        top_probs = probs[top_indices]
        total = top_probs.sum()
        if total > 0:
            # Renormalise instead of applying softmax a second time: a softmax
            # over values in [0, 1] would flatten them to a near-uniform choice.
            top_probs = top_probs / total
        else:
            top_probs = np.full(k, 1.0 / k)
        return int(np.random.choice(top_indices, p=top_probs))

    def on_epoch_end(self, epoch, logs=None):
        """Generate text from the start prompt and print it."""
        decoded_sample = self.start_prompt

        for _ in range(self.max_tokens-1):
            # The model only sees maxlen positions, so condition on the most
            # recent maxlen words. Without this window the position index below
            # runs past the prediction array once the text exceeds maxlen words.
            context_words = decoded_sample.split()[-maxlen:]
            tokenized_prompt = vectorize_layer([" ".join(context_words)])[:, :-1]
            predictions = self.model.predict([tokenized_prompt], verbose=0)
            # The prediction at the position of the last context word is the
            # distribution over the next word (position 0 for an empty prompt).
            sample_index = max(len(context_words)-1, 0)

            sampled_token = self.sample_token(predictions[0][sample_index])
            sampled_token = index_lookup[sampled_token]
            decoded_sample += " " + sampled_token

        print(f"\nSample text:\n{decoded_sample}...\n")

# First 4 space-separated words of a random validation sentence (newlines
# replaced by spaces) to be used as a seed
random_sentence = ' '.join(random.choice(text_valid).replace('\n', ' ').split(' ')[:4])
# Sampling callback seeded with that prompt, with max_tokens=30.
sampler = TextSampler(random_sentence, 30)
# Learning-rate callback that monitors val_loss with a patience of 10.
reducelr = keras.callbacks.ReduceLROnPlateau(patience=10, monitor='val_loss')

In [9]:
# Build the model, show its layer/parameter table, then train it while
# validating on the held-out validation split.
model = create_model2(no_of_decoder=4)
model.summary()
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=150,
    callbacks=[sampler, reducelr],
)

Model: "model"
_________________________________________________________________
 Layer (type) Output Shape Param # 
 input_1 (InputLayer) [(None, 10)] 0 
 
 token_and_position_embeddin (None, 10, 128) 38846080 
 g (TokenAndPositionEmbeddin 
 g) 
 
 transformer_decoder (Transf (None, 10, 128) 132480 
 ormerDecoder) 
 
 transformer_decoder_1 (Tran (None, 10, 128) 132480 
 sformerDecoder) 
 
 transformer_decoder_2 (Tran (None, 10, 128) 132480 
 sformerDecoder) 
 
 transformer_decoder_3 (Tran (None, 10, 128) 132480 
 sformerDecoder) 
 
 dropout (Dropout) (None, 10, 128) 0 
 
 dense (Dense) (None, 10, 303475) 39148275 
 
Total params: 78,524,275
Trainable params: 78,524,275
Non-trainable params: 0
_________________________________________________________________
Epoch 1/150


2023-02-28 23:36:23.887413: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-02-28 23:36:24.308423: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7ff6d67579b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-02-28 23:36:24.308518: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2023-02-28 23:36:24.328912: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-02-28 23:36:24.549826: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.




KeyboardInterrupt: 

In [None]:
def sample_token(logits, k=5):
    """Draw one token id from the `k` most probable entries.

    Args:
        logits: 1-D array of per-token scores for one position. Despite the
            name, the model in this notebook ends in a softmax layer, so these
            are non-negative probabilities.
        k: How many of the highest-scoring tokens to sample from (default 5).

    Returns:
        The sampled token id as a Python int.
    """
    probs = np.asarray(logits, dtype="float64")
    k = max(1, min(k, probs.shape[-1]))
    # Indices of the k largest entries (order inside the top-k is irrelevant
    # for sampling).
    top_indices = np.argpartition(probs, -k)[-k:]
    top_probs = probs[top_indices]
    total = top_probs.sum()
    if total > 0:
        # Renormalise instead of applying softmax a second time: a softmax over
        # values in [0, 1] would flatten them to a near-uniform choice.
        top_probs = top_probs / total
    else:
        top_probs = np.full(k, 1.0 / k)
    return int(np.random.choice(top_indices, p=top_probs))

def generate_text(prompt, response_length=50):
    """Extend `prompt` by repeatedly sampling the next token from the model.

    Args:
        prompt: Starting text.
        response_length: The loop runs `response_length - 1` times, appending
            one sampled token per iteration.

    Returns:
        The prompt followed by the sampled tokens, separated by spaces.
    """
    decoded_sample = prompt
    for _ in range(response_length-1):
        # The model only sees maxlen positions, so condition on the most recent
        # maxlen words. Without this window the position index below runs past
        # the prediction array once the text exceeds maxlen words.
        context_words = decoded_sample.split()[-maxlen:]
        tokenized_prompt = vectorize_layer([" ".join(context_words)])[:, :-1]
        predictions = model.predict([tokenized_prompt], verbose=0)
        # The prediction at the position of the last context word is the
        # distribution over the next word (position 0 for an empty prompt).
        sample_index = max(len(context_words)-1, 0)

        sampled_token = sample_token(predictions[0][sample_index])
        sampled_token = index_lookup[sampled_token]
        decoded_sample += " " + sampled_token
    return decoded_sample

In [None]:
import pickle
# Persist the trained model in Keras HDF5 format.
# NOTE(review): reloading this file may require registering the keras_nlp
# layers as custom objects — confirm before relying on it.
model.save("pd_plaintext_transformer.h5")
# Also keep a pickled copy. The context manager makes sure the file handle is
# flushed and closed even if pickling fails. Only unpickle this file from a
# trusted location, since loading a pickle can execute arbitrary code.
with open('pd_plaintext_transformer.pkl', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [None]:
# Generate a continuation for a sample Assamese prompt; the returned string is
# shown as the cell output.
generate_text('য়ুৰিৰ দাদাক আৰু ',response_length=50)