Delete README.ipynb
README.ipynb (+0 -241)
DELETED
@@ -1,241 +0,0 @@
---
license: apache-2.0
---
import tensorflow as tf
from string import punctuation
import numpy as np
import os
import time
import pickle

model_path = '/content/drive/MyDrive/Colab Notebooks'

# create a directory to store the pickled files in
if not os.path.exists('/content/drive/MyDrive/Colab Notebooks/pkl'):
    os.mkdir('/content/drive/MyDrive/Colab Notebooks/pkl')

# ----------------------------------------------------------------------

### LIMITING GPU MEMORY GROWTH ###

# get the list of visible GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:  # if any GPUs are detected
    try:  # try setting memory growth to True for all GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)  # enable memory growth
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print('\n', len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
        # memory growth must be set before the GPUs have been initialized
        print('\n', e)

# ----------------------------------------------------------------------
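
# Aside (not part of the original script): TensorFlow 2.1+ also exposes the
# device-listing helpers outside the experimental namespace, while
# set_memory_growth itself still lives under tf.config.experimental:
gpus_stable = tf.config.list_physical_devices('GPU')  # same list as above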

### READ IN AND CLEAN THE LYRICS DATA ###

# ******TAKE IN USER INPUT FOR LYRICS (ARTIST NAME? FILE NAME?)******

# read in the lyrics text file
with open('/content/drake.txt', 'r') as f:
    text = f.read()
# artist_name = input('\nPlease ')

# make all letters lowercase and turn each line break into its own "word"
words = text.lower().replace('\n', ' \n ')

# remove punctuation
for punc in punctuation:
    words = words.replace(punc, '')

# split the entire string into a Python list of words
words = words.split(' ')

# obtain the list of unique words across all lyrics
vocab = sorted(set(words))
print(f'\nThere are {len(vocab)} unique words in the lyrics file.')

# pickle the vocab list - the generation script will need it
outfile = open(file='/content/drive/MyDrive/Colab Notebooks/pkl/vocab', mode='wb')
pickle.dump(vocab, outfile)
outfile.close()

# ----------------------------------------------------------------------
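
# A quick illustration (not part of the original script) of what the
# cleaning steps above do to a sample lyric line:
demo = "Started from the bottom,\nNow we're here".lower().replace('\n', ' \n ')
for punc in punctuation:
    demo = demo.replace(punc, '')
print(demo.split(' '))  # ['started', 'from', 'the', 'bottom', '\n', 'now', 'were', 'here']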

### WORD MAPPING ###

# map unique words to indices
word2idx = {u: i for i, u in enumerate(vocab)}

# pickle this since it is needed for text generation
outfile = open(file='/content/drive/MyDrive/Colab Notebooks/pkl/word2idx', mode='wb')
pickle.dump(word2idx, outfile)
outfile.close()

# reverse the map - use this to turn an index back into a word
idx2word = np.array(vocab)

# pickle this since it is needed for text generation
outfile = open(file='/content/drive/MyDrive/Colab Notebooks/pkl/idx2word', mode='wb')
pickle.dump(idx2word, outfile)
outfile.close()

# the entire text document represented in the above word-to-index mapping
words_as_int = np.array([word2idx[w] for w in words])

# ----------------------------------------------------------------------
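
# For reference, a minimal sketch (illustrative names, not part of the
# original script) of how the generation script can reload a pickled object:
infile = open(file='/content/drive/MyDrive/Colab Notebooks/pkl/word2idx', mode='rb')
word2idx_restored = pickle.load(infile)
infile.close()
assert word2idx_restored == word2idx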

### CREATING TRAINING EXAMPLES & TARGETS ###

# ******TAKE IN USER INPUT FOR SEQUENCE LENGTH?******

# max sentence length (in number of words) desired for training
seq_length = 100
# seq_length = input('\nPlease enter a desired sequence length (in number of words) to train the model on: ')
examples_per_epoch = len(words) // (seq_length + 1)

# create training examples/targets
word_dataset = tf.data.Dataset.from_tensor_slices(words_as_int)

# data type of the training examples/targets
print('\n', type(word_dataset))

# create sequence batches from the word_dataset
sequences = word_dataset.batch(seq_length + 1, drop_remainder=True)
print('\n', type(sequences))

# define the shifting (splitting) function
def split_input_target(chunk):
    input_text = chunk[:-1]  # everything up to, but not including, the last word
    target_text = chunk[1:]  # everything except the first word
    return input_text, target_text

# apply the shift to create the input/target texts that make up our dataset
dataset = sequences.map(split_input_target)

# ----------------------------------------------------------------------
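
# Sanity-check sketch (not part of the original script): on a toy sequence,
# the shift pairs each word with the word that follows it:
demo_input, demo_target = split_input_target(np.arange(5))
print(demo_input, demo_target)  # [0 1 2 3] [1 2 3 4]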

### CREATE TRAINING BATCHES ###

# batch size
BATCH_SIZE = 64

# buffer size used to shuffle the dataset
# (tf.data is designed to work with possibly infinite sequences, so it
# doesn't attempt to shuffle the entire sequence in memory; instead, it
# maintains a buffer in which it shuffles elements)
BUFFER_SIZE = 10000

# create a dataset that has been shuffled and batched
dataset_sb = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# display the batched dataset's shapes and data types
print('\n', dataset_sb)

# ----------------------------------------------------------------------
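
# Illustrative check (not part of the original script): each element of the
# shuffled, batched dataset is an (input, target) pair of shape
# (BATCH_SIZE, seq_length):
for x_demo, y_demo in dataset_sb.take(1):
    print(x_demo.shape, y_demo.shape)  # (64, 100) (64, 100)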

### BUILDING THE RNN ###

# vocabulary length (number of unique words in the dataset)
vocab_size = len(vocab)

# embedding dimension
embedding_dim = 256

# number of RNN units
rnn_units = 1024

# pickle the model parameters - the generation script will need them
model_params = [vocab_size, embedding_dim, rnn_units]
outfile = open(file='/content/drive/MyDrive/Colab Notebooks/pkl/model_params', mode='wb')
pickle.dump(model_params, outfile)
outfile.close()

# helper function to quickly build the RNN model based on vocab size,
# embedding dimension, number of RNN units, and batch size
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):

    # initialize sequential model architecture
    model = tf.keras.Sequential()

    # add the embedding layer
    model.add(tf.keras.layers.Embedding(
        input_dim = vocab_size,
        output_dim = embedding_dim,
        batch_input_shape = [batch_size, None]
    ))

    # add the recurrent layer
    model.add(tf.keras.layers.GRU(
        units = rnn_units,
        return_sequences = True,
        stateful = True,
        recurrent_initializer = 'glorot_uniform'
    ))

    # add the dense (output) layer
    model.add(tf.keras.layers.Dense(units=vocab_size))

    return model

# helper function to save the model weights to model_path
def save_model(model, model_path):
    model.save_weights(model_path)
    print(f'Model saved to {model_path}')

# build the model using the above helper function
rnn = build_model(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    rnn_units = rnn_units,
    batch_size = BATCH_SIZE
)

# check the shape of the output
for input_example_batch, target_example_batch in dataset_sb.take(1):
    example_batch_predictions = rnn(input_example_batch)
    print('\n', example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')

# model architecture summary
rnn.summary()

# ----------------------------------------------------------------------
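
# The dense layer returns unnormalized logits (one per vocabulary word), so
# drawing from them samples a "next word" at every timestep; an illustrative
# sketch (not part of the original script) using the example batch above:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
print(idx2word[sampled_indices][:10])  # ten words sampled from the untrained model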

### SET UP METRICS ###

# helper function defining the loss
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# compile the model
rnn.compile(
    optimizer = 'adam',
    loss = loss,
    metrics = ['accuracy']
)

# directory where the checkpoints will be saved
checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/training_checkpoints'

# name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'checkpoint')

# create the checkpoint-saving callback
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    monitor = 'loss',
    save_best_only = True,
    mode = 'min',
    save_weights_only = True
)

# ----------------------------------------------------------------------
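
# Illustrative check of the loss helper on the example batch from above (not
# part of the original script); before training, the mean loss should sit
# near log(vocab_size), i.e. close to a uniform guess over the vocabulary:
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print(example_batch_loss.numpy().mean())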

### MODEL TRAINING ###

# set the number of desired epochs
EPOCHS = 200

# training!
history = rnn.fit(
    x = dataset_sb,
    epochs = EPOCHS,
    callbacks = [checkpoint_callback]
)
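
# Once training completes, a generation script can rebuild the network with
# batch_size=1 and restore the best checkpoint saved above; a minimal sketch
# (illustrative only, with hypothetical seed words assumed to exist in vocab):
gen_model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
gen_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
gen_model.build(tf.TensorShape([1, None]))

# sampling loop: feed each sampled word back in as the next input
seed = ['started', 'from']  # hypothetical seed words
input_ids = tf.expand_dims([word2idx[w] for w in seed], 0)
generated = []
gen_model.reset_states()
for _ in range(20):  # generate 20 words
    logits = gen_model(input_ids)[:, -1, :]  # logits for the next word only
    next_id = tf.random.categorical(logits, num_samples=1)[0, 0].numpy()
    generated.append(idx2word[next_id])
    input_ids = tf.expand_dims([next_id], 0)
print(' '.join(generated))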