#!/Users/pranab/Tools/anaconda/bin/python

# avenir-python: Machine Learning
# Author: Pranab Ghosh
# 
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0 
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib


if len(sys.argv) != 7:
	print "usage: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count> <learning_rate> <training_mode> "
	sys.exit()
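# Example invocation (illustrative only; script name and argument values are hypothetical):
#   python nn_moons.py 3 300 0.2 1000 0.01 batch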
	
# number of hidden units
nn_hdim = int(sys.argv[1])

# data set size
dsize = int(sys.argv[2])

# noise in training data
noise_level = float(sys.argv[3])

# iteration count
it_count = int(sys.argv[4])

# learning rate
epsilon = float(sys.argv[5])

#training mode
training_mode = sys.argv[6]

# validation
use_validation_data = True

# Generate a dataset
#noise_level = 0.20
#noise_level = 0.01
# indices of the slice of the generated data held out for validation
vlo = 100
vup = vlo + dsize / 5
vsize = vup - vlo
print "validation data size %d" %(vsize)
np.random.seed(0)
XC, yc = sklearn.datasets.make_moons(dsize, noise=noise_level)
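# make_moons returns a feature array XC of shape (dsize, 2) and a binary label
# array yc of shape (dsize,), describing two interleaving half circles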

print "complete data set generated"
def print_array(X,y):
	print X
	print y
	

# Generate a validation dataset
#np.random.seed(0)
#XV, yv = sklearn.datasets.make_moons(40, noise=0.20)
#print "validation data set generated"

XV = XC[vlo:vup:1]
yv = yc[vlo:vup:1]
print "validation data generated"
#print_array(XV, yv)

X = np.delete(XC, np.s_[vlo:vup:1], 0)
y = np.delete(yc, np.s_[vlo:vup:1], 0)
print "training data generated"
#print_array(X, y)
print X
print y

	
# Parameters
num_examples = len(X) # training set size
nn_input_dim = 2 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality

#training data indices
tr_data_indices = np.arange(num_examples)
#print tr_data_indices

# Gradient descent parameters (I picked these by hand)
#epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01 # regularization strength 

		
# Helper function to evaluate the total loss on the dataset
def calculate_loss(X,y,model):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    size = len(X)
    
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
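    # probs holds the softmax of the output scores; the cross entropy loss
    # below is L = -(1/N) * sum_i log p(y_i | x_i), plus L2 regularization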
    
    # Calculating the cross entropy loss
    correct_logprobs = -np.log(probs[range(size), y])
    data_loss = np.sum(correct_logprobs)

    # Add regularization term to loss (optional)
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./size * data_loss
    
    
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    
    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
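# Example (illustrative): after training, validation accuracy could be
# computed with this helper, e.g.
#   yv_pred = predict(model, XV)
#   print "validation accuracy %.3f" %(np.mean(yv_pred == yv))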

# This function learns parameters for the neural network in batch mode and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - validation_interval: Compute and print the loss every validation_interval passes
def build_model_batch(nn_hdim, num_passes=10000, validation_interval=50):    
    # Initialize the parameters to random values. We need to learn these.
	np.random.seed(0)
	W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
	b1 = np.zeros((1, nn_hdim))
	W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
	b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
	model = {}
    
    # Gradient descent. For each batch...
	loss = -1.0
	for i in xrange(0, num_passes):
		#print "pass %d" %(i)
		
		# Forward propagation
		z1 = X.dot(W1) + b1
		a1 = np.tanh(z1)
		z2 = a1.dot(W2) + b2
		exp_scores = np.exp(z2)
		probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Back propagation
		delta3 = probs
		delta3[range(num_examples), y] -= 1
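		# probs minus the one hot labels is the gradient of the (unnormalized)
		# cross entropy loss with respect to the output scores z2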
		dW2 = (a1.T).dot(delta3)
		db2 = np.sum(delta3, axis=0, keepdims=True)
		delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
		dW1 = np.dot(X.T, delta2)
		db1 = np.sum(delta2, axis=0)

        # Add regularization terms (b1 and b2 don't have regularization terms)
		dW2 += reg_lambda * W2
		dW1 += reg_lambda * W1

        # Gradient descent parameter update
		W1 += -epsilon * dW1
		b1 += -epsilon * db1
		W2 += -epsilon * dW2
		b2 += -epsilon * db2
        
        # Assign new parameters to the model
		model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
        
        # Periodically evaluate the loss (on the held out validation set if enabled); this is relatively expensive, so it is not done every pass.
		if i % validation_interval == 0:
			if use_validation_data:
				cur_loss = calculate_loss(XV,yv,model)
			else:
				cur_loss = calculate_loss(X,y,model)
				
			print "Loss after iteration %i: %.8f" %(i, cur_loss)
			loss = cur_loss
    
		
	return model
    
    
# This function learns parameters for the neural network in incremental (per example) mode and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - validation_interval: Compute and print the loss every validation_interval passes
def build_model_incr(nn_hdim, num_passes=10000, validation_interval=50):    
    # Initialize the parameters to random values. We need to learn these.
	np.random.seed(0)
	W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
	b1 = np.zeros((1, nn_hdim))
	W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
	b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
	model = {}
    
    # Gradient descent. Each pass processes the training examples one at a time in shuffled order...
	loss = -1.0
	for i in xrange(0, num_passes):
		#print "pass %d" %(i)
		
		#shuffle training data indices
		np.random.shuffle(tr_data_indices)
		
		# all training data
		for j in tr_data_indices:
			Xi = X[j].reshape(1,2)
			yi = y[j].reshape(1)
			
			# Forward propagation
			z1 = Xi.dot(W1) + b1
			a1 = np.tanh(z1)
			z2 = a1.dot(W2) + b2
			exp_scores = np.exp(z2)
			probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
			
        	# Back propagation
			delta3 = probs
			delta3[0,yi] -= 1			
			dW2 = (a1.T).dot(delta3)
			db2 = np.sum(delta3, axis=0, keepdims=True)
			delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
			dW1 = np.dot(Xi.T, delta2)
			db1 = np.sum(delta2, axis=0)

        	# Add regularization terms (b1 and b2 don't have regularization terms)
			dW2 += reg_lambda * W2
			dW1 += reg_lambda * W1

        	# Gradient descent parameter update
			W1 += -epsilon * dW1
			b1 += -epsilon * db1
			W2 += -epsilon * dW2
			b2 += -epsilon * db2
        
        	# Assign new parameters to the model
			model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
        
        # Periodically evaluate the loss (on the held out validation set if enabled); this is relatively expensive, so it is not done every pass.
		if i % validation_interval == 0:
			if use_validation_data:
				cur_loss = calculate_loss(XV,yv,model)
			else:
				cur_loss = calculate_loss(X,y,model)
				
			print "Loss after iteration %i: %.8f" %(i, cur_loss)
			loss = cur_loss
		
	return model

    
# Build a model with an nn_hdim dimensional hidden layer, in the selected training mode
if (training_mode == "batch"):
	model = build_model_batch(nn_hdim, num_passes=it_count, validation_interval=1)
elif (training_mode == "incr"):
	model = build_model_incr(nn_hdim, num_passes=it_count, validation_interval=1)
else:
	print "invalid training mode"
	sys.exit()

print "hidden layer"
for row in model['W1']:
	print(row)

print "hidden layer bias"
for row in model['b1']:
	print(row)

print "output layer"
for row in model['W2']:
	print(row)

print "output layer bias"
for row in model['b2']:
	print(row)
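
# Optional visualization (illustrative sketch, not part of the original flow):
# matplotlib is imported above but otherwise unused; uncommenting the lines
# below would plot the decision boundary of the trained model over the
# training data
#h = 0.01
#x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
#y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
#xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
#Z = predict(model, np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
#plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
#plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
#plt.show()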