E14 BP Algorithm (C++/Python)
17341015 Hongzheng Chen
Contents
1 Horse Colic Data Set
2 Reference Materials
3 Tasks
4 Codes and Results
1 Horse Colic Data Set
The description of the horse colic data set (http://archive.ics.uci.edu/ml/datasets/Horse+Colic) is as follows:
We aim to predict whether a horse with colic will live or die.
Note that we should deal with the missing values in the data! Here are some options (a small sketch of the first three is given after this list):
• Use the feature's mean value computed from all the available data.
• Fill in the unknown with a special value such as -1.
• Ignore the instance.
• Use a mean value computed from similar items.
• Use another machine learning algorithm to predict the value.
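As an illustration, here is a minimal sketch of the first three options using only numpy. It assumes the raw file is whitespace-separated with '?' marking unknown entries (this matches the UCI horse-colic files, but verify against your local copy):

import numpy as np

# Assume whitespace-separated values with '?' marking unknown entries
data = np.genfromtxt("horse-colic.data", missing_values="?", filling_values=np.nan)

# Option 1: replace each missing entry with its column mean
col_mean = np.nanmean(data, axis=0)
rows, cols = np.where(np.isnan(data))
data_mean = data.copy()
data_mean[rows, cols] = col_mean[cols]

# Option 2: fill unknowns with a special value such as -1
data_special = np.where(np.isnan(data), -1.0, data)

# Option 3: drop any instance (row) that still contains a missing value
data_dropped = data[~np.isnan(data).any(axis=1)]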
2 Reference Materials
1. Stanford CS231n: Convolutional Neural Networks for Visual Recognition, by Fei-Fei Li et al.
• Course website: http://cs231n.stanford.edu/2017/syllabus.html
• Video website: https://www.bilibili.com/video/av17204303/?p=9&tdsourcetag=s_pctim_aiomsg
2. Machine Learning, by Hung-yi Lee
• Course website: http://speech.ee.ntu.edu.tw/~tlkagk/index.html
• Video website: https://www.bilibili.com/video/av9770302/from=search
3. A simple neural network code template:
# -*- coding: utf-8 -*-
import random
import math

# Shorthand:
# "pd_" as a variable prefix means "partial derivative"
# "d_" as a variable prefix means "derivative"
# "_wrt_" is shorthand for "with respect to"
# "w_ho" and "w_ih" are the indexes of weights from hidden to output layer neurons
# and input to hidden layer neurons respectively

class NeuralNetwork:
    LEARNING_RATE = 0.5

    def __init__(self, num_inputs, num_hidden, num_outputs, hidden_layer_weights=None,
                 hidden_layer_bias=None, output_layer_weights=None, output_layer_bias=None):
        # Your Code Here
        pass

    def init_weights_from_inputs_to_hidden_layer_neurons(self, hidden_layer_weights):
        # Your Code Here
        pass

    def init_weights_from_hidden_layer_neurons_to_output_layer_neurons(self, output_layer_weights):
        # Your Code Here
        pass

    def inspect(self):
        print('------')
        print('* Inputs: {}'.format(self.num_inputs))
        print('------')
        print('Hidden Layer')
        self.hidden_layer.inspect()
        print('------')
        print('* Output Layer')
        self.output_layer.inspect()
        print('------')

    def feed_forward(self, inputs):
        # Your Code Here
        pass

    # Uses online learning, i.e. updating the weights after each training case
    def train(self, training_inputs, training_outputs):
        self.feed_forward(training_inputs)

        # 1. Output neuron deltas
        # ∂E/∂z_j
        # Your Code Here

        # 2. Hidden neuron deltas
        # We need to calculate the derivative of the error with respect to the output
        # of each hidden layer neuron
        # dE/dy_j = Σ ∂E/∂z_o * ∂z_o/∂y_j = Σ ∂E/∂z_o * w_jo
        # ∂E/∂z_j = dE/dy_j * ∂y_j/∂z_j
        # Your Code Here

        # 3. Update output neuron weights
        # ∂E/∂w_ij = ∂E/∂z_j * ∂z_j/∂w_ij
        # Δw = α * ∂E/∂w_ij
        # Your Code Here

        # 4. Update hidden neuron weights
        # ∂E/∂w_ij = ∂E/∂z_j * ∂z_j/∂w_ij
        # Δw = α * ∂E/∂w_ij
        # Your Code Here

    def calculate_total_error(self, training_sets):
        # Your Code Here
        return total_error

class NeuronLayer:
    def __init__(self, num_neurons, bias):
        # Every neuron in a layer shares the same bias
        self.bias = bias if bias else random.random()

        self.neurons = []
        for i in range(num_neurons):
            self.neurons.append(Neuron(self.bias))

    def inspect(self):
        print('Neurons:', len(self.neurons))
        for n in range(len(self.neurons)):
            print(' Neuron', n)
            for w in range(len(self.neurons[n].weights)):
                print('  Weight:', self.neurons[n].weights[w])
            print('  Bias:', self.bias)

    def feed_forward(self, inputs):
        outputs = []
        for neuron in self.neurons:
            outputs.append(neuron.calculate_output(inputs))
        return outputs

    def get_outputs(self):
        outputs = []
        for neuron in self.neurons:
            outputs.append(neuron.output)
        return outputs

class Neuron:
    def __init__(self, bias):
        self.bias = bias
        self.weights = []

    def calculate_output(self, inputs):
        # Your Code Here
        pass

    def calculate_total_net_input(self):
        # Your Code Here
        pass

    # Apply the logistic function to squash the output of the neuron
    # The result is sometimes referred to as 'net' [2] or 'net' [1]
    def squash(self, total_net_input):
        # Your Code Here
        pass

    # Determine how much the neuron's total input has to change to move closer to the
    # expected output
    #
    # Now that we have the partial derivative of the error with respect to the output
    # (∂E/∂y_j) and the derivative of the output with respect to the total net input
    # (dy_j/dz_j) we can calculate the partial derivative of the error with respect to
    # the total net input.
    # This value is also known as the delta (δ) [1]
    # δ = ∂E/∂z_j = ∂E/∂y_j * dy_j/dz_j
    #
    def calculate_pd_error_wrt_total_net_input(self, target_output):
        # Your Code Here
        pass

    # The error for each neuron is calculated by the mean square error method:
    def calculate_error(self, target_output):
        # Your Code Here
        pass

    # The partial derivative of the error with respect to the actual output is then
    # calculated by:
    # = 2 * 0.5 * (target output - actual output) ^ (2 - 1) * -1
    # = -(target output - actual output)
    #
    # The Wikipedia article on backpropagation [1] simplifies to the following, but
    # most other learning material does not [2]:
    # = actual output - target output
    #
    # Alternatively, you can use (target - output), but then you need to add it during
    # backpropagation [3]
    #
    # Note that the actual output of the output neuron is often written as y_j and the
    # target output as t_j, so:
    # = ∂E/∂y_j = -(t_j - y_j)
    def calculate_pd_error_wrt_output(self, target_output):
        # Your Code Here
        pass

    # The total net input into the neuron is squashed using the logistic function to
    # calculate the neuron's output:
    # y_j = φ(z_j) = 1 / (1 + e^(-z_j))
    # Note that j represents the output of the neurons in whatever layer we're looking
    # at and i represents the layer below it
    #
    # The derivative (not partial derivative since there is only one variable) of the
    # output then is:
    # dy_j/dz_j = y_j * (1 - y_j)
    def calculate_pd_total_net_input_wrt_input(self):
        # Your Code Here
        pass

    # The total net input is the weighted sum of all the inputs to the neuron and
    # their respective weights:
    # z_j = net_j = x_1 * w_1 + x_2 * w_2 + ...
    #
    # The partial derivative of the total net input with respect to a given weight
    # (with everything else held constant) then is:
    # ∂z_j/∂w_i = some constant + 1 * x_i * w_i^(1-0) + some constant ... = x_i
    def calculate_pd_total_net_input_wrt_weight(self, index):
        # Your Code Here
        pass

# An example:
nn = NeuralNetwork(2, 2, 2, hidden_layer_weights=[0.15, 0.2, 0.25, 0.3],
                   hidden_layer_bias=0.35, output_layer_weights=[0.4, 0.45, 0.5, 0.55],
                   output_layer_bias=0.6)
for i in range(10000):
    nn.train([0.05, 0.1], [0.01, 0.99])
    print(i, round(nn.calculate_total_error([[[0.05, 0.1], [0.01, 0.99]]]), 9))
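For reference, the sketch below shows one possible way to fill in the Neuron placeholders, following the formulas in the comments of the template above. It is only an illustrative completion, not the official solution; the attributes self.inputs and self.output are introduced here to cache the forward-pass values.

import math

class Neuron:
    def __init__(self, bias):
        self.bias = bias
        self.weights = []

    def calculate_output(self, inputs):
        self.inputs = inputs
        self.output = self.squash(self.calculate_total_net_input())
        return self.output

    def calculate_total_net_input(self):
        # z_j = sum_i x_i * w_i + b
        return sum(x * w for x, w in zip(self.inputs, self.weights)) + self.bias

    def squash(self, total_net_input):
        # logistic function: y_j = 1 / (1 + e^(-z_j))
        return 1 / (1 + math.exp(-total_net_input))

    def calculate_pd_error_wrt_total_net_input(self, target_output):
        # delta = dE/dy_j * dy_j/dz_j
        return (self.calculate_pd_error_wrt_output(target_output) *
                self.calculate_pd_total_net_input_wrt_input())

    def calculate_error(self, target_output):
        # squared error of this single neuron
        return 0.5 * (target_output - self.output) ** 2

    def calculate_pd_error_wrt_output(self, target_output):
        # dE/dy_j = -(t_j - y_j)
        return -(target_output - self.output)

    def calculate_pd_total_net_input_wrt_input(self):
        # dy_j/dz_j = y_j * (1 - y_j)
        return self.output * (1 - self.output)

    def calculate_pd_total_net_input_wrt_weight(self, index):
        # dz_j/dw_i = x_i
        return self.inputs[index]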
3 Tasks
• Given the training set horse-colic.data and the testing set horse-colic.test, implement the BP algorithm and build a neural network to predict whether horses with colic will live or die. In addition, you should calculate the accuracy rate on the test set (a small sketch of the accuracy computation follows this list).
• Please submit a file named E14_YourNumber.pdf and send it to ai_201901@foxmail.com.
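The accuracy rate is simply the fraction of test instances whose predicted outcome matches the true label. A minimal sketch follows; the two arrays are hypothetical stand-ins for the real model predictions and test labels:

import numpy as np

# Hypothetical predicted and true outcome labels (e.g. 1 = lived, 2 = died)
y_pred = np.array([1, 2, 1, 1, 2])
y_test = np.array([1, 2, 2, 1, 2])

accuracy = np.mean(y_pred == y_test)  # fraction of correct predictions
print("Test accuracy: {:.2%}".format(accuracy))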
4 Codes and Results
This experiment cost me three full days to finish (mostly fine-tuning the hyperparameters), but I still cannot figure out why my accuracy is so awkward. Sad 🙁
The following figure gives the training loss and the accuracy (without early stopping).
The training log is shown below (with early stopping), and the best accuracy I can get is 92% on the test set. (Note that the two figures are not from the same training run.)
Please refer to nn.py and bp.ipynb (or the generated bp.py) for the code. Some of the techniques used are highlighted below:
• The network structure is ni-8-3 (ni input features, 8 hidden neurons, 3 output neurons).
• Kaiming He's method is used to initialize the network.
• A PyTorch-like network class with a forward method is designed.
• L2 regularization and weight decay are used for training.
• All computations are tensor-based, and only the numpy package is used.
• np.einsum is used to accelerate the tensor products in backpropagation.
• Heavy preprocessing is applied, including one-hot encoding, filling in missing data, and removing useless attributes. Please refer to the bp.ipynb file for details.
• Early stopping is used to avoid overfitting, and learning rate decay is used for better convergence (a sketch of the training loop is given after this list).
• Batch SGD (mini-batch gradient descent) is used to accelerate training.
• Checkpoints and logging make training more controllable.
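Since bp.ipynb is not reproduced here, the sketch below only illustrates how these pieces (mini-batch SGD, learning-rate decay, and early stopping) could fit together around the Network class from nn.py; the function name, variable names such as X_train and y_train, and the hyperparameter values are placeholders rather than the exact ones used in the experiment.

import numpy as np
from nn import Network

def run_training(X_train, y_train, X_test, y_test,
                 epochs=500, batch_size=16, lamb=0.01, patience=20):
    # one-hot targets are assumed: y_train/y_test have shape (N, num_classes)
    net = Network(X_train.shape[1], 8, y_train.shape[1], learning_rate=0.1)
    best_acc, wait = 0.0, 0
    for epoch in range(epochs):
        net.train()
        # mini-batch SGD: shuffle the training set, then update on each batch
        order = np.random.permutation(len(X_train))
        for start in range(0, len(X_train), batch_size):
            idx = order[start:start + batch_size]
            y_hat = net.forward(X_train[idx])
            net.backward(y_hat, y_train[idx], lamb=lamb)
        # simple learning-rate decay
        net.learning_rate *= 0.99
        # evaluate on the test set
        net.eval()
        pred = np.argmax(net.forward(X_test), axis=1)
        acc = np.mean(pred == np.argmax(y_test, axis=1))
        # early stopping: stop if accuracy has not improved for `patience` epochs
        if acc > best_acc:
            best_acc, wait = acc, 0
        else:
            wait += 1
            if wait >= patience:
                break
    return best_acc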
The code of nn.py is given below.
import numpy as np

class FullyConnectedLayer(object):
    """
    Linear transformation: y = x W^T + b
    Input:  (N, in_features), i.e. a row vector; in this example, N = 1
    Output: (N, out_features)
    Attributes:
        weight: (out_features, in_features)
        bias:   (out_features)
    Ref: http://cs231n.stanford.edu/vecDerivs.pdf
    """
    def __init__(self, in_features, out_features, bias=True):
        self.in_features = in_features
        self.out_features = out_features
        # Xavier initialization
        #   https://www.deeplearning.ai/ai-notes/initialization/
        #   W^{[l]} ~ N(mu=0, sigma^2 = 1/n^{[l-1]}),  b^{[l]} = 0
        # Kaiming He initialization
        #   https://medium.com/@shoray.goel/kaiming-he-initialization-a8d9ed0b5899
        self.weight = np.random.normal(0, np.sqrt(2 / in_features), (out_features, in_features))
        if bias:
            self.bias = np.random.rand(out_features)
        else:
            self.bias = None

    def forward(self, inputs):
        """
        Forward propagation
        """
        if self.bias is not None:
            return np.dot(inputs, self.weight.T) + self.bias
        else:
            return np.dot(inputs, self.weight.T)

    def __call__(self, x):
        """
        Syntactic sugar for the forward method
        """
        return self.forward(x)

class Network(object):
    def __init__(self, in_features, hidden_features, out_features, learning_rate=0.01):
        """
        A three-layer network architecture is used here.
        The number of neurons in each layer is listed below:
            in_features -> hidden_features -> out_features
        """
        self.fc1 = FullyConnectedLayer(in_features, hidden_features, True)
        self.fc2 = FullyConnectedLayer(hidden_features, out_features, True)
        self.learning_rate = learning_rate
        self.memory = {}  # used to store intermediate results
        self.train_flag = True

    def train(self):
        """
        When training, memory is used to remember the intermediate results
        """
        self.train_flag = True

    def eval(self):
        """
        When inferencing, there is no need to fill the memory
        """
        self.train_flag = False

    def relu(self, x):
        """
        ReLU(x) = x, x > 0
                  0, x <= 0
        """
        return np.maximum(0, x)

    def d_relu(self, x):
        x[x <= 0] = 0
        x[x > 0] = 1
        return x

    def sigmoid(self, x):
        """
        Element-wise function
        sigma(x) = 1 / (1 + e^{-x})
        """
        return 1 / (1 + np.exp(-x))

    def d_sigmoid(self, x):
        """
        Derivative of the sigmoid function
        sigma'(x) = sigma(x) * (1 - sigma(x))
        """
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def tanh(self, x):
        return np.tanh(x)

    def d_tanh(self, x):
        return 1 - np.tanh(x) ** 2

    def MSE(self, y_hat, y):
        """
        Mean-square error (MSE)
        """
        return np.linalg.norm(y_hat - y)  # 2-norm

    def cross_entropy(self, y_hat, y):
        """
        Cross-entropy loss
        """
        return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

    def forward(self, x):
        """
        w/o activation: z^{(l+1)} = W^{(l)} a^{(l)} + b^{(l)}
        w/  activation: a^{(l+1)} = f(z^{(l+1)})
        """
        # training
        if self.train_flag:
            self.memory["a0"] = np.copy(x)
            x = self.fc1(x)  # N * hidden
            self.memory["z1"] = np.copy(x)
            x = self.sigmoid(x)
            self.memory["a1"] = np.copy(x)
            x = self.fc2(x)  # N * out
            self.memory["z2"] = np.copy(x)
            x = self.sigmoid(x)
        # inferencing
        else:
            x = self.fc1(x)  # N * hidden
            x = self.sigmoid(x)
            x = self.fc2(x)  # N * out
            x = self.sigmoid(x)
        return x

    def backward(self, y_hat, y, lamb=0):
        """
        Use mean-squared error (MSE) as the error function

        lamb is the lambda used for weight decay

        Ref: http://ufldl.stanford.edu/tutorial/supervised/MultiLayerNeuralNetworks/
        """
        batch_size = y.shape[0]
        # Calculate delta
        # output layer: delta^{(n_l)} = -(y - a^{(n_l)}) * f'(z^{(n_l)})
        # other layers: delta^{(l)} = (W^{(l)})^T delta^{(l+1)} * f'(z^{(l)})
        delta = [0] * 3
        delta[2] = (y_hat - y) * self.d_sigmoid(self.memory["z2"])  # N * out_features
        delta[1] = np.dot(delta[2], self.fc2.weight) * self.d_sigmoid(self.memory["z1"])  # N * hidden_features
        # print(delta[2].shape, delta[1].shape)

        # Calculate nabla
        # weights: nabla_{W^{(l)}} J(W, b; x, y) = delta^{(l+1)} (a^{(l)})^T  # outer product
        # biases:  nabla_{b^{(l)}} J(W, b; x, y) = delta^{(l+1)}
        # Use einsum to accelerate the batched outer products
        nabla_W = [0] * 2
        nabla_W[1] = np.einsum("ij,ik->ijk", delta[2], self.memory["a1"])  # N * out_features * hidden_features
        nabla_W[0] = np.einsum("ij,ik->ijk", delta[1], self.memory["a0"])  # N * hidden_features * in_features
        nabla_b = [0] * 2
        nabla_b[1] = delta[2]  # N * out_features
        nabla_b[0] = delta[1]  # N * hidden_features
        # print(nabla_W[1].shape, nabla_W[0].shape, nabla_b[1].shape, nabla_b[0].shape)

        # Update parameters
        # W^{(l)} = W^{(l)} - alpha * ((1/m) * Delta W^{(l)} + lambda * W^{(l)})
        # b^{(l)} = b^{(l)} - alpha * (1/m) * Delta b^{(l)}
        # The per-sample gradients are averaged over the batch (the 1/m term above);
        # lambda is the L2 regularization (weight decay) term
        self.fc2.weight -= self.learning_rate * (np.mean(nabla_W[1], axis=0) + lamb * self.fc2.weight / batch_size)
        self.fc1.weight -= self.learning_rate * (np.mean(nabla_W[0], axis=0) + lamb * self.fc1.weight / batch_size)
        self.fc2.bias -= self.learning_rate * np.mean(nabla_b[1], axis=0)
        self.fc1.bias -= self.learning_rate * np.mean(nabla_b[0], axis=0)
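As a quick usage example, the snippet below runs one forward and one backward pass of the Network class above on random data to check the tensor shapes; the dimensions (20 input features, a batch of 4 samples) are arbitrary placeholders.

import numpy as np
from nn import Network

np.random.seed(0)
net = Network(20, 8, 3, learning_rate=0.1)       # 20 -> 8 -> 3
x = np.random.rand(4, 20)                        # a batch of 4 samples
y = np.eye(3)[np.random.randint(0, 3, size=4)]   # one-hot targets

net.train()
y_hat = net.forward(x)                           # shape (4, 3)
net.backward(y_hat, y, lamb=0.01)                # one gradient step

net.eval()
print(net.forward(x).shape)                      # -> (4, 3)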