library(lbfgsb3)
## Loading required package: numDeriv
Feed forward algorithm for predicting values (forward propagation):
# Function predictNN for neural networks:
predictNN <- function(X,Theta1,Theta2) {
m <- dim(X)[1]
classes <- dim(Theta2)[1]
pred <- rep(0,m)
a1 <- ones(X)
z2 <- a1 %*% t(Theta1)
a2 <- ones(sigmoid(z2))
z3 <- a2 %*% t(Theta2)
a3 <- sigmoid(z3)
pred <- apply(a3,1,which.max)
return(list(prediction=pred,outputLayer=a3))
}
Die Kostenfunktion für neuronale Netze ist bis auf kleine Anpassungen identisch zu der Kostenfunktion der logistischen Regression: J(θ)=−1mm∑i=1[y(i) log(hθ(x(i)))+(1−y(i)) log(1−hθ(x(i)))]+λ2mn∑j=1θ2j Um diese Kostenfunktion für neuronale Netze anwenden zu können, muss sie in zwei Punkten erweitert werden:
Kostenfunktion für neuronale Netze: J(Θ)=−1mm∑i=1K∑k=1[y(i)klog((hΘ(x(i)))k)+(1−y(i)k)log(1−(hΘ(x(i)))k)]+λ2mL−1∑l=1sl∑i=1sl+1∑j=1(Θ(l)j,i)2
CostFunctionNN <-
function(input_layer_size, hidden_layer_size, classes,X, y, lambda) {
# CostFunctionNN Implements the neural network cost function for a two layer
# neural network which performs classification
# The parameters for the neural network are "unrolled" into the vector
# nn_params and need to be converted back into the weight matrices.
function(nn_params) {
# Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
# for our 2 layer neural network
Theta1 <-
matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
nrow=hidden_layer_size, ncol=(input_layer_size + 1))
Theta2 <-
matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
nrow=classes, ncol=(hidden_layer_size + 1))
# Setup some useful variables
m <- dim(X)[1]
# You need to return the following variable correctly
J <- 0
# recode y to Y
Y <- matrix(0, nrow=m, ncol=classes)
for (i in 1:m){
Y[i,] <- diag(classes)[y[i],]
}
# feedforward
h <- predictNN(X,Theta1,Theta2)$outputLayer
# calculte penalty for parameters
penalty <- sum(Theta1[,-1] ^ 2) + sum(Theta2[,-1] ^ 2)
# calculate Cost
J <-
sum((-Y) * log(h) - (1 - Y) * log(1 - h)) / m + lambda * penalty / (2 * m)
return(J)
}
}
Function to compute the gradients in a neural network
GradFunctionNN <-
function(input_layer_size, hidden_layer_size, classes,
X, y, lambda) {
# GradFunctionNN Implements the neural network gradient function for a two layer
# neural network which performs classification
#
# The parameters for the neural network are "unrolled" into the vector
# nn_params and need to be converted back into the weight matrices.
#
# The returned parameter grad should be a "unrolled" vector of the
# partial derivatives of the neural network.
#
function(nn_params) {
# Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
# for our 2 layer neural network
Theta1 <-
matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
nrow=hidden_layer_size, ncol=(input_layer_size + 1))
Theta2 <-
matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
nrow=classes, ncol=(hidden_layer_size + 1))
# Setup some useful variables
m <- dim(X)[1]
# You need to return the following variables correctly
Theta1_grad <- matrix(0,dim(Theta1)[1],dim(Theta1)[2])
Theta2_grad <- matrix(0,dim(Theta2)[1],dim(Theta2)[2])
# recode y to Y
Y <- matrix(0, nrow=m, ncol=classes)
for (i in 1:m){
Y[i,] <- diag(classes)[y[i],]
}
# feedforward
a1 <- ones(X)
z2 <- a1 %*% t(Theta1)
a2 <- ones(sigmoid(z2))
z3 <- a2 %*% t(Theta2)
a3 <- sigmoid(z3)
h <- a3
# calculate sigmas
sigma3 <- h - Y
sigma2 <- (sigma3 %*% Theta2) * sigmoid(ones(z2)) * (1 - sigmoid(ones(z2)))
sigma2 <- sigma2[,-1]
# accumulate gradients
delta_1 <- (t(sigma2) %*% a1)
delta_2 <- (t(sigma3) %*% a2)
# calculate regularized gradient
p1 <- (lambda / m) * cbind(rep(0,dim(Theta1)[1]), Theta1[,-1])
p2 <- (lambda / m) * cbind(rep(0,dim(Theta2)[1]), Theta2[,-1])
Theta1_grad <- delta_1 / m + p1
Theta2_grad <- delta_2 / m + p2
# Unroll gradients
grad <- c(c(Theta1_grad), c(Theta2_grad))
return(grad)
}
}
# Initialize weights randomly. L_in: number of incoming arcs, L_out: number of outgoing arcs.
randInitializeWeights <- function(L_in, L_out) {
Weights <- matrix(0,L_out, 1 + L_in) # +1 for bias unit
epsilon_init <- 0.12
rnd <- runif(L_out * (1 + L_in))
rnd <- matrix(rnd,L_out,1 + L_in)
W <- rnd * 2 * epsilon_init - epsilon_init
return(W)
}