library(lbfgsb3)
## Loading required package: numDeriv

Forward Propagation

The feed-forward algorithm for predicting values (forward propagation):

# predictNN: forward propagation through a two-layer network.
# Returns the predicted class per example and the output-layer activations.
predictNN <- function(X, Theta1, Theta2) {
  a1 <- ones(X)                     # input layer plus bias column
  z2 <- a1 %*% t(Theta1)            # weighted inputs of the hidden layer
  a2 <- ones(sigmoid(z2))           # hidden-layer activations plus bias column
  z3 <- a2 %*% t(Theta2)            # weighted inputs of the output layer
  a3 <- sigmoid(z3)                 # output-layer activations, one column per class
  
  pred <- apply(a3, 1, which.max)   # predicted class = output unit with the largest activation
  return(list(prediction = pred, outputLayer = a3))
}
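The helper functions ones() and sigmoid() are used here but not defined in this section. A minimal sketch consistent with their use above (an assumption; the actual definitions may differ): ones() prepends a column of 1s for the bias unit, and sigmoid() is the element-wise logistic function.

# Assumed helpers (sketch only)
ones <- function(M) cbind(1, M)            # prepend a column of 1s for the bias unit
sigmoid <- function(z) 1 / (1 + exp(-z))   # element-wise logistic function

predictNN then returns, for each example, the index of the output unit with the largest activation, i.e. the predicted class label in 1..K.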

Cost function for Neural Networks

Apart from small adjustments, the cost function for neural networks is identical to the cost function of logistic regression: \[ J(\theta) = - \frac{1}{m} \sum_{i=1}^m \large[ y^{(i)}\ \log (h_\theta (x^{(i)})) + (1 - y^{(i)})\ \log (1 - h_\theta(x^{(i)}))\large] + \frac{\lambda}{2m}\sum_{j=1}^n \theta_j^2\] To apply this cost function to neural networks, it has to be extended in two respects:

  1. The sum has to run over all units of the output layer, since each output unit corresponds, in effect, to its own hypothesis function (the k-th unit corresponds to the function that predicts the probability of belonging to class k).
  2. The regularization term has to sum over the elements of all \(\Theta^{(l)}\) matrices.

Cost function for neural networks: \[ \begin{gather*}\large J(\Theta) = - \frac{1}{m} \sum_{i=1}^m \sum_{k=1}^K \left[y^{(i)}_k \log ((h_\Theta (x^{(i)}))_k) + (1 - y^{(i)}_k)\log (1 - (h_\Theta(x^{(i)}))_k)\right] + \frac{\lambda}{2m}\sum_{l=1}^{L-1} \sum_{i=1}^{s_l} \sum_{j=1}^{s_{l+1}} ( \Theta_{j,i}^{(l)})^2\end{gather*}\]
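The inner sum over k treats the label of example i as a one-hot vector \(y^{(i)}_k\). In R, labels 1..K can be recoded into such rows by indexing an identity matrix, which is what the recoding loop in the cost function below does example by example. A small illustration with made-up labels:

y_demo <- c(2, 1, 3)   # made-up labels in 1..K, here K = 3
diag(3)[y_demo, ]      # one-hot rows: a 1 in the column of each example's class
##      [,1] [,2] [,3]
## [1,]    0    1    0
## [2,]    1    0    0
## [3,]    0    0    1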

CostFunctionNN  <-
  function(input_layer_size, hidden_layer_size, classes, X, y, lambda) {
    # CostFunctionNN Implements the neural network cost function for a two layer
    # neural network which performs classification
    # The parameters for the neural network are "unrolled" into the vector
    # nn_params and need to be converted back into the weight matrices.
    function(nn_params) {
      # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
      # for our 2 layer neural network
      Theta1 <-
        matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
               nrow=hidden_layer_size, ncol=(input_layer_size + 1))
      Theta2 <-
        matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
               nrow=classes, ncol=(hidden_layer_size + 1))
      
      # Setup some useful variables
      m <- dim(X)[1]
      
      # recode the labels y (values 1..classes) into one-hot rows Y
      Y <- matrix(0, nrow=m, ncol=classes)
      for (i in 1:m){
        Y[i,] <- diag(classes)[y[i],]
      }
      
      # feedforward
      h <- predictNN(X,Theta1,Theta2)$outputLayer
      
      # calculate the regularization penalty for the parameters (bias columns excluded)
      penalty <- sum(Theta1[,-1] ^ 2) + sum(Theta2[,-1] ^ 2)
      
      # calculate Cost
      J <-
        sum((-Y) * log(h) - (1 - Y) * log(1 - h)) / m + lambda * penalty / (2 * m)
      
      return(J)
    }
  }
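CostFunctionNN fixes the data and the network dimensions and returns a closure over the unrolled parameter vector nn_params, which is the form an optimizer expects. A quick sketch on made-up data (the dimensions and the *_toy names are purely illustrative):

set.seed(1)
X_toy <- matrix(rnorm(5 * 3), 5, 3)                      # 5 examples, 3 input features
y_toy <- sample(1:2, 5, replace = TRUE)                  # labels for 2 classes
Theta1_toy <- matrix(runif(4 * 4, -0.12, 0.12), 4, 4)    # 4 hidden units, 3 inputs + bias
Theta2_toy <- matrix(runif(2 * 5, -0.12, 0.12), 2, 5)    # 2 classes, 4 hidden units + bias
nn_params_toy <- c(c(Theta1_toy), c(Theta2_toy))         # unroll into a single vector
costFn <- CostFunctionNN(3, 4, 2, X_toy, y_toy, lambda = 1)
costFn(nn_params_toy)                                    # cost at the random starting point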

Function to compute the gradients of the cost function (backpropagation)

GradFunctionNN  <-
  function(input_layer_size, hidden_layer_size, classes,
           X, y, lambda) {
    # GradFunctionNN Implements the neural network gradient function for a two layer
    # neural network which performs classification
    #
    # The parameters for the neural network are "unrolled" into the vector
    # nn_params and need to be converted back into the weight matrices.
    #
    # The returned parameter grad should be a "unrolled" vector of the
    # partial derivatives of the neural network.
    #
    function(nn_params) {
      # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
      # for our 2 layer neural network
      Theta1 <-
        matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
               nrow=hidden_layer_size, ncol=(input_layer_size + 1))
      Theta2 <-
        matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
               nrow=classes, ncol=(hidden_layer_size + 1))
      
      # Setup some useful variables
      m <- dim(X)[1]
      
      # recode the labels y (values 1..classes) into one-hot rows Y
      Y <- matrix(0, nrow=m, ncol=classes)
      for (i in 1:m){
        Y[i,] <- diag(classes)[y[i],]
      }
      
      # feedforward
      a1 <- ones(X)
      z2 <- a1 %*% t(Theta1)
      a2 <- ones(sigmoid(z2))
      z3 <- a2 %*% t(Theta2)
      a3 <- sigmoid(z3)
      h <- a3
      
      # backpropagate the errors (sigma terms)
      sigma3 <- h - Y                                   # output-layer error
      sigma2 <- (sigma3 %*% Theta2) * sigmoid(ones(z2)) * (1 - sigmoid(ones(z2)))  # times the sigmoid gradient of [1, z2]
      sigma2 <- sigma2[,-1]                             # drop the bias column
      
      # accumulate gradients
      delta_1 <- (t(sigma2) %*% a1)
      delta_2 <- (t(sigma3) %*% a2)
      
      # calculate regularized gradient
      p1 <- (lambda / m) * cbind(rep(0,dim(Theta1)[1]), Theta1[,-1])
      p2 <- (lambda / m) * cbind(rep(0,dim(Theta2)[1]), Theta2[,-1])
      Theta1_grad <- delta_1 / m + p1
      Theta2_grad <- delta_2 / m + p2
      
      # Unroll gradients
      grad <-  c(c(Theta1_grad), c(Theta2_grad))
      return(grad)
    }
  }
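Since numDeriv is already available (it was loaded together with lbfgsb3 above), the analytic gradient can be sanity-checked against a numerical one. A sketch reusing the toy data and costFn from the example after the cost function:

gradFn <- GradFunctionNN(3, 4, 2, X_toy, y_toy, lambda = 1)
num_grad <- numDeriv::grad(costFn, nn_params_toy)   # numerical gradient of the cost
max(abs(num_grad - gradFn(nn_params_toy)))          # should be very close to zero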
# Randomly initialize the weights of a layer with L_in incoming and L_out outgoing connections.
# Returns an L_out x (1 + L_in) matrix; the extra column holds the bias weights.
randInitializeWeights <- function(L_in, L_out) {
  epsilon_init <- 0.12                                       # keep weights small to break symmetry
  rnd <- matrix(runif(L_out * (1 + L_in)), L_out, 1 + L_in)  # +1 for the bias unit
  W <- rnd * 2 * epsilon_init - epsilon_init                 # uniform in [-epsilon_init, epsilon_init]
  return(W)
}
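With all the pieces in place, training means minimizing the cost over the unrolled parameters, starting from random weights. The sketch below continues the toy example and uses base R optim() with method "L-BFGS-B" as a stand-in; the lbfgsb3 package loaded at the top provides a comparable L-BFGS-B routine:

initial_Theta1 <- randInitializeWeights(3, 4)        # 3 inputs -> 4 hidden units
initial_Theta2 <- randInitializeWeights(4, 2)        # 4 hidden units -> 2 classes
initial_params <- c(c(initial_Theta1), c(initial_Theta2))
fit <- optim(initial_params, fn = costFn, gr = gradFn,
             method = "L-BFGS-B", control = list(maxit = 100))
Theta1_opt <- matrix(fit$par[1:(4 * (3 + 1))], 4, 3 + 1)
Theta2_opt <- matrix(fit$par[-(1:(4 * (3 + 1)))], 2, 4 + 1)
predictNN(X_toy, Theta1_opt, Theta2_opt)$prediction  # predicted class per example

The reshaping mirrors the one inside the cost and gradient closures, so the optimized vector can be fed straight back into predictNN.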