Functions for linear regression

name: linearRegression_functions.R
author: Wiebke Petersen
date: 01 May, 2017
based_on: ex1.m
original author: Andrew Ng
context: Machine Learning course on coursera.org

conventions

m : number of training examples
n : number of features (for exercise 1: n=1)
y : m-dimensional vector
X : \(m\times (n+1)\) matrix (note that the first column is, by convention, a vector consisting only of 1's)
\(\Theta\) : \((n+1)\)-dimensional parameter vector
\(x^{(i)}\) : \((n+1)\)-dimensional vector of the i-th training example (the i-th row of X)
\(x_j\) : \(m\)-dimensional vector (j-th column of X)
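As a small illustration of these conventions (the data values below are made up and serve only as an example), a design matrix X with the leading column of 1's can be assembled like this:

# illustrative sketch only: data_example is a made-up stand-in for the exercise data,
# with the feature x_1 in column 1 and the target y in column 2
data_example <- cbind(c(1,2,3), c(2.5,4.1,6.2))
m <- dim(data_example)[1]                     # m = 3 training examples
X <- cbind(rep(1,times=m), data_example[,1])  # prepend the column of 1's
y <- data_example[,2]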

hypothesis

hypothesis as a function of a single value:
\(h(x) = \theta_0 + \theta_1 x\)

hypothesis as a function of a vector:
\(h(\vec{x}) = \theta_0 x_0 + \theta_1 x_1\)
\(h(\vec{x}) = \Theta' \times \vec{x}\)
\(\begin{eqnarray} h\left(\begin{pmatrix}x_0=1\\ x_1 \end{pmatrix}\right) &=& \begin{pmatrix}\theta_0\\ \theta_1 \end{pmatrix}' \times \begin{pmatrix}x_0=1\\ x_1 \end{pmatrix}\\ &=& \theta_0 x_0 + \theta_1 x_1 \\ &=& \theta_0 + \theta_1 x_1 \end{eqnarray}\)

hypothesis as a function of a matrix:
\(\begin{eqnarray} \begin{pmatrix} 1 & x_1^{(1)} \\ 1 & x_1^{(2)} \\ 1 & x_1^{(3)} \end{pmatrix} \times \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} = \begin{pmatrix} \theta_0 + \theta_1 x_1^{(1)} \\ \theta_0 + \theta_1 x_1^{(2)} \\ \theta_0 + \theta_1 x_1^{(3)} \end{pmatrix} \end{eqnarray}\)
\(\begin{eqnarray} h(X) = X \times \Theta \end{eqnarray}\)

# hypothesis (note: here, theta is a 2-dimensional vector and X is an m x 2 matrix)
h <- function(theta,X){
  prediction <- X %*% theta
  return(prediction)
}
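A minimal usage sketch of h (the numbers are chosen only for illustration):

# three training examples with feature values 0, 1, 2 and theta = (1, 2)'
h(theta=c(1,2), X=cbind(rep(1,times=3), c(0,1,2)))   # predictions: 1, 3, 5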

# hypothesis for normalized features
# x: raw (unnormalized) feature vector; mu, sigma: per-feature means and standard deviations
# (e.g. as returned by featureNormalization below)
hnormal <- function(theta,x,mu,sigma){
  for (i in 1:length(x)){
    if (sigma[i] != 0) {x[i] <- (x[i]-mu[i])/sigma[i]}
    if (sigma[i] == 0) {x[i] <- x[i]-mu[i]}
  }
  x <- c(1,x)   # prepend the intercept term x_0 = 1
  return(h(theta,x))
}
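A usage sketch for hnormal, assuming mu and sigma were obtained on the training set (e.g. via featureNormalization below); all numbers are illustrative:

# single feature value x = 10 with mu = 8 and sigma = 4: normalized value 0.5,
# so the prediction is theta_0 + theta_1 * 0.5 = 1 + 2 * 0.5 = 2
hnormal(theta=c(1,2), x=c(10), mu=c(8), sigma=c(4))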

cost function

\[\begin{eqnarray} J \left(\begin{pmatrix}\theta_0\\ \theta_1\end{pmatrix} \right) &=& \frac{1}{2m}\sum_{i=1}^{m} (h(x^{(i)})-y^{(i)})^2 \\ &=& \frac{1}{2m}\sum_{i=1}^{m} ((\theta_0 + \theta_1 x^{(i)})-y^{(i)})^2 \\ &=& \frac{1}{2m}(X\times\Theta-y)'\times (X\times\Theta-y) \end{eqnarray}\]

# cost function 
computeCost <-  function(data,theta){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- as.matrix(cbind(rep(1,times=m),data[,1:(n-1)]))   # prepend column of 1's to the feature columns
  y <- data[,n]
  cost <- h(theta,X)-y
  cost <- t(cost) %*% cost /(2*m)  
  return(cost)
}
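A usage sketch with the made-up data_example from above:

# cost of the all-zero parameter vector; for data_example this evaluates to 10.25
computeCost(data_example, theta=c(0,0))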

gradient descent

adapting \(\Theta\) in each iteration step:
In what follows, let \(x^{(i)}_0=1\) for all \(i\), thus \(h_{\Theta}(x^{(i)})=\theta_0 x_0^{(i)} + \theta_1 x_1^{(i)}\). \[\begin{eqnarray} \theta_0 &:=& \theta_0 - \alpha\frac{\delta}{\delta\theta_0} J(\Theta)\\ &=& \theta_0 - \alpha \frac{1}{m} \sum_{i=1}^{m} (h_{\Theta}(x^{(i)})-y^{(i)})x_0^{(i)}\\ &=& \theta_0 - \alpha \frac{1}{m} (h_{\Theta}(X)- y)'\times x_0\\ &=& \theta_0 - \alpha \frac{1}{m} (X\times \Theta- y)' \times x_0\\[1ex] \theta_1 &:=& \theta_1 - \alpha\frac{\delta}{\delta\theta_1} J(\Theta)\\ &=& \theta_1 - \alpha \frac{1}{m} \sum_{i=1}^{m} (h_{\Theta}(x^{(i)})-y^{(i)})x_1^{(i)} \\ &=& \theta_1 - \alpha \frac{1}{m} (h_{\Theta}(X)- y)' \times x_1\\ &=& \theta_1 - \alpha \frac{1}{m} (X\times\Theta-y)' \times x_1\\ \end{eqnarray}\]

Computing the partial derivatives of \(J(\Theta) = \frac{1}{2m} \sum_{i=1}^{m} (\theta_0 x_0^{(i)} + \theta_1 x_1^{(i)}-y^{(i)})^2\):

We first compute \(\frac{\delta}{\delta \theta_0} J(\Theta)\):
\[\begin{eqnarray} \frac{\delta}{\delta \theta_0} J(\Theta) &=& \frac{\delta}{\delta \theta_0}\frac{1}{2m} \sum_{i=1}^{m}(h_{\Theta}(x^{(i)})-y^{(i)})^2\\ &=& \frac{\delta}{\delta \theta_0} \frac{1}{2m} \sum_{i=1}^{m} (\theta_0 x_0^{(i)} + \theta_1 x_1^{(i)}-y^{(i)})^2 \\ &=& \frac{\delta}{\delta \theta_0}\frac{1}{2m} \sum_{i=1}^{m} (\theta_0 x_0^{(i)} + (\theta_1 x_1^{(i)}-y^{(i)}))^2 \\ &=& \frac{\delta}{\delta \theta_0}\frac{1}{2m} \sum_{i=1}^{m} (\theta_0^2 (x_0^{(i)})^2 + 2\theta_0 x_0^{(i)}(\theta_1 x_1^{(i)}-y^{(i)})+ (\theta_1 x_1^{(i)}-y^{(i)})^2)\\ &=& \frac{1}{2m} \sum_{i=1}^{m} (2\theta_0 (x_0^{(i)})^2+ 2x_0^{(i)}(\theta_1 x_1^{(i)}-y^{(i)}))\\ &=& \frac{1}{m} \sum_{i=1}^{m} (\theta_0 (x_0^{(i)})^2 + x_0^{(i)}(\theta_1 x_1^{(i)}-y^{(i)}))\\ &=& \frac{1}{m} \sum_{i=1}^{m} (\theta_0 x_0^{(i)} + \theta_1 x_1^{(i)}-y^{(i)})x_0^{(i)}\\ &=& \frac{1}{m} \sum_{i=1}^{m} (h_{\Theta}(x^{(i)})-y^{(i)})x_0^{(i)}\\ \end{eqnarray}\] For \(\frac{\delta}{\delta \theta_1} J(\Theta)\) one gets by the same calculation: \[\begin{eqnarray} \frac{\delta}{\delta \theta_1} J(\Theta) &=& \frac{\delta}{\delta \theta_1}\frac{1}{2m} \sum_{i=1}^{m}(h_{\Theta}(x^{(i)})-y^{(i)})^2\\ &=& \frac{1}{m} \sum_{i=1}^{m} (h_{\Theta}(x^{(i)})-y^{(i)})x_1^{(i)}\\ \end{eqnarray}\] The whole parameter vector \(\Theta\) can therefore be updated in one step:
\[\begin{eqnarray} \Theta := \Theta - \frac{\alpha}{m} ((X \times \Theta -y)' \times X)' \end{eqnarray}\]
# gradient descent algorithm
gradientDescent <- function(data,theta, alpha, iterations){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- ones(data[,1:(n-1)])   # prepend column of 1's (ones() is defined under Auxiliary Functions)
  y <- data[,n]
  cost_vec <- computeCost(data,theta)
  theta_vec <- t(theta)
  for (i in 1:iterations){
    theta = theta - alpha/m * t(t(X%*%theta-y) %*% X) 
    # parameterized alternative:                 
    #temp <- theta
    #for (j in 1:length(theta)){
    #  derivation <- (t(h(theta,X)-y) %*% X[, j]) / m
    #  temp[j] <- theta[j] - alpha * derivation     
    #}
    #theta <- temp
    cost <- computeCost(data,theta)
    cost_vec <- c(cost_vec,cost)
    theta_vec <- rbind(theta_vec,t(theta))
  }
  return(list(theta=theta,theta_vec=theta_vec,cost_vec=cost_vec))
}
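A usage sketch (learning rate and iteration count are arbitrary illustrative choices):

# run gradient descent on the made-up data_example, starting from theta = (0, 0)'
gd <- gradientDescent(data_example, theta=c(0,0), alpha=0.01, iterations=1500)
gd$theta              # final parameter vector
last(gd$cost_vec,1)   # cost after the last iteration (last() is defined under Auxiliary Functions)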

plotting functions

# Plot the linear fit
plotLinearFit <- function(data,theta){
  m <- dim(data)[1]
  min <- min(data[,1])
  max <- max(data[,1])
  cost <- computeCost(data,theta)
  plot(data,xlim=c(min,max),
       col="red",
       xlab="x",ylab="y",
       main = "linear fit",
       sub= paste("theta0=", round(theta[1],digits=3),", theta1=", round(theta[2],digits=3), ", cost=",round(cost,digits=3))
  ) 
  abline(theta[1],theta[2],col="blue",lwd=2)
  abline(h=0,col="gray",lwd=1)
  abline(v=0,col="gray",lwd=1)
}
# plot cost development
plotCostDev <- function(grad_desc){
  cost_vec <- grad_desc$cost_vec
  plot(cost_vec,type="l",col="blue",cex=.2,
        xlab="iteration step",
       ylab='cost',
       main='cost development in gradient descent'  )  
}
# plot cost function and add cost development in gradient descent
library(rgl)   # provides open3d(), persp3d(), points3d(), lines3d() and rgl.cur()/rgl.close()
plotCostSurface <- function(data,grad_desc){
  # get values
  theta_vec <- grad_desc$theta_vec
  cost_vec <- grad_desc$cost_vec
  theta <- grad_desc$theta
  # compute boundaries
  theta0 <- max(abs(grad_desc$theta_vec[,1]))
  theta0min <- theta[1]-theta0-10
  theta0max <- theta[1]+theta0+10
  theta1 <- max(abs(grad_desc$theta_vec[,2]))
  theta1min <- theta[2]-theta1-1
  theta1max <- theta[2]+theta1+1
  costmax <- max(computeCost(data,c(theta0min,theta1min)),
                 computeCost(data,c(theta0min,theta1max)),
                 computeCost(data,c(theta0max,theta1min)),
                 computeCost(data,c(theta0max,theta1max)))
  #costmax <- min(costmax,1000)
  # Grid over which we will calculate J
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)  
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- computeCost(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() } # close old plots
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Contour plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
  points3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",pch=4,lwd=2)
  lines3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",lwd=1)
  points3d(theta[1],theta[2],computeCost(data,theta),col="black",size=10)
}
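Possible calls of the plotting functions, using the made-up data_example and the gradient descent result gd from the sketches above (plotCostSurface opens an interactive rgl window):

plotLinearFit(data_example, gd$theta)
plotCostDev(gd)
plotCostSurface(data_example, gd)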

Alternative cost functions

1st alternative cost function

\[\begin{eqnarray} J(\Theta) &=& \frac{1}{m}\sum_{i=1}^{m} |h_{\Theta}(x^{(i)})-y^{(i)}| \\ &=& \frac{1}{m}\sum_{i=1}^{m} |\theta_0 + \theta_1 x^{(i)}-y^{(i)}| \end{eqnarray}\]
# cost function: mean absolute deviation
computeCost1 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- sum(abs(h(theta,X)-y))/m
  return(cost)
}

2nd alternative cost function

\[\begin{eqnarray} J(\Theta) &=& \left|\sum_{i=1}^{m} (h_{\Theta}(x^{(i)})-y^{(i)})\right|\\ &=& \left|\sum_{i=1}^{m} (\theta_0 + \theta_1 x^{(i)}-y^{(i)})\right| \end{eqnarray}\]
# cost function: absolute value of the sum of the deviations
computeCost2 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- abs(sum(h(theta,X)-y))
  return(cost)
}


# plot the surface of a given cost function over a fixed theta range
plotCostSurface_pure <- function(data,costFunction,theta0Range,theta1Range){
  # Grid over which we will calculate J
  theta0min <- theta0Range[1]
  theta0max <- theta0Range[2]
  theta1min <- theta1Range[1]
  theta1max <- theta1Range[2]
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)

  costmax <- max(costFunction(data,c(theta0min,theta1min)),
                 costFunction(data,c(theta0min,theta1max)),
                 costFunction(data,c(theta0max,theta1min)),
                 costFunction(data,c(theta0max,theta1max)))
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- costFunction(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() }
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Contour plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
}
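A usage sketch (the theta ranges are arbitrary illustrative choices):

# surface of the first alternative cost function over a fixed parameter range
plotCostSurface_pure(data_example, computeCost1, theta0Range=c(-10,10), theta1Range=c(-4,4))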

Feature Normalization

# normalize each feature column: subtract the column mean and divide by the column standard deviation
featureNormalization <- function(value_matrix){
  mu <- apply(value_matrix,c(2),mean)
  sigma <- apply(value_matrix,c(2),sd)
  for (i in 1:dim(value_matrix)[2]){
    if(sigma[i] !=0){value_matrix[,i] <- (value_matrix[,i]-mu[i])/sigma[i]}
    if(sigma[i] == 0){value_matrix[,i] <- value_matrix[,i]-mu[i]}
  }
  return(list(nv=value_matrix,mu=mu,sigma=sigma))
}
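A usage sketch on the made-up data_example from above:

# normalize the single feature column; mu and sigma can then be passed to hnormal
fn <- featureNormalization(as.matrix(data_example[,1,drop=FALSE]))
fn$nv      # normalized feature values
fn$mu      # column means (here: 2)
fn$sigma   # column standard deviations (here: 1)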

Normal Equation

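The function implements the closed-form least-squares solution \[\begin{eqnarray} \Theta = (X'\times X)^{-1}\times X'\times y \end{eqnarray}\] using the Moore-Penrose pseudoinverse ginv from the MASS package in place of the plain inverse, so it also works when \(X'\times X\) is not invertible.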
library(MASS)   # provides ginv(), the Moore-Penrose pseudoinverse
normalEquation <- function(X,y){
  theta <- ginv(t(X) %*% X) %*% t(X) %*% y
  return(theta)
}

Normal Equation (exact)

# as normalEquation, but with tol=0 so that no small singular values are truncated in ginv()
normalEquationExact <- function(X,y){
  theta <- ginv(t(X) %*% X,tol=0) %*% t(X) %*% y
  return(theta)
}
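A usage sketch with the made-up data_example from above (the design matrix must already contain the leading column of 1's):

# closed-form theta, comparable to the result of gradientDescent
normalEquation(ones(data_example[,1]), data_example[,2])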

Auxiliary Functions

######################################################
# auxiliary functions

# return the last n elements of a vector
last <- function(vec,n){
  return(vec[(length(vec)-n+1):length(vec)])
}

# prepend a column of 1's to a matrix (or to a single feature vector)
ones <- function(X){
  X <- as.matrix(X)
  return(cbind(rep(1,times=dim(X)[1]),X))
}