  • Functions for linear regression
    • conventions
    • hypothesis
    • cost function
    • gradient descent
    • plotting functions
  • Alternative cost functions
    • 1st alternative cost function
    • 2nd alternative cost function
  • Feature Normalization
  • Normal Equation
  • Normal Equation (exact)
  • Auxiliary Functions

Functions for linear regression

name: linearRegression_functions.R
author: Wiebke Petersen
date: 01 May, 2017
based_on: ex1.m
original_author: Andrew Ng
context: Machine Learning course on coursera.org

conventions

m : number of training examples
n : number of features (for exercise 1: n=1)
y : m-dimensional target vector
X : m×(n+1) matrix (note: by convention the first column is a vector consisting only of 1's)
Θ : (n+1)-dimensional parameter vector
$x^{(i)}$ : (n+1)-dimensional vector of the i-th training example (i-th row of X)
$x_j$ : m-dimensional vector (j-th column of X)
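
For orientation, a small made-up example of how these conventions look as R objects (arbitrary toy values):

# m=4 training examples, n=1 feature
data <- cbind(x1 = c(1, 2, 3, 4), y = c(3, 5, 7, 9))
m <- dim(data)[1]                         # number of training examples
X <- cbind(rep(1, times = m), data[, 1])  # m x (n+1) matrix, first column all 1's
y <- data[, 2]                            # m-dimensional target vector
theta <- c(0, 0)                          # (n+1)-dimensional parameter vector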

hypothesis

hypothesis as a function of a single value:
$$h(x) = \theta_0 + \theta_1 x$$

hypothesis as a function of a vector:
$$h(x) = \theta_0 x_0 + \theta_1 x_1$$
$$h(x) = \Theta^T \times x$$
$$h\left(\begin{pmatrix} x_0 = 1 \\ x_1 \end{pmatrix}\right) = \begin{pmatrix} \theta_0 & \theta_1 \end{pmatrix} \times \begin{pmatrix} x_0 = 1 \\ x_1 \end{pmatrix} = \theta_0 x_0 + \theta_1 x_1 = \theta_0 + \theta_1 x_1$$

hypothesis as a function of a matrix:
$$\begin{pmatrix} 1 & x_1 \\ 1 & x_2 \\ 1 & x_3 \end{pmatrix} \times \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} = \begin{pmatrix} \theta_0 + \theta_1 x_1 \\ \theta_0 + \theta_1 x_2 \\ \theta_0 + \theta_1 x_3 \end{pmatrix}$$
$$h(X) = X \times \Theta$$

# hypothesis (note: here, theta is a 2-dimensional vector and X is an m×2 matrix)
h <- function(theta,X){
  prediction <- X %*% theta
  return(prediction)
}

# hypothesis for normalized features:
# normalizes a raw input vector x with the stored means mu and standard
# deviations sigma, prepends the intercept term 1, and applies h
hnormal <- function(theta,x,mu,sigma){
  for (i in 1:length(x)){
    if (sigma[i] != 0) {x[i] <- (x[i]-mu[i])/sigma[i]}
    if (sigma[i] == 0) {x[i] <- x[i]-mu[i]}
  }
  x <- c(1,x)
  return(h(theta,x))
}
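
A hedged usage sketch: the numbers below are made up, and mu and sigma would normally come from featureNormalization (defined further down).

# hypothetical example: predict for a raw (unnormalized) input vector
theta <- c(340000, 110000, -6500)      # parameters fitted on normalized features
mu    <- c(2000, 3)                    # stored feature means
sigma <- c(800, 1.1)                   # stored feature standard deviations
hnormal(theta, c(1650, 3), mu, sigma)  # prediction for the raw input c(1650, 3)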

cost function

$$J\left(\begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix}\right) = \frac{1}{2m}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)^2 = \frac{1}{2m}\sum_{i=1}^{m}\left((\theta_0+\theta_1 x^{(i)})-y^{(i)}\right)^2 = \frac{1}{2m}\left(X\times\Theta-y\right)^T\times\left(X\times\Theta-y\right)$$

# cost function 
computeCost <-  function(data,theta){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- as.matrix(cbind(rep(1,times=m),data[,1:(n-1)]))  # prepend the column of 1's
  y <- data[,n]
  cost <- h(theta,X)-y
  cost <- t(cost) %*% cost /(2*m)  
  return(cost)
}
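
As a quick sanity check on made-up data: with theta = c(1,2) the hypothesis reproduces y = 1 + 2*x exactly, so the cost should be zero.

# toy data (not from the exercise)
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
computeCost(data, theta = c(1, 2))  # 0: perfect fit
computeCost(data, theta = c(0, 0))  # (9+25+49)/(2*3)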

gradient descent

adapting Θ in each iteration step:
In what follows, let $x^{(i)}_0 = 1$ for all $i$, thus $h_\Theta(x^{(i)}) = \theta_0 x^{(i)}_0 + \theta_1 x^{(i)}_1$.
$$\theta_0 := \theta_0 - \alpha\frac{\delta}{\delta\theta_0}J(\Theta) = \theta_0 - \alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_0 = \theta_0 - \alpha\frac{1}{m}\left(h_\Theta(X)-y\right)^T\times x_0 = \theta_0 - \alpha\frac{1}{m}\left(X\times\Theta-y\right)^T\times x_0$$
$$\theta_1 := \theta_1 - \alpha\frac{\delta}{\delta\theta_1}J(\Theta) = \theta_1 - \alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_1 = \theta_1 - \alpha\frac{1}{m}\left(h_\Theta(X)-y\right)^T\times x_1 = \theta_1 - \alpha\frac{1}{m}\left(X\times\Theta-y\right)^T\times x_1$$

Computing the partial derivatives:
$$J(\Theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0 + \theta_1 x^{(i)}_1 - y^{(i)}\right)^2$$

We first compute $\frac{\delta}{\delta\theta_0}J(\Theta)$:
$$\begin{aligned}\frac{\delta}{\delta\theta_0}J(\Theta) &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\theta_1 x^{(i)}_1-y^{(i)}\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0^2\left(x^{(i)}_0\right)^2+2\theta_0 x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)+\left(\theta_1 x^{(i)}_1-y^{(i)}\right)^2\right)\\ &= \frac{1}{2m}\sum_{i=1}^{m}\left(2\theta_0\left(x^{(i)}_0\right)^2+2x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(\theta_0\left(x^{(i)}_0\right)^2+x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\theta_1 x^{(i)}_1-y^{(i)}\right)x^{(i)}_0\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_0\end{aligned}$$
For $\frac{\delta}{\delta\theta_1}J(\Theta)$ one gets by the same calculation:
$$\frac{\delta}{\delta\theta_1}J(\Theta) = \frac{\delta}{\delta\theta_1}\frac{1}{2m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)^2 = \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_1$$
The theta-vector can be updated in one go:
$$\Theta := \Theta - \frac{\alpha}{m}\left(\left(X\times\Theta-y\right)^T\times X\right)^T$$
# gradient descent algorithm
gradientDescent <- function(data,theta, alpha, iterations){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- ones(data[, 1:(n-1), drop=FALSE])  # features with a prepended column of 1's
  y <- data[,n]
  cost_vec <- computeCost(data,theta)
  theta_vec <- t(theta)
  for (i in 1:iterations){
    theta = theta - alpha/m * t(t(X%*%theta-y) %*% X) 
    # parameterized alternative:
    #temp <- theta
    #for (j in 1:length(theta)){
    #  derivative <- (t(h(theta,X)-y) %*% X[, j]) / m
    #  temp[j] <- theta[j] - alpha * derivative
    #}
    #theta <- temp
    cost <- computeCost(data,theta)
    cost_vec <- c(cost_vec,cost)
    theta_vec <- rbind(theta_vec,t(theta))
  }
  return(list(theta=theta,theta_vec=theta_vec,cost_vec=cost_vec))
}
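
A minimal sketch of running gradient descent on the toy data from above; the learning rate and iteration count are arbitrary example choices.

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
gd <- gradientDescent(data, theta = c(0, 0), alpha = 0.1, iterations = 1500)
gd$theta              # approaches c(1, 2), since y = 1 + 2*x
last(gd$cost_vec, 1)  # final cost, close to 0 (last is defined in the auxiliary functions)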

plotting functions

# Plot the linear fit
plotLinearFit <- function(data,theta){
  min <- min(data[,1])
  max <- max(data[,1])
  cost <- computeCost(data,theta)
  plot(data,xlim=c(min,max),
       col="red",
       xlab="x",ylab="y",
       main = "linear fit",
       sub= paste("theta0=", round(theta[1],digits=3),", theta1=", round(theta[2],digits=3), ", cost=",round(cost,digits=3))
  ) 
  abline(theta[1],theta[2],col="blue",lwd=2)
  abline(h=0,col="gray",lwd=1)
  abline(v=0,col="gray",lwd=1)
}
# plot cost development
plotCostDev <- function(grad_desc){
  cost_vec <- grad_desc$cost_vec
  plot(cost_vec,type="l",col="blue",cex=.2,
        xlab="iteration step",
       ylab='cost',
       main='cost development in gradient descent'  )  
}
# plot cost function and add cost development in gradient descent
plotCostSurface <- function(data,grad_desc){
  # get values
  theta_vec <- grad_desc$theta_vec
  cost_vec <- grad_desc$cost_vec
  theta <- grad_desc$theta
  # compute boundaries
  theta0 <- max(abs(grad_desc$theta_vec[,1]))
  theta0min <- theta[1]-theta0-10
  theta0max <- theta[1]+theta0+10
  theta1 <- max(abs(grad_desc$theta_vec[,2]))
  theta1min <- theta[2]-theta1-1
  theta1max <- theta[2]+theta1+1
  costmax <- max(computeCost(data,c(theta0min,theta1min)),
                 computeCost(data,c(theta0min,theta1max)),
                 computeCost(data,c(theta0max,theta1min)),
                 computeCost(data,c(theta0max,theta1max)))
  #costmax <- min(costmax,1000)
  # Grid over which we will calculate J
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)  
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- computeCost(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() } # close old plots
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Surface plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
  points3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",pch=4,lwd=2)
  lines3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",lwd=1)
  points3d(theta[1],theta[2],computeCost(data,theta),col="black",size=10)
}
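
A usage sketch tying the plotting helpers together (assumes the rgl package is installed; the data and gradient-descent call are the toy example from above):

library(rgl)  # needed by plotCostSurface
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
gd <- gradientDescent(data, theta = c(0, 0), alpha = 0.1, iterations = 1500)
plotLinearFit(data, gd$theta)  # data points plus fitted line
plotCostDev(gd)                # cost per iteration step
plotCostSurface(data, gd)      # cost surface with the descent path in red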

Alternative cost functions

1st alternative cost function

$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\left|h_\Theta(x^{(i)})-y^{(i)}\right| = \frac{1}{m}\sum_{i=1}^{m}\left|\theta_0+\theta_1 x^{(i)}-y^{(i)}\right|$$
# cost function 
computeCost1 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- sum(abs(h(theta,X)-y))/m
  return(cost)
}

2nd alternative cost function

$$J(\Theta) = \left|\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)\right| = \left|\sum_{i=1}^{m}\left(\theta_0+\theta_1 x^{(i)}-y^{(i)}\right)\right|$$
# cost function 
computeCost2 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- abs(sum(h(theta,X)-y))
  return(cost)
}
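
To see how the two alternatives differ, a quick check on the toy data from above: in computeCost2 residuals with opposite signs cancel inside the sum, so it can report zero cost for a visibly bad fit.

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
computeCost1(data, c(5, 0))  # mean absolute error: (2+0+2)/3
computeCost2(data, c(5, 0))  # |2+0-2| = 0, although the fit is bad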


# plot cost function and add cost development in gradient descent
plotCostSurface_pure <- function(data,costFunction,theta0Range,theta1Range){
  # Grid over which we will calculate J
  theta0min <- theta0Range[1]
  theta0max <- theta0Range[2]
  theta1min <- theta1Range[1]
  theta1max <- theta1Range[2]
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)

  costmax <- max(costFunction(data,c(theta0min,theta1min)),
                 costFunction(data,c(theta0min,theta1max)),
                 costFunction(data,c(theta0max,theta1min)),
                 costFunction(data,c(theta0max,theta1max)))
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- costFunction(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() }
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Surface plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
}
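
A usage sketch (the theta ranges are arbitrary example values), plotting the surface of the first alternative cost function; as above, this needs the rgl package:

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
plotCostSurface_pure(data, computeCost1,
                     theta0Range = c(-10, 10),
                     theta1Range = c(-4, 4))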

Feature Normalization

# normalizes each feature column to mean 0 (and, for nonconstant columns,
# standard deviation 1); returns the normalized values together with the
# column means and standard deviations
featureNormalization <- function(value_matrix){
  mu <- apply(value_matrix,2,mean)
  sigma <- apply(value_matrix,2,sd)
  for (i in 1:dim(value_matrix)[2]){
    if(sigma[i] != 0){value_matrix[,i] <- (value_matrix[,i]-mu[i])/sigma[i]}
    if(sigma[i] == 0){value_matrix[,i] <- value_matrix[,i]-mu[i]}
  }
  return(list(nv=value_matrix,mu=mu,sigma=sigma))
}
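
A minimal example with made-up feature values; each column ends up with mean 0 and, where the original spread is nonzero, standard deviation 1.

features <- cbind(size = c(1000, 1500, 2000), rooms = c(2, 3, 4))
fn <- featureNormalization(features)
fn$nv     # normalized feature matrix
fn$mu     # column means: 1500, 3
fn$sigma  # column standard deviations: 500, 1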

Normal Equation
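
The closed-form least-squares solution (a standard result, stated here for reference) is
$$\Theta = \left(X^T\times X\right)^{-1}\times X^T\times y$$
Instead of a plain matrix inverse, the code below uses ginv from the MASS package (the Moore-Penrose pseudoinverse), so it also returns a solution when $X^T\times X$ is singular.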

# solves the normal equation; ginv (the Moore-Penrose pseudoinverse)
# comes from the MASS package: library(MASS)
normalEquation <- function(X,y){
  theta <- ginv(t(X) %*% X) %*% t(X) %*% y
  return(theta)
}

Normal Equation (exact)

# as above, but tol=0 switches off ginv's tolerance for truncating
# small singular values, giving an exact (untruncated) pseudoinverse
normalEquationExact <- function(X,y){
  theta <- ginv(t(X) %*% X,tol=0) %*% t(X) %*% y
  return(theta)
}
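
As a consistency check on the toy data from above (made-up numbers), both variants reproduce exactly the parameters that gradient descent only approaches; they differ only for ill-conditioned X.

library(MASS)  # provides ginv
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
X <- ones(data[, 1, drop = FALSE])
y <- data[, 2]
normalEquation(X, y)       # c(1, 2): y = 1 + 2*x exactly
normalEquationExact(X, y)  # identical here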

Auxiliary Functions

######################################################
# auxiliary functions

# returns the last n elements of a vector
last <- function(vec,n){
  return(vec[(length(vec)-n+1):length(vec)])
}

# prepends a column of 1's (the intercept column) to X
ones <- function(X){
  return(as.matrix(cbind(rep(1,times=dim(X)[1]),X)))
}