  • Functions for linear regression
    • conventions
    • hypothesis
    • cost function
    • gradient descent
    • plotting functions
  • Alternative cost functions
    • 1st alternative cost function
    • 2nd alternative cost function
  • Feature Normalization
  • Normal Equation
  • Normal Equation (exact)
  • Auxiliary Functions

Functions for linear regression

name: linearRegression_functions.R
author: Wiebke Petersen
date: 01 May, 2017
based_on: ex1.m
original_author: Andrew Ng
context: Machine Learning course on coursera.org

conventions

m : number of training examples
n : number of features (for exercise 1: n=1)
y : m-dimensional target vector
X : m×(n+1) matrix (note: by convention the first column is a vector consisting only of 1's)
Θ : (n+1)-dimensional parameter vector
$x^{(i)}$ : (n+1)-dimensional vector of the i-th training example (i-th row of X)
$x_j$ : m-dimensional vector (j-th column of X)
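
For orientation, a small made-up example of how these conventions look as R objects (arbitrary toy values):

# m=4 training examples, n=1 feature
data <- cbind(x1 = c(1, 2, 3, 4), y = c(3, 5, 7, 9))
m <- dim(data)[1]                         # number of training examples
X <- cbind(rep(1, times = m), data[, 1])  # m x (n+1) matrix, first column all 1's
y <- data[, 2]                            # m-dimensional target vector
theta <- c(0, 0)                          # (n+1)-dimensional parameter vector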

hypothesis

hypothesis as a function of a single value:
$$h(x) = \theta_0 + \theta_1 x$$

hypothesis as a function of a vector:
$$h(x) = \theta_0 x_0 + \theta_1 x_1$$
$$h(x) = \Theta^T \times x$$
$$h\left(\begin{pmatrix} x_0 = 1 \\ x_1 \end{pmatrix}\right) = \begin{pmatrix} \theta_0 & \theta_1 \end{pmatrix} \times \begin{pmatrix} x_0 = 1 \\ x_1 \end{pmatrix} = \theta_0 x_0 + \theta_1 x_1 = \theta_0 + \theta_1 x_1$$

hypothesis as a function of a matrix:
$$\begin{pmatrix} 1 & x_1 \\ 1 & x_2 \\ 1 & x_3 \end{pmatrix} \times \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} = \begin{pmatrix} \theta_0 + \theta_1 x_1 \\ \theta_0 + \theta_1 x_2 \\ \theta_0 + \theta_1 x_3 \end{pmatrix}$$
$$h(X) = X \times \Theta$$

# hypothesis (note: here, theta is a 2-dimensional vector and X is an m×2 matrix)
h <- function(theta,X){
  prediction <- X %*% theta
  return(prediction)
}

# hypothesis for normalized features:
# normalizes a raw input vector x with the stored means mu and standard
# deviations sigma, prepends the intercept term 1, and applies h
hnormal <- function(theta,x,mu,sigma){
  for (i in 1:length(x)){
    if (sigma[i] != 0) {x[i] <- (x[i]-mu[i])/sigma[i]}
    if (sigma[i] == 0) {x[i] <- x[i]-mu[i]}
  }
  x <- c(1,x)
  return(h(theta,x))
}
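
A hedged usage sketch: the numbers below are made up, and mu and sigma would normally come from featureNormalization (defined further down).

# hypothetical example: predict for a raw (unnormalized) input vector
theta <- c(340000, 110000, -6500)      # parameters fitted on normalized features
mu    <- c(2000, 3)                    # stored feature means
sigma <- c(800, 1.1)                   # stored feature standard deviations
hnormal(theta, c(1650, 3), mu, sigma)  # prediction for the raw input c(1650, 3)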

cost function

$$J\left(\begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix}\right) = \frac{1}{2m}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)^2 = \frac{1}{2m}\sum_{i=1}^{m}\left((\theta_0+\theta_1 x^{(i)})-y^{(i)}\right)^2 = \frac{1}{2m}\left(X\times\Theta-y\right)^T\times\left(X\times\Theta-y\right)$$

# cost function 
computeCost <-  function(data,theta){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- as.matrix(cbind(rep(1,times=m),data[,1:(n-1)]))  # prepend the column of 1's
  y <- data[,n]
  cost <- h(theta,X)-y
  cost <- t(cost) %*% cost /(2*m)  
  return(cost)
}
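
As a quick sanity check on made-up data: with theta = c(1,2) the hypothesis reproduces y = 1 + 2*x exactly, so the cost should be zero.

# toy data (not from the exercise)
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
computeCost(data, theta = c(1, 2))  # 0: perfect fit
computeCost(data, theta = c(0, 0))  # (9+25+49)/(2*3)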

gradient descent

adapting Θ in each iteration step:
In what follows, let $x^{(i)}_0 = 1$ for all $i$, thus $h_\Theta(x^{(i)}) = \theta_0 x^{(i)}_0 + \theta_1 x^{(i)}_1$.
$$\theta_0 := \theta_0 - \alpha\frac{\delta}{\delta\theta_0}J(\Theta) = \theta_0 - \alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_0 = \theta_0 - \alpha\frac{1}{m}\left(h_\Theta(X)-y\right)^T\times x_0 = \theta_0 - \alpha\frac{1}{m}\left(X\times\Theta-y\right)^T\times x_0$$
$$\theta_1 := \theta_1 - \alpha\frac{\delta}{\delta\theta_1}J(\Theta) = \theta_1 - \alpha\frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_1 = \theta_1 - \alpha\frac{1}{m}\left(h_\Theta(X)-y\right)^T\times x_1 = \theta_1 - \alpha\frac{1}{m}\left(X\times\Theta-y\right)^T\times x_1$$

Computing the partial derivatives:
$$J(\Theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0 + \theta_1 x^{(i)}_1 - y^{(i)}\right)^2$$

We first compute $\frac{\delta}{\delta\theta_0}J(\Theta)$:
$$\begin{aligned}\frac{\delta}{\delta\theta_0}J(\Theta) &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\theta_1 x^{(i)}_1-y^{(i)}\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)^2\\ &= \frac{\delta}{\delta\theta_0}\frac{1}{2m}\sum_{i=1}^{m}\left(\theta_0^2\left(x^{(i)}_0\right)^2+2\theta_0 x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)+\left(\theta_1 x^{(i)}_1-y^{(i)}\right)^2\right)\\ &= \frac{1}{2m}\sum_{i=1}^{m}\left(2\theta_0\left(x^{(i)}_0\right)^2+2x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(\theta_0\left(x^{(i)}_0\right)^2+x^{(i)}_0\left(\theta_1 x^{(i)}_1-y^{(i)}\right)\right)\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(\theta_0 x^{(i)}_0+\theta_1 x^{(i)}_1-y^{(i)}\right)x^{(i)}_0\\ &= \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_0\end{aligned}$$
For $\frac{\delta}{\delta\theta_1}J(\Theta)$ one gets by the same calculation:
$$\frac{\delta}{\delta\theta_1}J(\Theta) = \frac{\delta}{\delta\theta_1}\frac{1}{2m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)^2 = \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)x^{(i)}_1$$
The theta-vector can be updated in one go:
$$\Theta := \Theta - \frac{\alpha}{m}\left(\left(X\times\Theta-y\right)^T\times X\right)^T$$
# gradient descent algorithm
gradientDescent <- function(data,theta, alpha, iterations){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- ones(data[, 1:(n-1), drop=FALSE])  # features with a prepended column of 1's
  y <- data[,n]
  cost_vec <- computeCost(data,theta)
  theta_vec <- t(theta)
  for (i in 1:iterations){
    theta = theta - alpha/m * t(t(X%*%theta-y) %*% X) 
    # parameterized alternative:
    #temp <- theta
    #for (j in 1:length(theta)){
    #  derivative <- (t(h(theta,X)-y) %*% X[, j]) / m
    #  temp[j] <- theta[j] - alpha * derivative
    #}
    #theta <- temp
    cost <- computeCost(data,theta)
    cost_vec <- c(cost_vec,cost)
    theta_vec <- rbind(theta_vec,t(theta))
  }
  return(list(theta=theta,theta_vec=theta_vec,cost_vec=cost_vec))
}
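
A minimal sketch of running gradient descent on the toy data from above; the learning rate and iteration count are arbitrary example choices.

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
gd <- gradientDescent(data, theta = c(0, 0), alpha = 0.1, iterations = 1500)
gd$theta              # approaches c(1, 2), since y = 1 + 2*x
last(gd$cost_vec, 1)  # final cost, close to 0 (last is defined in the auxiliary functions)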

plotting functions

# Plot the linear fit
plotLinearFit <- function(data,theta){
  min <- min(data[,1])
  max <- max(data[,1])
  cost <- computeCost(data,theta)
  plot(data,xlim=c(min,max),
       col="red",
       xlab="x",ylab="y",
       main = "linear fit",
       sub= paste("theta0=", round(theta[1],digits=3),", theta1=", round(theta[2],digits=3), ", cost=",round(cost,digits=3))
  ) 
  abline(theta[1],theta[2],col="blue",lwd=2)
  abline(h=0,col="gray",lwd=1)
  abline(v=0,col="gray",lwd=1)
}
# plot cost development
plotCostDev <- function(grad_desc){
  cost_vec <- grad_desc$cost_vec
  plot(cost_vec,type="l",col="blue",cex=.2,
        xlab="iteration step",
       ylab='cost',
       main='cost development in gradient descent'  )  
}
# plot cost function and add cost development in gradient descent
plotCostSurface <- function(data,grad_desc){
  # get values
  theta_vec <- grad_desc$theta_vec
  cost_vec <- grad_desc$cost_vec
  theta <- grad_desc$theta
  # compute boundaries
  theta0 <- max(abs(grad_desc$theta_vec[,1]))
  theta0min <- theta[1]-theta0-10
  theta0max <- theta[1]+theta0+10
  theta1 <- max(abs(grad_desc$theta_vec[,2]))
  theta1min <- theta[2]-theta1-1
  theta1max <- theta[2]+theta1+1
  costmax <- max(computeCost(data,c(theta0min,theta1min)),
                 computeCost(data,c(theta0min,theta1max)),
                 computeCost(data,c(theta0max,theta1min)),
                 computeCost(data,c(theta0max,theta1max)))
  #costmax <- min(costmax,1000)
  # Grid over which we will calculate J
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)  
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- computeCost(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() } # close old plots
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Surface plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
  points3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",pch=4,lwd=2)
  lines3d(theta_vec[,1],theta_vec[,2],cost_vec,col="red",lwd=1)
  points3d(theta[1],theta[2],computeCost(data,theta),col="black",size=10)
}
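
A usage sketch tying the plotting helpers together (assumes the rgl package is installed; the data and gradient-descent call are the toy example from above):

library(rgl)  # needed by plotCostSurface
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
gd <- gradientDescent(data, theta = c(0, 0), alpha = 0.1, iterations = 1500)
plotLinearFit(data, gd$theta)  # data points plus fitted line
plotCostDev(gd)                # cost per iteration step
plotCostSurface(data, gd)      # cost surface with the descent path in red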

Alternative cost functions

1st alternative cost function

$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\left|h_\Theta(x^{(i)})-y^{(i)}\right| = \frac{1}{m}\sum_{i=1}^{m}\left|\theta_0+\theta_1 x^{(i)}-y^{(i)}\right|$$
# cost function 
computeCost1 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- sum(abs(h(theta,X)-y))/m
  return(cost)
}

2nd alternative cost function

$$J(\Theta) = \left|\sum_{i=1}^{m}\left(h_\Theta(x^{(i)})-y^{(i)}\right)\right| = \left|\sum_{i=1}^{m}\left(\theta_0+\theta_1 x^{(i)}-y^{(i)}\right)\right|$$
# cost function 
computeCost2 <-  function(data,theta){
  m <- dim(data)[1]
  X <- cbind(rep(1,times=m),data[,1])
  y <- data[,2]
  cost <- abs(sum(h(theta,X)-y))
  return(cost)
}
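
To see how the two alternatives differ, a quick check on the toy data from above: in computeCost2 residuals with opposite signs cancel inside the sum, so it can report zero cost for a visibly bad fit.

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
computeCost1(data, c(5, 0))  # mean absolute error: (2+0+2)/3
computeCost2(data, c(5, 0))  # |2+0-2| = 0, although the fit is bad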


# plot cost function and add cost development in gradient descent
plotCostSurface_pure <- function(data,costFunction,theta0Range,theta1Range){
  # Grid over which we will calculate J
  theta0min <- theta0Range[1]
  theta0max <- theta0Range[2]
  theta1min <- theta1Range[1]
  theta1max <- theta1Range[2]
  theta0_vals = seq(from=theta0min, to=theta0max, length.out=100)
  theta1_vals = seq(from=theta1min, to=theta1max, length.out=100)

  costmax <- max(costFunction(data,c(theta0min,theta1min)),
                 costFunction(data,c(theta0min,theta1max)),
                 costFunction(data,c(theta0max,theta1min)),
                 costFunction(data,c(theta0max,theta1max)))
  # initialize J_vals to a matrix of 0's
  J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])    
      J_vals[i,j] <- costFunction(data,t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() }
  open3d()
  color = rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9, 
          fog=TRUE,
          xlab='t0', ylab='t1',zlab="cost",main="Surface plot of cost function")
  levels <- exp(seq(from=1,to=10,length.out=30))
  levels <- levels*costmax /levels[length(levels)] 
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels=levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz,lwd=.5)
  }
}
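
A usage sketch (the theta ranges are arbitrary example values), plotting the surface of the first alternative cost function; as above, this needs the rgl package:

data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
plotCostSurface_pure(data, computeCost1,
                     theta0Range = c(-10, 10),
                     theta1Range = c(-4, 4))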

Feature Normalization

# normalizes each feature column to mean 0 (and, for nonconstant columns,
# standard deviation 1); returns the normalized values together with the
# column means and standard deviations
featureNormalization <- function(value_matrix){
  mu <- apply(value_matrix,2,mean)
  sigma <- apply(value_matrix,2,sd)
  for (i in 1:dim(value_matrix)[2]){
    if(sigma[i] != 0){value_matrix[,i] <- (value_matrix[,i]-mu[i])/sigma[i]}
    if(sigma[i] == 0){value_matrix[,i] <- value_matrix[,i]-mu[i]}
  }
  return(list(nv=value_matrix,mu=mu,sigma=sigma))
}
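
A minimal example with made-up feature values; each column ends up with mean 0 and, where the original spread is nonzero, standard deviation 1.

features <- cbind(size = c(1000, 1500, 2000), rooms = c(2, 3, 4))
fn <- featureNormalization(features)
fn$nv     # normalized feature matrix
fn$mu     # column means: 1500, 3
fn$sigma  # column standard deviations: 500, 1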

Normal Equation
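
The closed-form least-squares solution (a standard result, stated here for reference) is
$$\Theta = \left(X^T\times X\right)^{-1}\times X^T\times y$$
Instead of a plain matrix inverse, the code below uses ginv from the MASS package (the Moore-Penrose pseudoinverse), so it also returns a solution when $X^T\times X$ is singular.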

# solves the normal equation; ginv (the Moore-Penrose pseudoinverse)
# comes from the MASS package: library(MASS)
normalEquation <- function(X,y){
  theta <- ginv(t(X) %*% X) %*% t(X) %*% y
  return(theta)
}

Normal Equation (exact)

# as above, but tol=0 switches off ginv's tolerance for truncating
# small singular values, giving an exact (untruncated) pseudoinverse
normalEquationExact <- function(X,y){
  theta <- ginv(t(X) %*% X,tol=0) %*% t(X) %*% y
  return(theta)
}
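
As a consistency check on the toy data from above (made-up numbers), both variants reproduce exactly the parameters that gradient descent only approaches; they differ only for ill-conditioned X.

library(MASS)  # provides ginv
data <- cbind(x = c(1, 2, 3), y = c(3, 5, 7))
X <- ones(data[, 1, drop = FALSE])
y <- data[, 2]
normalEquation(X, y)       # c(1, 2): y = 1 + 2*x exactly
normalEquationExact(X, y)  # identical here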

Auxiliary Functions

######################################################
# auxiliary functions

# returns the last n elements of a vector
last <- function(vec,n){
  return(vec[(length(vec)-n+1):length(vec)])
}

# prepends a column of 1's (the intercept column) to X
ones <- function(X){
  return(as.matrix(cbind(rep(1,times=dim(X)[1]),X)))
}