name: linearRegression_functions.R
author: Wiebke Petersen
date: 01 May, 2017
based_on: ex1.m
original_author: Andrew Ng
context: Machine Learning course on coursera.org
m : number of training examples
n : number of features (for exercise 1: n=1)
y : m-dimensional vector
X : \(m\times (1+n)\) matrix (note: by convention, the first column is a vector consisting only of 1's)
\(\Theta\) : \((n+1)\)-dimensional parameter vector
\(x^{(i)}\) : \((n+1)\)-dimensional vector of the i-th training example (i-th row of X)
\(x_j\) : \(m\)-dimensional vector of the j-th feature (j-th column of X)
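A minimal sketch of these conventions on made-up data (all values purely illustrative):

# toy data illustrating the conventions above (n = 1 feature, m = 3 examples)
m <- 3
x1 <- c(5.0, 6.2, 7.1)              # feature values
y <- c(1.5, 2.0, 2.4)               # m-dimensional target vector
X <- cbind(rep(1, times = m), x1)   # m x (1+n) matrix, first column all 1's
theta <- c(0.5, 0.25)               # (n+1)-dimensional parameter vector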
hypothesis as a function of a single value:
\(h(x) = \theta_0 + \theta_1 x\)
hypothesis as a function of a vector:
\(h(\vec{x}) = \theta_0 x_0 + \theta_1 x_1\)
\(h(\vec{x}) = \Theta' \times \vec{x}\)
\(\begin{eqnarray} h\left(\begin{pmatrix}x_0=1\\ x_1 \end{pmatrix}\right) &=& \begin{pmatrix}\theta_0\\ \theta_1 \end{pmatrix}' \times \begin{pmatrix}x_0=1\\ x_1 \end{pmatrix}\\ &=& \theta_0 x_0 + \theta_1 x_1 \\ &=& \theta_0 + \theta_1 x_1 \end{eqnarray}\)
hypothesis as a function of a matrix:
\(\begin{eqnarray} \begin{pmatrix} 1 & x_1^{(1)} \\ 1 & x_1^{(2)} \\ 1 & x_1^{(3)} \end{pmatrix} \times \begin{pmatrix} \theta_0 \\ \theta_1 \end{pmatrix} = \begin{pmatrix} \theta_0 + \theta_1 x_1^{(1)} \\ \theta_0 + \theta_1 x_1^{(2)} \\ \theta_0 + \theta_1 x_1^{(3)} \end{pmatrix} \end{eqnarray}\)
\(\begin{eqnarray} h(X) = X \times \Theta \end{eqnarray}\)
# hypothesis (note: here, theta is a 2-dimensional vector and X is an m x 2 matrix)
h <- function(theta, X){
  prediction <- X %*% theta   # vectorized h(X) = X * theta
  return(prediction)
}
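A quick usage sketch with made-up values:

theta_demo <- c(0.5, 0.25)
X_demo <- cbind(rep(1, times = 3), c(5.0, 6.2, 7.1))
h(theta_demo, X_demo)   # 3x1 matrix of predictions: 0.5 + 0.25 * x per example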
# hypothesis for normalized features:
# x is a raw feature vector (without the leading 1); mu and sigma are the
# per-feature means and standard deviations used during normalization
hnormal <- function(theta, x, mu, sigma){
  for (i in 1:length(x)){
    if (sigma[i] != 0) {x[i] <- (x[i] - mu[i]) / sigma[i]}
    if (sigma[i] == 0) {x[i] <- x[i] - mu[i]}   # constant feature: only center it
  }
  x <- c(1, x)   # prepend the intercept term
  return(h(theta, x))
}
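A usage sketch with made-up values; `featureNormalization` is defined further below and returns the `mu` and `sigma` expected here:

raw <- matrix(c(5.0, 6.2, 7.1), ncol = 1)   # one raw feature column
fn <- featureNormalization(raw)             # see definition below
# predict for a new raw input x = 6.0 (theta values are made up)
hnormal(c(2.0, 0.3), 6.0, fn$mu, fn$sigma)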
\[\begin{eqnarray} J \left(\begin{pmatrix}\theta_0\\ \theta_1\end{pmatrix} \right) &=& \frac{1}{2m}\sum_{i=1}^{m} (h(x^{(i)})-y^{(i)})^2 \\ &=& \frac{1}{2m}\sum_{i=1}^{m} ((\theta_0 + \theta_1 x_1^{(i)})-y^{(i)})^2 \\ &=& \frac{1}{2m}(X\times\Theta-y)'\times (X\times\Theta-y) \end{eqnarray}\]
# cost function (vectorized): J(theta) = (1/2m) (X*theta - y)' (X*theta - y)
computeCost <- function(data, theta){
  m <- dim(data)[1]
  n <- dim(data)[2]
  # note: 1:(n-1) must be parenthesized; 1:n-1 would parse as (1:n)-1
  X <- as.matrix(cbind(rep(1, times = m), data[, 1:(n-1)]))   # prepend column of 1's
  y <- data[, n]                                              # last column holds the targets
  error <- h(theta, X) - y
  cost <- t(error) %*% error / (2 * m)
  return(cost)
}
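A quick check on made-up data: with a perfect fit the cost is zero, otherwise it equals the vectorized formula above.

data_demo <- cbind(c(1, 2, 3), c(2, 4, 6))   # y = 2x exactly
computeCost(data_demo, c(0, 2))   # 0: perfect fit
computeCost(data_demo, c(0, 1))   # 14/6: residuals -1, -2, -3 squared and averaged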
Computing the partial derivatives: \(J(\Theta) = \frac{1}{2m} \sum_{i=1}^{m} (\theta_0 x_0^{(i)} + \theta_1 x_1^{(i)}-y^{(i)})^2\)
We first compute \(\frac{\partial}{\partial \theta_0} J(\Theta)\). By the chain rule, \(\frac{\partial}{\partial \theta_j} J(\Theta) = \frac{1}{m}\sum_{i=1}^{m} (h(x^{(i)})-y^{(i)})\,x_j^{(i)}\) for each \(j\), which in matrix form is \(\nabla J(\Theta) = \frac{1}{m} X' \times (X\times\Theta-y)\). Gradient descent repeatedly applies the simultaneous update \(\Theta \leftarrow \Theta - \alpha \nabla J(\Theta)\).
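A quick finite-difference check of this gradient on made-up data (epsilon and all values chosen for illustration):

data_demo <- cbind(c(1, 2, 3), c(2, 4, 6))
theta_demo <- c(0.5, 0.5)
X_demo <- cbind(rep(1, times = 3), data_demo[, 1])
y_demo <- data_demo[, 2]
analytic_grad <- t(X_demo) %*% (X_demo %*% theta_demo - y_demo) / 3   # (1/m) X'(X theta - y)
eps <- 1e-6
numeric_grad <- sapply(1:2, function(j){
  e <- rep(0, 2); e[j] <- eps
  (computeCost(data_demo, theta_demo + e) - computeCost(data_demo, theta_demo - e)) / (2 * eps)
})
# analytic_grad and numeric_grad agree up to numerical precision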
# gradient descent algorithm
gradientDescent <- function(data, theta, alpha, iterations){
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- ones(data[, 1:(n-1)])   # prepend column of 1's (see auxiliary functions)
  y <- data[, n]
  cost_vec <- computeCost(data, theta)
  theta_vec <- t(theta)
  for (i in 1:iterations){
    # vectorized simultaneous update: theta <- theta - (alpha/m) * X'(X theta - y)
    theta <- theta - alpha / m * t(t(X %*% theta - y) %*% X)
    # parameterized alternative (updates each theta_j explicitly):
    #temp <- theta
    #for (j in 1:length(theta)){
    #  derivative <- (t(h(theta, X) - y) %*% X[, j]) / m
    #  temp[j] <- theta[j] - alpha * derivative
    #}
    #theta <- temp
    cost <- computeCost(data, theta)
    cost_vec <- c(cost_vec, cost)
    theta_vec <- rbind(theta_vec, t(theta))
  }
  return(list(theta = theta, theta_vec = theta_vec, cost_vec = cost_vec))
}
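A usage sketch on made-up data (learning rate and iteration count are arbitrary choices):

data_demo <- cbind(c(1, 2, 3, 4), c(2.1, 3.9, 6.2, 7.8))   # roughly y = 2x
gd <- gradientDescent(data_demo, theta = c(0, 0), alpha = 0.05, iterations = 1500)
gd$theta                # final parameters, close to (0, 2)
last(gd$cost_vec, 1)    # final cost (last is defined under auxiliary functions)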
# Plot the linear fit
plotLinearFit <- function(data, theta){
  min <- min(data[, 1])
  max <- max(data[, 1])
  cost <- computeCost(data, theta)
  plot(data, xlim = c(min, max),
       col = "red",
       xlab = "x", ylab = "y",
       main = "linear fit",
       sub = paste("theta0=", round(theta[1], digits = 3), ", theta1=", round(theta[2], digits = 3), ", cost=", round(cost, digits = 3))
  )
  abline(theta[1], theta[2], col = "blue", lwd = 2)   # fitted line
  abline(h = 0, col = "gray", lwd = 1)                # axes through the origin
  abline(v = 0, col = "gray", lwd = 1)
}
# plot cost development
plotCostDev <- function(grad_desc){
  cost_vec <- grad_desc$cost_vec
  plot(cost_vec, type = "l", col = "blue", cex = .2,
       xlab = "iteration step",
       ylab = "cost",
       main = "cost development in gradient descent")
}
# plot cost function surface and add cost development in gradient descent
# (requires the rgl package: library(rgl))
plotCostSurface <- function(data, grad_desc){
  # get values
  theta_vec <- grad_desc$theta_vec
  cost_vec <- grad_desc$cost_vec
  theta <- grad_desc$theta
  # compute boundaries
  theta0 <- max(abs(grad_desc$theta_vec[, 1]))
  theta0min <- theta[1] - theta0 - 10
  theta0max <- theta[1] + theta0 + 10
  theta1 <- max(abs(grad_desc$theta_vec[, 2]))
  theta1min <- theta[2] - theta1 - 1
  theta1max <- theta[2] + theta1 + 1
  costmax <- max(computeCost(data, c(theta0min, theta1min)),
                 computeCost(data, c(theta0min, theta1max)),
                 computeCost(data, c(theta0max, theta1min)),
                 computeCost(data, c(theta0max, theta1max)))
  # optional cap: costmax <- min(costmax, 1000)
  # Grid over which we will calculate J
  theta0_vals <- seq(from = theta0min, to = theta0max, length.out = 100)
  theta1_vals <- seq(from = theta1min, to = theta1max, length.out = 100)
  # initialize J_vals to a matrix of 0's
  J_vals <- matrix(0, nrow = length(theta0_vals), ncol = length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])
      J_vals[i, j] <- computeCost(data, t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() }   # close old rgl plots
  open3d()
  color <- rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals, col = color, alpha = 0.9,
          fog = TRUE,
          xlab = "t0", ylab = "t1", zlab = "cost", main = "Surface plot of cost function")
  # contour lines on exponentially spaced levels, scaled to the corner maximum
  levels <- exp(seq(from = 1, to = 10, length.out = 30))
  levels <- levels * costmax / levels[length(levels)]
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels = levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz, lwd = .5)
  }
  # gradient descent path and final parameters
  points3d(theta_vec[, 1], theta_vec[, 2], cost_vec, col = "red", size = 5)
  lines3d(theta_vec[, 1], theta_vec[, 2], cost_vec, col = "red", lwd = 1)
  points3d(theta[1], theta[2], computeCost(data, theta), col = "black", size = 10)
}
# alternative cost function: mean absolute error
computeCost1 <- function(data, theta){
  m <- dim(data)[1]
  X <- cbind(rep(1, times = m), data[, 1])
  y <- data[, 2]
  cost <- sum(abs(h(theta, X) - y)) / m
  return(cost)
}
# alternative cost function: absolute value of the summed residuals
# (note: residuals of opposite sign cancel, so a poor fit can have zero cost)
computeCost2 <- function(data, theta){
  m <- dim(data)[1]
  X <- cbind(rep(1, times = m), data[, 1])
  y <- data[, 2]
  cost <- abs(sum(h(theta, X) - y))
  return(cost)
}
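The three cost variants weight residuals differently; on made-up data the difference is easy to see, in particular that computeCost2 can report zero cost for an imperfect fit because residuals of opposite sign cancel:

data_demo <- cbind(c(1, 2, 3), c(3, 4, 5))   # y = x + 2
computeCost(data_demo, c(0, 2))    # 1/3: squared residuals -1, 0, 1
computeCost1(data_demo, c(0, 2))   # 2/3: mean absolute residual
computeCost2(data_demo, c(0, 2))   # 0: the residuals cancel in the sum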
# plot an arbitrary cost function surface over a given parameter range
# (requires the rgl package: library(rgl))
plotCostSurface_pure <- function(data, costFunction, theta0Range, theta1Range){
  # Grid over which we will calculate J
  theta0min <- theta0Range[1]
  theta0max <- theta0Range[2]
  theta1min <- theta1Range[1]
  theta1max <- theta1Range[2]
  theta0_vals <- seq(from = theta0min, to = theta0max, length.out = 100)
  theta1_vals <- seq(from = theta1min, to = theta1max, length.out = 100)
  # use the passed-in costFunction (not computeCost) so the levels match the surface
  costmax <- max(costFunction(data, c(theta0min, theta1min)),
                 costFunction(data, c(theta0min, theta1max)),
                 costFunction(data, c(theta0max, theta1min)),
                 costFunction(data, c(theta0max, theta1max)))
  # initialize J_vals to a matrix of 0's
  J_vals <- matrix(0, nrow = length(theta0_vals), ncol = length(theta1_vals))
  # Fill out J_vals
  for (i in 1:length(theta0_vals)){
    for (j in 1:length(theta1_vals)){
      t <- c(theta0_vals[i], theta1_vals[j])
      J_vals[i, j] <- costFunction(data, t)
    }
  }
  # Surface plot
  while (rgl.cur() > 0) { rgl.close() }   # close old rgl plots
  open3d()
  color <- rev(rainbow(100, start = 0/8, end = 6/8))
  persp3d(theta0_vals, theta1_vals, J_vals, col = color, alpha = 0.9,
          fog = TRUE,
          xlab = "t0", ylab = "t1", zlab = "cost", main = "Surface plot of cost function")
  levels <- exp(seq(from = 1, to = 10, length.out = 30))
  levels <- levels * costmax / levels[length(levels)]
  lines <- contourLines(theta0_vals, theta1_vals, J_vals, levels = levels)
  for (i in seq_along(lines)) {
    contx <- lines[[i]]$x
    conty <- lines[[i]]$y
    contz <- rep(lines[[i]]$level, length(contx))
    lines3d(contx, conty, contz, lwd = .5)
  }
}
# normalize each feature column to mean 0 and standard deviation 1;
# returns the normalized values together with mu and sigma for later use
featureNormalization <- function(value_matrix){
  mu <- apply(value_matrix, 2, mean)
  sigma <- apply(value_matrix, 2, sd)
  for (i in 1:dim(value_matrix)[2]){
    if (sigma[i] != 0) {value_matrix[, i] <- (value_matrix[, i] - mu[i]) / sigma[i]}
    if (sigma[i] == 0) {value_matrix[, i] <- value_matrix[, i] - mu[i]}   # constant feature: only center
  }
  return(list(nv = value_matrix, mu = mu, sigma = sigma))
}
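A usage sketch on made-up data with features on very different scales:

raw <- cbind(c(1000, 2000, 3000), c(2, 3, 4))
fn <- featureNormalization(raw)
fn$nv      # each column now has mean 0 and sd 1
fn$mu      # column means (1000-scale vs. single digits)
fn$sigma   # column standard deviations, kept for use in hnormal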
# closed-form solution theta = (X'X)^{-1} X'y (requires MASS for ginv)
normalEquation <- function(X, y){
  theta <- ginv(t(X) %*% X) %*% t(X) %*% y
  return(theta)
}
# as above, but with tol=0 ginv does not zero out small singular values
normalEquationExact <- function(X, y){
  theta <- ginv(t(X) %*% X, tol = 0) %*% t(X) %*% y
  return(theta)
}
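A sanity check on made-up data: the closed-form solution should match the coefficients of R's built-in lm:

library(MASS)   # provides ginv
X_demo <- cbind(rep(1, times = 4), c(1, 2, 3, 4))
y_demo <- c(2.1, 3.9, 6.2, 7.8)
normalEquation(X_demo, y_demo)    # (X'X)^{-1} X'y
coef(lm(y_demo ~ X_demo[, 2]))    # same intercept and slope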
######################################################
# auxiliary functions

# last n elements of a vector
last <- function(vec, n){
  return(vec[(length(vec) - n + 1):length(vec)])
}
# prepend a column of 1's to X;
# NROW (unlike dim) also works when X is a single column dropped to a vector
ones <- function(X){
  return(as.matrix(cbind(rep(1, times = NROW(X)), X)))
}