2017-05-02 17:24:07

Linear Regression with multiple variables

Read and inspect data

# graphical parameter
par(lwd=2,pch=4)
# read data
data <- as.matrix(read.table('ex1data2.txt',
                  sep=",", 
                  encoding="UTF-8",
                  header=FALSE
))

m <- dim(data)[1]      # number of training examples
n <- dim(data)[2]      # number of columns (features + target)
X <- data[, 1:(n-1)]   # features; note the parentheses (1:n-1 evaluates to 0:(n-1))
y <- data[, n]         # target: price

# print some data (kable comes from the knitr package)
library(knitr)
data10 <- data[1:10,]
kable(data10, col.names=c("x1: size","x2: rooms","y: price"),
      caption="First 10 training examples before feature normalization")
First 10 training examples before feature normalization
x1: size x2: rooms y: price
2104 3 399900
1600 3 329900
2400 3 369000
1416 2 232000
3000 4 539900
1985 4 299900
1534 3 314900
1427 3 198999
1380 3 212000
1494 3 242500
# plot some data
plot(data[,1],data[,3],ylab="price",xlab="size",col="red")
plot(data[,2],data[,3],ylab="price",xlab="bedrooms",col="red")
plot(data[,2],data[,1],ylab="size",xlab="bedrooms",col="red")

Feature Normalization

fN <- featureNormalization(X)
XN <- fN$nv
X10 <- XN[1:10,]
y10 <- y[1:10]
data10 <- cbind(X10, y10)
kable(data10, col.names=c("x1: size","x2: rooms","y: price"),
      caption="First 10 training examples after feature normalization")
First 10 training examples after feature normalization
x1: size x2: rooms y: price
0.1300099 -0.2236752 399900
-0.5041898 -0.2236752 329900
0.5024764 -0.2236752 369000
-0.7357231 -1.5377669 232000
1.2574760 1.0904165 539900
-0.0197317 1.0904165 299900
-0.5872398 -0.2236752 314900
-0.7218814 -0.2236752 198999
-0.7810230 -0.2236752 212000
-0.6375731 -0.2236752 242500

Some experiments with feature normalization

t <- c(1:10)
k <- rep(2,times=10)
k[1] <- 2.1
r <- k
r[1] <- 2.001
s <- k
s[1] <- 17
K <- cbind(t,k,r,s)
kable(K,caption="Example data before feature normalization")
Example data before feature normalization
t k r s
1 2.1 2.001 17
2 2.0 2.000 2
3 2.0 2.000 2
4 2.0 2.000 2
5 2.0 2.000 2
6 2.0 2.000 2
7 2.0 2.000 2
8 2.0 2.000 2
9 2.0 2.000 2
10 2.0 2.000 2
K <- featureNormalization(K)$nv
kable(K,caption="Example data after feature normalization")
Example data after feature normalization
t k r s
-1.4863011 2.8460499 2.8460499 2.8460499
-1.1560120 -0.3162278 -0.3162278 -0.3162278
-0.8257228 -0.3162278 -0.3162278 -0.3162278
-0.4954337 -0.3162278 -0.3162278 -0.3162278
-0.1651446 -0.3162278 -0.3162278 -0.3162278
0.1651446 -0.3162278 -0.3162278 -0.3162278
0.4954337 -0.3162278 -0.3162278 -0.3162278
0.8257228 -0.3162278 -0.3162278 -0.3162278
1.1560120 -0.3162278 -0.3162278 -0.3162278
1.4863011 -0.3162278 -0.3162278 -0.3162278

Task: explain the normalized data.
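featureNormalization itself is not listed in this post. As a reference for the task, here is a minimal sketch of a column-wise z-score normalization that is consistent with how the function is used above (return fields nv, mu, sigma); the actual implementation may differ:

# sketch: z-score feature normalization, column by column
featureNormalization <- function(X) {
  X <- as.matrix(X)
  mu <- colMeans(X)              # column means
  sigma <- apply(X, 2, sd)       # column standard deviations
  nv <- sweep(sweep(X, 2, mu, "-"), 2, sigma, "/")   # (x - mu) / sigma
  list(nv = nv, mu = mu, sigma = sigma)
}

With this reading, the first entry of column k in the example table becomes \((2.1 - 2.01)/0.0316 \approx 2.846\), which matches the value shown above.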

Gradient descent with multiple variables

# prepare data
data_normal <- cbind(XN,y)
# gradient descent settings
theta_init <- as.matrix(c(0,0,0))
iterations <- 50
alpha <- 0.01

# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=0.01, iterations=50 and theta_init=0, 0, 0.
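gradientDescent and plotCostDev are helper functions that are not listed in this post. A minimal sketch of what they presumably do, consistent with how they are called here (data matrix with the target in the last column; a result list with theta and cost_vec), is the usual vectorized update \(\theta := \theta - \frac{\alpha}{m} X^T(X\theta - y)\) with cost \(J(\theta) = \frac{1}{2m}\lVert X\theta - y\rVert^2\):

# sketch: batch gradient descent for linear regression
# data: matrix whose last column is the target y, the remaining columns are features
gradientDescent <- function(data, theta_init, alpha, iterations) {
  m <- dim(data)[1]
  n <- dim(data)[2]
  X <- cbind(rep(1, m), data[, 1:(n-1)])    # prepend intercept column
  y <- data[, n]
  theta <- as.matrix(theta_init)
  cost_vec <- numeric(iterations)
  for (i in 1:iterations) {
    error <- X %*% theta - y                # residuals for the current theta
    cost_vec[i] <- sum(error^2) / (2 * m)   # squared-error cost J(theta)
    theta <- theta - (alpha / m) * (t(X) %*% error)
  }
  list(theta = theta, cost_vec = cost_vec)
}

# sketch: plot the cost over the iterations
plotCostDev <- function(gd) {
  plot(gd$cost_vec, type = "l", xlab = "iteration", ylab = "cost")
}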

The cost function has not converged yet \(\Rightarrow\) increase the learning rate.

alpha <- 0.1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=0.1, iterations=50 and theta_init=0, 0, 0.

The graph looks good, but check the scale. Let us look at the last 11 values of the cost function:

cost_vec <- grad_desc$cost_vec
c <- length(cost_vec)
cv <- cost_vec[(c-10):c]   # the last 11 cost values
kable(cv, caption="Last 11 cost values in iteration", col.names=c("cost"))
Last 11 cost values in iteration
cost
2099703830
2093608777
2088282084
2083610571
2079499804
2075870768
2072657148
2069803105
2067261467
2064992246
2062961418

The cost is still decreasing fast, by far more than \(10^{-3}\) per iteration step, so gradient descent has not converged yet.
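A quick way to check this criterion directly on the cost vector (a small sketch using cost_vec from the run above):

# per-iteration decrease of the cost
delta <- -diff(cost_vec)
tail(delta, 5)        # the most recent decreases are still in the millions
all(delta < 1e-3)     # FALSE: the convergence criterion is not met yet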

Try even higher learning rate:

alpha <- 0.3
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=0.3, iterations=50 and theta_init=0, 0, 0.

cost_vec <- grad_desc$cost_vec
c <- length(cost_vec)
cv <- cost_vec[(c-10):c]   # again the last 11 cost values
kable(cv, caption="Last 11 cost values in iteration", col.names=c("cost"))
Last 11 cost values in iteration
cost
2043303169
2043297581
2043293344
2043290131
2043287694
2043285847
2043284446
2043283383
2043282578
2043281967
2043281504

The cost is still decreasing by more than \(10^{-3}\) per iteration step, so gradient descent has not converged yet.

Try even higher learning rate:

alpha <- 1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=1, iterations=50 and theta_init=0, 0, 0.

cost_vec <- grad_desc$cost_vec
c <- length(cost_vec)
cv <- cost_vec[(c-10):c]   # the last 11 cost values
kable(cv, caption="Last 11 cost values in iteration", col.names=c("cost"))
Last 11 cost values in iteration
cost
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051
2043280051

We have found a learning rate for which the cost function converges. But let us also increase the number of iterations (50 is very low).

alpha <- 1
iterations <- 1000
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
# cost vector
cost_vec <- grad_desc$cost_vec

The plot shows the development of the cost function for alpha=1, iterations=1000 and theta_init=0, 0, 0.

Now the cost function is monotonically decreasing and the change in the cost function is less than \(10^{-3}\) per iteration step. Thus gradient descent results in theta = \(3.4041266 \times 10^{5}\), \(1.1063105 \times 10^{5}\), \(-6649.4742708\).
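Both claims can be verified directly on the cost vector of this run (sketch):

all(diff(cost_vec) <= 0)               # monotonically decreasing?
max(-diff(tail(cost_vec, 20))) < 1e-3  # last changes below 1e-3?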

Predict a price

Predict a price by gradient descent with feature normalization

theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
x <- c(1650,3)
price <- hnormal(theta,x,mu,sigma)

The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by gradient descent with feature normalization).
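The hypothesis functions hnormal (used above) and h (used further below for the un-normalized data) are not listed in this post. A minimal sketch consistent with how they are called (h(theta, x) with an explicit intercept term in x; hnormal(theta, x, mu, sigma) normalizing the raw features first) could be:

# sketch: linear hypothesis, x already contains the leading 1
h <- function(theta, x) {
  as.numeric(t(as.matrix(theta)) %*% as.matrix(x))
}

# sketch: normalize the raw features with mu/sigma, then apply h
hnormal <- function(theta, x, mu, sigma) {
  xn <- (x - mu) / sigma    # z-score normalization of the new example
  h(theta, c(1, xn))        # prepend the intercept term
}

The important point is that the same mu and sigma computed from the training data are reused for the new example.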

Predict a price by gradient descent without feature normalization

Let us check which prediction we would get with the same settings and no feature normalization:

grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=1, iterations=1000 and theta_init=0, 0, 0.

The cost function does not converge: without feature normalization the features live on very different and much larger scales (size is in the thousands, bedrooms in the single digits), which forces a far smaller learning rate. So we first have to decrease it:

alpha <- 0.01
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=0.01, iterations=1000 and theta_init=0, 0, 0.

We have to decrease alpha even more:

alpha <- 0.0001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=\(10^{-4}\), iterations=1000 and theta_init=0, 0, 0.

And even more:

alpha <- 0.000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=\(10^{-6}\), iterations=1000 and theta_init=0, 0, 0.

And even more:

alpha <- 0.00000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)

The plot shows the development of the cost function for alpha=\(10^{-8}\), iterations=1000 and theta_init=0, 0, 0.

With this very small learning rate the cost function seems to converge. Let us increase the number of iterations and check the last 11 cost values of the iteration:

iterations <- 10000
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
cost_vec <- grad_desc$cost_vec
c <- length(cost_vec)
cv <- cost_vec[(c-10):c]   # the last 11 cost values
kable(cv, caption=paste("Last 11 cost values, ", iterations, " iterations"), col.names=c("cost"))
Last 11 cost values, 10000 iterations
cost
2397824484
2397824481
2397824477
2397824473
2397824470
2397824466
2397824462
2397824459
2397824455
2397824451
2397824448
x <- c(1,1650,3)
theta <- grad_desc$theta
price <- h(theta,x)

Task: Explain why the cost function is still decreasing (although slowly).

The price predicted after \(10^{4}\) iteration steps without feature normalization for a 1650 sq-ft, 3 br house is 272,883.8.

Plotting regression result

Be careful: this plot only works for exactly 2 features.

# set the learning parameters found above
alpha <- 1
iterations <- 1000
theta_init <- c(0,0,0)
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)

xx <- seq(min(data[,1]),max(data[,1]),length.out=25)  # 1st feature = size
yy <- seq(min(data[,2]),max(data[,2]),length.out = 25)  # 2nd feature = br
zz <- matrix(0,length(xx),length(yy))   # price

theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma


for (i in 1:length(xx)){
  for (j in 1:length(yy)){
    zz[i,j] <- hnormal(theta,c(xx[i],yy[j]),mu,sigma)
  }
}

library(rgl)   # open3d, plot3d and persp3d come from the rgl package
open3d()
## wgl 
##   1
plot3d(data[,1],data[,2],data[,3], 
       xlab= "size (sq-feet)", ylab="bedroom num.", zlab="price",
       col="blue",type="s",size=1.5, main="Result of Gradient Descent")
persp3d(xx,yy,zz, col=heat.colors(100) ,alpha=.7, add=TRUE)

(Interactive 3D rgl plot: the training data as blue spheres and the fitted regression surface.)

Linear Regression by normal equation

Predict a price by normal equation without feature normalization

# read data 
data <-read.table('ex1data2.txt',
                  sep=",", 
                  encoding="UTF-8",
                  header=FALSE,
                  as.is=TRUE
)
m <- dim(data)[1]
n <- dim(data)[2]
X <- as.matrix(cbind(rep(1,times=m), data[, 1:(n-1)]))   # prepend intercept column
y <- data[,n]
theta <- normalEquationExact(X, y)
price <- h(theta,c(1,1650,3))

The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by normal equation without feature normalization).
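normalEquation and normalEquationExact are not listed either. Judging from the headings in the polynomial regression section below (default tolerance vs. tol=0 for ginv), both presumably solve the normal equation \(\theta = (X^TX)^{-1}X^Ty\) using the generalized inverse from the MASS package; a sketch:

library(MASS)   # ginv: Moore-Penrose generalized inverse

# sketch: normal equation with ginv's default tolerance
normalEquation <- function(X, y) {
  ginv(t(X) %*% X) %*% t(X) %*% y
}

# sketch: normal equation with tol = 0, i.e. no singular values are dropped
normalEquationExact <- function(X, y) {
  ginv(t(X) %*% X, tol = 0) %*% t(X) %*% y
}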

Predict a price by normal equation with feature normalization

X <- data[, 1:(n-1)]
y <- as.matrix(data[n])
fN <- featureNormalization(X)
mu <- fN$mu
sigma <- fN$sigma
XN <- as.matrix(ones(fN$nv))
theta <- normalEquationExact(XN, y)
# note: mu and sigma are passed in the opposite order compared to the
# earlier calls hnormal(theta, x, mu, sigma)
price <- hnormal(theta, c(1650,3), sigma, mu)

The predicted price of a 1650 sq-ft, 3 br house is 383,011.5 (predicted by normal equation with feature normalization).

Note the big difference to the other predicted prices. The most plausible cause is the swapped mu and sigma arguments in the hnormal call above: with the mean and the standard deviation interchanged, the new example is normalized incorrectly.
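A sketch of the prediction with the argument order of the earlier calls (hypothetical, not run in this post; it should bring the result back in line with the other two predictions):

# hypothetical re-run with mu and sigma in the order used earlier
price_consistent <- hnormal(theta, c(1650,3), mu, sigma)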

Polynomial regression

Size features

In the following we use these features derived from the size:
\(size, \sqrt{size}, size^2, size^3, size^4\)

Size features with normalized features and gradient descent

sizes <- data[,1]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
fN <- featureNormalization(sizes)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,1],y)
x <- as.matrix(seq(from=0,to=5000,length.out=5000),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- rep(1, times=5000)   # preallocate the prediction vector
for (i in 1:dim(X)[1]){
  ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="size",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")

Theta values: 340412.66, 55199.58, -25098.50, 123268.06, 47768.23, -101072.67

Size features with normal equation (default tolerance of R’s generalized inverse function ginv)

sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquation(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)

Theta values: 1.344396e-23, 4.582734e-20, 7.757718e-22, 1.691249e-16, 6.589354e-13, 2.661079e-09
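A quick way to see what happens (a sketch; sizes is the design matrix with intercept column built above): the columns range from 1 up to \(size^4\), so the singular values of \(X^TX\) span many orders of magnitude and ginv with its default relative tolerance presumably discards most of them, returning a near-zero minimum-norm solution.

# sketch: inspect the scaling of the normal equation matrix
XtX <- t(sizes) %*% sizes
svd(XtX)$d      # singular values spanning many orders of magnitude
kappa(XtX)      # (estimated) condition number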

Size features with normal equation (parameter tol=0 for R’s generalized inverse function ginv)

sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquationExact(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)

Theta values: 2.044467e+05, 3.003035e-01, -8.389948e+02, 3.892136e-02, 5.106840e-06, -1.870584e-09

Number of bedrooms features

In the following we use these features derived from the number of bedrooms:
\(bedrooms, \sqrt{bedrooms}, bedrooms^2, bedrooms^3, bedrooms^4\)

Number of bedrooms features with normalized features and gradient descent

bedrooms <- data[,2]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
fN <- featureNormalization(bedrooms)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,2],y)
x <- as.matrix(seq(from=0,to=10,length.out=500),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- rep(1, times=500)   # preallocate the prediction vector
for (i in 1:dim(X)[1]){
  ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")

Theta values: 340412.66, -39531.47, 178076.09, -226671.90, -102517.28, 264158.47

Number of bedrooms with normal equation

bedrooms <- data[,2]
y <- data[,3]
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
bedrooms <- ones(bedrooms)
theta <- normalEquation(bedrooms,y)
plotdata <- cbind(data[,2],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)

Theta values: 6338.473, 64925.975, 33509.299, 101086.504, -51273.058, 6677.330

Be careful not to overfit your data!