2017-05-02 17:24:07
# graphical parameter
par(lwd=2,pch=4)
# read data
data <-as.matrix(read.table('ex1data2.txt',
sep=",",
encoding="UTF-8",
header=FALSE
))
m <- dim(data)[1]
n <- dim(data)[2]
X <- data[,1:(n-1)]
y <- data[,n]
# print some data
data10<-data[1:10,]
kable(data10,col.names=c("x1: size","x2: rooms","y: price"),caption="First 10 training examples before feature normalization")
| x1: size | x2: rooms | y: price |
|---|---|---|
| 2104 | 3 | 399900 |
| 1600 | 3 | 329900 |
| 2400 | 3 | 369000 |
| 1416 | 2 | 232000 |
| 3000 | 4 | 539900 |
| 1985 | 4 | 299900 |
| 1534 | 3 | 314900 |
| 1427 | 3 | 198999 |
| 1380 | 3 | 212000 |
| 1494 | 3 | 242500 |
# plot some data
plot(data[,1],data[,3],ylab="price",xlab="size",col="red")
plot(data[,2],data[,3],ylab="price",xlab="bedrooms",col="red")
plot(data[,2],data[,1],ylab="size",xlab="bedrooms",col="red")
fN <- featureNormalization(X)
XN <- fN$nv
X10 <- XN[1:10,]
y10 <- y[1:10]
data10<-cbind(X10,y10)
kable(data10,col.names=c("x1: size","x2: rooms","y: price"),caption="First 10 training examples after feature normalization")
| x1: size | x2: rooms | y: price |
|---|---|---|
| 0.1300099 | -0.2236752 | 399900 |
| -0.5041898 | -0.2236752 | 329900 |
| 0.5024764 | -0.2236752 | 369000 |
| -0.7357231 | -1.5377669 | 232000 |
| 1.2574760 | 1.0904165 | 539900 |
| -0.0197317 | 1.0904165 | 299900 |
| -0.5872398 | -0.2236752 | 314900 |
| -0.7218814 | -0.2236752 | 198999 |
| -0.7810230 | -0.2236752 | 212000 |
| -0.6375731 | -0.2236752 | 242500 |
t <- c(1:10)
k <- rep(2,times=10)
k[1] <- 2.1
r <- k
r[1] <- 2.001
s <- k
s[1] <- 17
K <- cbind(t,k,r,s)
kable(K,caption="Example data before feature normalization")
| t | k | r | s |
|---|---|---|---|
| 1 | 2.1 | 2.001 | 17 |
| 2 | 2.0 | 2.000 | 2 |
| 3 | 2.0 | 2.000 | 2 |
| 4 | 2.0 | 2.000 | 2 |
| 5 | 2.0 | 2.000 | 2 |
| 6 | 2.0 | 2.000 | 2 |
| 7 | 2.0 | 2.000 | 2 |
| 8 | 2.0 | 2.000 | 2 |
| 9 | 2.0 | 2.000 | 2 |
| 10 | 2.0 | 2.000 | 2 |
K <- featureNormalization(K)$nv
kable(K,caption="Example data after feature normalization")
| t | k | r | s |
|---|---|---|---|
| -1.4863011 | 2.8460499 | 2.8460499 | 2.8460499 |
| -1.1560120 | -0.3162278 | -0.3162278 | -0.3162278 |
| -0.8257228 | -0.3162278 | -0.3162278 | -0.3162278 |
| -0.4954337 | -0.3162278 | -0.3162278 | -0.3162278 |
| -0.1651446 | -0.3162278 | -0.3162278 | -0.3162278 |
| 0.1651446 | -0.3162278 | -0.3162278 | -0.3162278 |
| 0.4954337 | -0.3162278 | -0.3162278 | -0.3162278 |
| 0.8257228 | -0.3162278 | -0.3162278 | -0.3162278 |
| 1.1560120 | -0.3162278 | -0.3162278 | -0.3162278 |
| 1.4863011 | -0.3162278 | -0.3162278 | -0.3162278 |
Task: explain the normalized data.
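The normalization used throughout is the usual z-score standardization: each column is centered by its column mean and divided by its column standard deviation. The function featureNormalization itself is not shown in this section; a minimal sketch of its assumed behaviour (returning the normalized values nv together with mu and sigma, as used below) is:

```r
# Minimal sketch of column-wise z-score normalization (assumed behaviour of
# featureNormalization): nv = (x - mean(column)) / sd(column).
featureNormalizationSketch <- function(X) {
  X <- as.matrix(X)
  mu <- colMeans(X)                # per-column means
  sigma <- apply(X, 2, sd)         # per-column standard deviations
  nv <- sweep(sweep(X, 2, mu, "-"), 2, sigma, "/")
  list(nv = nv, mu = mu, sigma = sigma)
}
```

This also explains the toy example above: in each of the columns k, r and s only the first entry differs from the other nine, so after standardization all three columns look identical (2.846 for the outlier, -0.316 for the rest). The normalized value only tells us how many standard deviations an entry lies from its column mean, not how large the difference is in absolute terms.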
# prepare data
data_normal <- cbind(XN,y)
# Some gradient descent settings
theta_init = as.matrix(c(0,0,0))
iterations = 50
alpha = 0.01
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=0.01, iterations=50 and theta_init=0, 0, 0.
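The helper functions gradientDescent and plotCostDev are defined outside this section. For reference, here is a minimal sketch of the assumed behaviour: the last column of the data matrix is the target y, the remaining columns are the features, and the batch update \(\theta := \theta - \frac{\alpha}{m} X^T(X\theta - y)\) is applied in every iteration while the cost history is recorded.

```r
# Minimal sketch of batch gradient descent for linear regression (assumed
# behaviour of gradientDescent; the real implementation is not shown here).
gradientDescentSketch <- function(data, theta, alpha, iterations) {
  data <- as.matrix(data)
  m <- nrow(data)
  n <- ncol(data)
  X <- cbind(1, data[, 1:(n - 1)])          # features plus intercept column
  y <- data[, n]                            # target: price
  cost_vec <- numeric(iterations)
  for (i in 1:iterations) {
    error <- X %*% theta - y                # residuals for the current theta
    theta <- theta - (alpha / m) * (t(X) %*% error)
    cost_vec[i] <- sum(error^2) / (2 * m)   # squared-error cost J(theta)
  }
  list(theta = theta, cost_vec = cost_vec)
}

# plotCostDev presumably just plots the recorded cost history:
plotCostDevSketch <- function(gd) {
  plot(gd$cost_vec, type = "l", xlab = "iteration", ylab = "cost")
}
```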
alpha <- 0.1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=0.1, iterations=50 and theta_init=0, 0, 0.
The graph looks good, but check the scale. Let us look at the last 10 values of the cost function:
cost_vec <- grad_desc$cost_vec
c<-length(cost_vec)
cv <- cost_vec[(c-9):c]
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
| cost |
|---|
| 2093608777 |
| 2088282084 |
| 2083610571 |
| 2079499804 |
| 2075870768 |
| 2072657148 |
| 2069803105 |
| 2067261467 |
| 2064992246 |
| 2062961418 |
The cost is still decreasing by much more than \(10^{-3}\) per iteration step, so gradient descent has not converged yet.
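A quick way to check this criterion directly (a small helper, not part of the original code) is to look at the decrease between consecutive cost values:

```r
# Per-iteration decrease of the cost; convergence is declared once this
# drops below a small threshold such as 1e-3.
delta <- -diff(tail(cost_vec, 11))
delta
all(delta < 1e-3)
```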
Try even higher learning rate:
alpha <- 0.3
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=0.3, iterations=50 and theta_init=0, 0, 0.
cost_vec <- grad_desc$cost_vec
c<-length(cost_vec)
cv <- cost_vec[(c-9):c]
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
| cost |
|---|
| 2043297581 |
| 2043293344 |
| 2043290131 |
| 2043287694 |
| 2043285847 |
| 2043284446 |
| 2043283383 |
| 2043282578 |
| 2043281967 |
| 2043281504 |
The cost is still decreasing by more than \(10^{-3}\) per iteration step.
Try even higher learning rate:
alpha <- 1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=1, iterations=50 and theta_init=0, 0, 0.
cost_vec <- grad_desc$cost_vec
c<-length(cost_vec)
cv <- cost_vec[(c-9):c]
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
| cost |
|---|
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
| 2043280051 |
We have chosen our parameters such that the cost function converges. But let us also increase the number of iterations (50 is very low).
alpha <- 1
iterations <- 1000
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
# cost vector
cost_vec <- grad_desc$cost_vec
The plot shows the development of the cost function for
alpha=1, iterations=1000 and theta_init=0, 0, 0.
Now the cost function is monotonically decreasing and the change per iteration step is less than \(10^{-3}\). Gradient descent thus yields theta = 340412.66, 110631.05, -6649.47.
theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
x <- c(1650,3)
price <- hnormal(theta,x,mu,sigma)
The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by gradient descent with feature normalization).
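hnormal is not shown in this section; presumably it normalizes the raw feature values of the new example with the same mu and sigma that were computed on the training data, prepends the intercept term and then applies the hypothesis \(h_\theta(x) = \theta^T x\). A minimal sketch of that assumed behaviour:

```r
# Minimal sketch of the hypothesis with feature normalization (assumed
# behaviour of hnormal): scale the raw features with the training mu/sigma,
# prepend the intercept and take the inner product with theta.
hnormalSketch <- function(theta, x, mu, sigma) {
  xn <- (x - mu) / sigma
  as.numeric(c(1, xn) %*% theta)
}

# Assumed behaviour of the plain hypothesis h used later (without
# normalization): x already contains the leading 1 for the intercept.
hSketch <- function(theta, x) as.numeric(x %*% theta)

# Example: predicted price of a 1650 sq-ft, 3-bedroom house
hnormalSketch(theta, c(1650, 3), mu, sigma)
```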
Let us check which prediction we would get with the same settings and no feature normalization:
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for
alpha=1, iterations=1000 and theta_init=0, 0, 0.
OK, the cost function is not converging, so we first have to decrease the learning rate:
alpha <- 0.01
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for
alpha=0.01, iterations=1000 and theta_init=0, 0, 0.
We have to decrease alpha even more:
alpha <- 0.0001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=\(10^{-4}\), iterations=1000 and theta_init=0, 0, 0.
And even more:
alpha <- 0.000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=\(10^{-6}\), iterations=1000 and theta_init=0, 0, 0.
And even more:
alpha <- 0.00000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha=\(10^{-8}\), iterations=1000 and theta_init=0, 0, 0.
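Why do the un-normalized features force such a tiny learning rate? Gradient descent on the least-squares cost is only stable for \(\alpha < 2/\lambda_{max}\), where \(\lambda_{max}\) is the largest eigenvalue of \(X^TX/m\). With sizes of the order of \(10^3\), this eigenvalue is of the order \(10^6\) to \(10^7\), so \(\alpha\) has to be around \(10^{-7}\) or smaller. A quick back-of-the-envelope check (not part of the original exercise; it assumes X still holds the raw features read in at the beginning):

```r
# Rough stability bound for the learning rate on the un-normalized data:
# gradient descent on least squares diverges for alpha > 2 / lambda_max.
Xd <- cbind(1, X)                                 # design matrix with intercept
lambda_max <- max(eigen(t(Xd) %*% Xd / m)$values)
2 / lambda_max                                    # upper bound for a stable alpha
```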
With this very small learning rate the cost function seems to converge. Let us increase the number of iterations and check the last ten cost values:
iterations <- 10000
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
cost_vec <- grad_desc$cost_vec
c<-length(cost_vec)
cv <- cost_vec[(c-9):c]
kable(cv,caption=paste("Last 10 cost values, ",iterations," iterations"),col.names=c("cost"))
| cost |
|---|
| 2397824481 |
| 2397824477 |
| 2397824473 |
| 2397824470 |
| 2397824466 |
| 2397824462 |
| 2397824459 |
| 2397824455 |
| 2397824451 |
| 2397824448 |
x <- c(1,1650,3)
theta <- grad_desc$theta
price <- h(theta,x)
Task: Explain why the cost function is still decreasing (although slowly).
The price predicted after \(10^{4}\) iteration steps without feature normalization for a 1650 sq-ft, 3 br house is 272,883.8.
Be careful: the following 3D visualization only works for 2 features.
# set the good learning parameters
alpha=1
iterations = 1000
theta_init = c(0,0,0)
grad_desc <- gradientDescent(data_normal, theta_init, alpha,iterations)
xx <- seq(min(data[,1]),max(data[,1]),length.out=25) # 1st feature = size
yy <- seq(min(data[,2]),max(data[,2]),length.out = 25) # 2nd feature = br
zz <- matrix(0,length(xx),length(yy)) # price
theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
for (i in 1:length(xx)){
for (j in 1:length(yy)){
zz[i,j] <- hnormal(theta,c(xx[i],yy[j]),mu,sigma)
}
}
open3d()
plot3d(data[,1],data[,2],data[,3],
xlab= "size (sq-feet)", ylab="bedroom num.", zlab="price",
col="blue",type="s",size=1.5, main="Result of Gradient Descent")
persp3d(xx,yy,zz, col=heat.colors(100) ,alpha=.7, add=TRUE)
The interactive 3D plot ("Result of Gradient Descent") shows the training examples as blue spheres over size and number of bedrooms, together with the regression plane predicted by gradient descent.
# read data
data <-read.table('ex1data2.txt',
sep=",",
encoding="UTF-8",
header=FALSE,
as.is=TRUE
)
m <- dim(data)[1]
n <- dim(data)[2]
X <- as.matrix(cbind(rep(1,times=m),data[,1:(n-1)]))
y <- data[,n]
theta <- normalEquationExact(X, y)
price <- h(theta,c(1,1650,3))
The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by normal equation without feature normalization).
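normalEquationExact solves the normal equation \(\theta = (X^TX)^{-1}X^Ty\) in closed form instead of iterating; the ones helper used below presumably just prepends the intercept column, as done manually with cbind above. A minimal sketch of the assumed behaviour of both (the exact variant is described further below as using ginv with tol=0):

```r
library(MASS)   # for ginv, the Moore-Penrose generalized inverse

# Minimal sketch of the closed-form least-squares solution
# theta = (X^T X)^{-1} X^T y (assumed behaviour of normalEquationExact).
normalEquationSketch <- function(X, y) {
  ginv(t(X) %*% X, tol = 0) %*% t(X) %*% y
}

# Assumed behaviour of the ones() helper: prepend a column of ones.
onesSketch <- function(X) {
  cbind(rep(1, times = nrow(X)), X)
}
```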
X <- data[,1:(n-1)]
y <- as.matrix(data[n])
fN <- featureNormalization(X)
mu <- fN$mu
sigma <- fN$sigma
XN <- as.matrix(ones(fN$nv))
theta <- normalEquationExact(XN, y)
price <- hnormal(theta,c(1650,3),sigma,mu)
The predicted price of a 1650 sq-ft, 3 br house is 383,011.5 (predicted by normal equation with feature normalization).
Note the big difference to the other predicted prices. This is suspicious: the hnormal call above passes the arguments as `sigma, mu`, while all other hnormal calls in this document use the order `mu, sigma`, so the de-normalization may simply be applied with swapped parameters.
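To make the argument precise: writing the normalized features as \(x_j^{(N)} = (x_j-\mu_j)/\sigma_j\), the hypothesis fitted on normalized data is just a re-parametrization of an affine function of the raw features,

\[
h_{\theta^{(N)}}(x) = \theta_0^{(N)} + \sum_{j=1}^{2}\theta_j^{(N)}\,\frac{x_j-\mu_j}{\sigma_j}
= \Big(\theta_0^{(N)} - \sum_{j=1}^{2}\theta_j^{(N)}\frac{\mu_j}{\sigma_j}\Big) + \sum_{j=1}^{2}\frac{\theta_j^{(N)}}{\sigma_j}\,x_j ,
\]

so an exact solver must find the same affine fit in both parametrizations, and the prediction with feature normalization should again be the 293,081.5 obtained without it. The deviation above therefore points to an implementation detail (such as the argument order) rather than to a property of the normal equation.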
In the following we look at these features of the size:
\(size, \sqrt{size},size^2,size^3,size^4\)
sizes <- data[,1]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
fN <- featureNormalization(sizes)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,1],y)
x <- matrix(seq(from=0,to=5000,length.out=5000),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- numeric(5000)   # preallocate the prediction vector
for (i in 1:dim(X)[1]){
ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="size",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")
Theta values: 340412.66, 55199.58, -25098.50, 123268.06, 47768.23, -101072.67
sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquation(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 1.344396e-23, 4.582734e-20, 7.757718e-22, 1.691249e-16, 6.589354e-13, 2.661079e-09
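The near-zero coefficients are what one would expect if normalEquation uses ginv with its default tolerance: the columns \(1, size, \sqrt{size}, size^2, size^3, size^4\) differ in magnitude by many orders (size^4 is of the order \(10^{13}\)–\(10^{14}\)), so the matrix to be inverted is extremely ill-conditioned, the small singular values get truncated, and the returned minimum-norm solution is dominated by the size^4 term alone, drastically under-predicting the prices. A quick check of the conditioning (assuming `sizes` is the feature matrix with intercept built above):

```r
# Condition number of the polynomial feature matrix: the huge ratio between
# the largest and smallest singular value is why ginv's default tolerance
# discards most of the solution.
s <- svd(sizes)$d
max(s) / min(s)
```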
Now the same fit with the exact normal equation (tol=0 for R's generalized inverse function ginv):
sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquationExact(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 2.044467e+05, 3.003035e-01, -8.389948e+02, 3.892136e-02, 5.106840e-06, -1.870584e-09
In the following we look at these features of the number of bedrooms:
\(bedrooms, \sqrt{bedrooms},bedrooms^2,bedrooms^3,bedrooms^4\)
bedrooms <- data[,2]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
fN <- featureNormalization(bedrooms)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,2],y)
x <- matrix(seq(from=0,to=10,length.out=500),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- numeric(500)   # preallocate the prediction vector
for (i in 1:dim(X)[1]){
ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")
Theta values: 340412.66, -39531.47, 178076.09, -226671.90, -102517.28, 264158.47
bedrooms <- data[,2]
y <- data[,3]
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
bedrooms <- ones(bedrooms)
theta <- normalEquation(bedrooms,y)
plotdata <- cbind(data[,2],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 6338.473, 64925.975, 33509.299, 101086.504, -51273.058, 6677.330
Be careful not to overfit your data!