2017-05-02 17:24:07
# load knitr for the kable tables
library(knitr)
# graphical parameters
par(lwd=2,pch=4)
# read data
data <-as.matrix(read.table('ex1data2.txt',
sep=",",
encoding="UTF-8",
header=FALSE
))
m <- dim(data)[1]
n <- dim(data)[2]
X <- data[,1:(n-1)]
y <- data[,n]
# print some data
data10<-data[1:10,]
kable(data10,col.names=c("x1: size","x2: rooms","y: price"),caption="First 10 training examples before feature normalization")
x1: size | x2: rooms | y: price |
---|---|---|
2104 | 3 | 399900 |
1600 | 3 | 329900 |
2400 | 3 | 369000 |
1416 | 2 | 232000 |
3000 | 4 | 539900 |
1985 | 4 | 299900 |
1534 | 3 | 314900 |
1427 | 3 | 198999 |
1380 | 3 | 212000 |
1494 | 3 | 242500 |
# plot some data
plot(data[,1],data[,3],ylab="price",xlab="size",col="red")
plot(data[,2],data[,3],ylab="price",xlab="bedrooms",col="red")
plot(data[,2],data[,1],ylab="size",xlab="bedrooms",col="red")
fN <- featureNormalization(X)
XN <- fN$nv
X10 <- XN[1:10,]
y10 <- y[1:10]
data10<-cbind(X10,y10)
kable(data10,col.names=c("x1: size","x2: rooms","y: price"),caption="First 10 training examples after feature normalization")
x1: size | x2: rooms | y: price |
---|---|---|
0.1300099 | -0.2236752 | 399900 |
-0.5041898 | -0.2236752 | 329900 |
0.5024764 | -0.2236752 | 369000 |
-0.7357231 | -1.5377669 | 232000 |
1.2574760 | 1.0904165 | 539900 |
-0.0197317 | 1.0904165 | 299900 |
-0.5872398 | -0.2236752 | 314900 |
-0.7218814 | -0.2236752 | 198999 |
-0.7810230 | -0.2236752 | 212000 |
-0.6375731 | -0.2236752 | 242500 |
t <- c(1:10)
k <- rep(2,times=10)
k[1] <- 2.1
r <- k
r[1] <- 2.001
s <- k
s[1] <- 17
K <- cbind(t,k,r,s)
kable(K,caption="Example data before feature normalization")
t | k | r | s |
---|---|---|---|
1 | 2.1 | 2.001 | 17 |
2 | 2.0 | 2.000 | 2 |
3 | 2.0 | 2.000 | 2 |
4 | 2.0 | 2.000 | 2 |
5 | 2.0 | 2.000 | 2 |
6 | 2.0 | 2.000 | 2 |
7 | 2.0 | 2.000 | 2 |
8 | 2.0 | 2.000 | 2 |
9 | 2.0 | 2.000 | 2 |
10 | 2.0 | 2.000 | 2 |
K <- featureNormalization(K)$nv
kable(K,caption="Example data after feature normalization")
t | k | r | s |
---|---|---|---|
-1.4863011 | 2.8460499 | 2.8460499 | 2.8460499 |
-1.1560120 | -0.3162278 | -0.3162278 | -0.3162278 |
-0.8257228 | -0.3162278 | -0.3162278 | -0.3162278 |
-0.4954337 | -0.3162278 | -0.3162278 | -0.3162278 |
-0.1651446 | -0.3162278 | -0.3162278 | -0.3162278 |
0.1651446 | -0.3162278 | -0.3162278 | -0.3162278 |
0.4954337 | -0.3162278 | -0.3162278 | -0.3162278 |
0.8257228 | -0.3162278 | -0.3162278 | -0.3162278 |
1.1560120 | -0.3162278 | -0.3162278 | -0.3162278 |
1.4863011 | -0.3162278 | -0.3162278 | -0.3162278 |
Task: explain the normalized data.
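The helper featureNormalization is not defined in this post. As a hedged sketch, it is assumed to perform z-score normalization per column (subtract the column mean, divide by the sample standard deviation) and to return the normalized values nv together with mu and sigma:
# Assumed sketch of featureNormalization: z-score normalization per column.
featureNormalization <- function(X) {
  X <- as.matrix(X)
  mu <- colMeans(X)                                 # column means
  sigma <- apply(X, 2, sd)                          # column standard deviations (sample sd)
  nv <- sweep(sweep(X, 2, mu, "-"), 2, sigma, "/")  # (x - mu) / sigma
  list(nv = nv, mu = mu, sigma = sigma)
}
This is consistent with the tables above, e.g. (1 - mean(1:10)) / sd(1:10) ≈ -1.4863 for the first entry of the t column.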
# prepare data
data_normal <- cbind(XN,y)
# Some gradient descent settings
theta_init = as.matrix(c(0,0,0))
iterations = 50
alpha = 0.01
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 0.01, iterations = 50 and theta_init = (0, 0, 0).
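The functions gradientDescent and plotCostDev are likewise not shown here. A minimal sketch, assuming that the last column of data is y, that an intercept column is added internally and that the result carries theta and the per-iteration cost vector cost_vec:
# Assumed sketch of batch gradient descent and the cost plot helper.
gradientDescent <- function(data, theta_init, alpha, iterations) {
  m <- nrow(data)
  n <- ncol(data)
  X <- cbind(rep(1, m), data[, 1:(n - 1)])  # prepend the intercept column
  y <- data[, n]
  theta <- as.matrix(theta_init)
  cost_vec <- numeric(iterations)
  for (i in 1:iterations) {
    error <- X %*% theta - y
    theta <- theta - (alpha / m) * (t(X) %*% error)    # simultaneous update
    cost_vec[i] <- sum((X %*% theta - y)^2) / (2 * m)  # cost J(theta)
  }
  list(theta = theta, cost_vec = cost_vec)
}
plotCostDev <- function(gd) plot(gd$cost_vec, type = "l", xlab = "iteration", ylab = "cost")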
alpha <- 0.1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 0.1, iterations = 50 and theta_init = (0, 0, 0).
The graph looks good, but check the scale. Let us look at the last 10 values of the cost function:
cost_vec <- grad_desc$cost_vec
cv <- tail(cost_vec, 10)   # last 10 cost values
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
cost |
---|
2093608777 |
2088282084 |
2083610571 |
2079499804 |
2075870768 |
2072657148 |
2069803105 |
2067261467 |
2064992246 |
2062961418 |
The cost is still decreasing pretty fast, by more than 10^{-3} per iteration step.
Try an even higher learning rate:
alpha <- 0.3
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 0.3, iterations = 50 and theta_init = (0, 0, 0).
cost_vec <- grad_desc$cost_vec
cv <- tail(cost_vec, 10)   # last 10 cost values
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
cost |
---|
2043297581 |
2043293344 |
2043290131 |
2043287694 |
2043285847 |
2043284446 |
2043283383 |
2043282578 |
2043281967 |
2043281504 |
The cost is still decreasing by more than 10^{-3} per iteration step.
Try an even higher learning rate:
alpha <- 1
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 1, iterations = 50 and theta_init = (0, 0, 0).
cost_vec <- grad_desc$cost_vec
cv <- tail(cost_vec, 10)   # last 10 cost values
kable(cv,caption="Last 10 cost values in iteration",col.names=c("cost"))
cost |
---|
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
2043280051 |
We have chosen our parameters such that the cost function converges. But let us also increase the number of iterations (50 is very low).
alpha <- 1
iterations <- 1000
# run gradient descent
grad_desc <- gradientDescent(data_normal, theta_init, alpha, iterations)
# plot cost development
plotCostDev(grad_desc)
# cost vector
cost_vec <- grad_desc$cost_vec
The plot shows the development of the cost function for alpha = 1, iterations = 1000 and theta_init = (0, 0, 0).
Now the cost function is monotonically decreasing and the change in the cost function is less than 10^{-3} per iteration step. Thus gradient descent results in theta = (3.4041266 × 10^{5}, 1.1063105 × 10^{5}, -6649.4742708).
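As a quick extra check of this stopping criterion (not part of the original exercise), the change of the cost between consecutive iterations can be inspected directly:
# absolute change of the cost per iteration step
cost_diff <- abs(diff(cost_vec))
tail(cost_diff, 1) < 1e-3   # TRUE once the change has dropped below 10^-3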
theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
x <- c(1650,3)
price <- hnormal(theta,x,mu,sigma)
The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by gradient descent with feature normalization).
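The prediction uses the helpers h and hnormal, which are also not defined here. A plausible sketch is that h evaluates the linear hypothesis on a feature vector that already contains the intercept term, while hnormal first scales a raw feature vector with the stored mu and sigma:
# Assumed sketch of the hypothesis helpers.
h <- function(theta, x) sum(theta * x)   # theta' * x, where x includes the leading 1
hnormal <- function(theta, x, mu, sigma) {
  xn <- (x - mu) / sigma                 # apply the stored feature scaling
  h(theta, c(1, xn))                     # prepend the intercept term
}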
Let us check which prediction we would get with the same settings and no feature normalization:
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 1, iterations = 1000 and theta_init = (0, 0, 0).
OK, the cost function is not converging, so we first have to decrease the learning rate:
alpha <- 0.01
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 0.01, iterations = 1000 and theta_init = (0, 0, 0).
We have to decrease alpha even more:
alpha <- 0.0001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 10^{-4}, iterations = 1000 and theta_init = (0, 0, 0).
And even more:
alpha <- 0.000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 10^{-6}, iterations = 1000 and theta_init = (0, 0, 0).
And even more:
alpha <- 0.00000001
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotCostDev(grad_desc)
The plot shows the development of the cost function for alpha = 10^{-8}, iterations = 1000 and theta_init = (0, 0, 0).
With this ultra-small learning rate the cost function seems to converge. Let's increase the number of iterations and check the last ten cost values of the iteration:
iterations <- 10000
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
cost_vec <- grad_desc$cost_vec
cv <- tail(cost_vec, 10)   # last 10 cost values
kable(cv,caption=paste("Last 10 cost values, ",iterations," iterations"),col.names=c("cost"))
cost |
---|
2397824481 |
2397824477 |
2397824473 |
2397824470 |
2397824466 |
2397824462 |
2397824459 |
2397824455 |
2397824451 |
2397824448 |
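The decrease per step is tiny compared to the cost itself; as a small extra check (not in the original exercise), the relative change over the last step can be computed from the cost vector above:
# relative change of the cost over the last iteration step
rel_change <- abs(diff(tail(cost_vec, 2))) / tail(cost_vec, 1)
rel_change   # on the order of 10^-9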
x <- c(1,1650,3)
theta <- grad_desc$theta
price <- h(theta,x)
Task: Explain why the cost function is still decreasing (although slowly).
The predicted price of a 1650 sq-ft, 3 br house after 10^{4} iteration steps without feature normalization is 272,883.8.
Be careful: the following 3D visualization only works for 2 features.
# set the good learning parameters
alpha=1
iterations = 1000
theta_init = c(0,0,0)
grad_desc <- gradientDescent(data_normal, theta_init, alpha,iterations)
xx <- seq(min(data[,1]),max(data[,1]),length.out=25) # 1st feature = size
yy <- seq(min(data[,2]),max(data[,2]),length.out = 25) # 2nd feature = br
zz <- matrix(0,length(xx),length(yy)) # price
theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
for (i in 1:length(xx)){
for (j in 1:length(yy)){
zz[i,j] <- hnormal(theta,c(xx[i],yy[j]),mu,sigma)
}
}
library(rgl)   # for open3d, plot3d and persp3d
open3d()
plot3d(data[,1],data[,2],data[,3],
xlab= "size (sq-feet)", ylab="bedroom num.", zlab="price",
col="blue",type="s",size=1.5, main="Result of Gradient Descent")
persp3d(xx,yy,zz, col=heat.colors(100) ,alpha=.7, add=TRUE)
# read data
data <-read.table('ex1data2.txt',
sep=",",
encoding="UTF-8",
header=FALSE,
as.is=TRUE
)
m <- dim(data)[1]
n <- dim(data)[2]
X <- as.matrix(cbind(rep(1,times=m),data[,1:(n-1)]))
y <- data[,n]
theta <- normalEquationExact(X, y)
price <- h(theta,c(1,1650,3))
The predicted price of a 1650 sq-ft, 3 br house is 293,081.5 (predicted by normal equation without feature normalization).
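normalEquationExact is assumed to compute the closed-form solution theta = (X'X)^{-1} X'y. A minimal sketch using the pseudoinverse (the post later mentions that tol=0 is passed to R's ginv):
library(MASS)   # for ginv
# Assumed sketch of the exact normal equation helper.
normalEquationExact <- function(X, y) {
  ginv(t(X) %*% X, tol = 0) %*% (t(X) %*% y)   # theta = (X'X)^+ X'y
}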
X <- data[,1:(n-1)]
y <- as.matrix(data[,n])
fN <- featureNormalization(X)
mu <- fN$mu
sigma <- fN$sigma
XN <- as.matrix(ones(fN$nv))
theta <- normalEquationExact(XN, y)
price <- hnormal(theta,c(1650,3),sigma,mu)
The predicted price of a 1650 sq-ft, 3 br house is 383,011.5 (predicted by normal equation with feature normalization).
Note the big difference to the other predicted prices. (Also note that the hnormal call above passes sigma and mu in the opposite order to the earlier calls, which may well explain the discrepancy.)
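The helper ones is assumed to simply prepend the intercept column of ones to a feature matrix:
# Assumed sketch of the ones helper.
ones <- function(X) cbind(rep(1, nrow(X)), X)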
In the following we look at these derived features of the house size:
size, √size, size^2, size^3, size^4
sizes <- data[,1]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
fN <- featureNormalization(sizes)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,1],y)
x <- matrix(seq(from=0,to=5000,length.out=5000),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- rep(0,times=5000)
for (i in 1:dim(X)[1]){
ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="size",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")
Theta values: 340412.66, 55199.58, -25098.50, 123268.06, 47768.23, -101072.67
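For illustration (an extra example, not in the original exercise), the fitted polynomial model can be evaluated for a single size in the same way as above:
size <- 1650
x <- c(size, sqrt(size), size^2, size^3, size^4)   # the same derived features
hnormal(theta, x, mu, sigma)                       # predicted price for this size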
sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquation(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 1.344396e-23, 4.582734e-20, 7.757718e-22, 1.691249e-16, 6.589354e-13, 2.661079e-09
These theta values look suspicious: the intercept and the low-order coefficients are essentially zero. Let us repeat the fit with normalEquationExact, which uses tol=0 for R's inverse function ginv:
sizes <- data[,1]
y <- data[,3]
sizes <- cbind(sizes,sqrt(sizes),sizes^2,sizes^3,sizes^4)
sizes <- ones(sizes)
theta <- normalEquationExact(sizes,y)
plotdata <- cbind(data[,1],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="sizes",ylab="price",xlim=c(0,5000),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 2.044467e+05, 3.003035e-01, -8.389948e+02, 3.892136e-02, 5.106840e-06, -1.870584e-09
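A hedged guess at why the two helpers behave so differently on this badly scaled design matrix (size^4 reaches the order of 10^{14}): normalEquation presumably calls ginv with its default tolerance, which drops the small singular values of X'X and produces the near-zero theta values shown earlier, while normalEquationExact keeps them by passing tol = 0.
library(MASS)   # for ginv
# Assumed counterpart of normalEquationExact with ginv's default tolerance.
normalEquation <- function(X, y) {
  ginv(t(X) %*% X) %*% (t(X) %*% y)
}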
In the following we look at these derived features of the number of bedrooms:
bedrooms, √bedrooms, bedrooms^2, bedrooms^3, bedrooms^4
bedrooms <- data[,2]
y <- data[,3]
alpha <- 0.4
iterations <- 1500
theta_init <- c(0,0,0,0,0,0)
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
fN <- featureNormalization(bedrooms)
mu <- fN$mu
sigma <- fN$sigma
grad_desc <- gradientDescent(cbind(fN$nv,y),theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotdata <- cbind(data[,2],y)
x <- matrix(seq(from=0,to=10,length.out=500),ncol=1)
X <- cbind(x,sqrt(x),x^2,x^3,x^4)
ypred <- rep(0,times=500)
for (i in 1:dim(X)[1]){
ypred[i] <- hnormal(theta,X[i,],mu,sigma)
}
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
points(x,ypred,type="l",col="blue")
Theta values: 340412.66, -39531.47, 178076.09, -226671.90, -102517.28, 264158.47
bedrooms <- data[,2]
y <- data[,3]
bedrooms <- cbind(bedrooms,sqrt(bedrooms),bedrooms^2,bedrooms^3,bedrooms^4)
bedrooms <- ones(bedrooms)
theta <- normalEquation(bedrooms,y)
plotdata <- cbind(data[,2],y)
plot(plotdata,col="red",pch=4,lwd=2,xlab="bedrooms",ylab="price",xlim=c(0,10),ylim=c(0,1000000))
curve(theta[1]+theta[2]*x+theta[3]*sqrt(x)+theta[4]*(x^2)+theta[5]*(x^3)+theta[6]*(x^4),col="blue",add=TRUE)
Theta values: 6338.473, 64925.975, 33509.299, 101086.504, -51273.058, 6677.330
Be careful not to overfit your data!