###################################################
# Graphical parameters: thicker lines (lwd) and cross symbol (pch = 4)
par(lwd = 2, pch = 4)

### Play with data and linear regression
n <- 120                # number of samples
m <- 10                 # max value of x (min is zero)
theta_init <- c(0, 0)   # starting point (intercept, slope) for gradient descent
iterations <- 2000      # number of gradient descent steps
alpha <- 0.01           # learning rate -- try different values

# data1: evenly spaced x grid; every third y value is 4, the rest are 1
X <- seq(from = 0, to = m, length.out = n)
y <- rep(1, times = n)
y[seq(from = 1, to = n, by = 3)] <- 4
data1 <- cbind(X, y)
plot(data1, col = "red", main = "data1: 1/3 of all values = 4, rest=1")

data <- data1
# gradientDescent/plotLinearFit/plotCostSurface are defined elsewhere in the project
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data2: non-linear (quadratic) distribution, y = (x - m/2)^2
X <- seq(from = 0, to = m, length.out = n)
y <- (X - m/2)^2
data2 <- cbind(X, y)
# title fixed: original had a mismatched backtick/acute-accent pair
plot(data2, col = "red", main = "data2: quadratic function y = (x-5)^2")

data <- data2
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data3: exactly linear data, y = 4x + 2, so all residuals are zero
X <- seq(from = 0, to = m, length.out = n)
y <- 4*X + 2
data3 <- cbind(X, y)
# title fixed: original had a mismatched acute-accent/backtick pair
plot(data3, col = "red", main = "data3: linear function y = 4x+2")

data <- data3
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data4: linear trend y = 2 + 4x with normally distributed residuals
X <- seq(from = 0, to = m, length.out = n)
y <- 2 + 4*X
res_vec <- rnorm(n, mean = 0, sd = 8)   # N(mu = 0, sd = 8) noise; reused by the data5 section
y <- y + res_vec
data4 <- cbind(X, y)
# title fixed: original math markup was unterminated (missing closing $)
plot(data4, col = "red",
     main = "data4: linear function with normally distributed residuals, mu=0, sigma=8")

data <- data4
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data5: same linear trend as data4, but with uniformly distributed residuals.
# The uniform noise deliberately spans the same range [min, max] as the normal
# residuals drawn for data4 (res_vec from the previous section is read here),
# so the two noise models are directly comparable.
X <- seq(from = 0, to = m, length.out = n)
res_vec <- runif(n, min = min(res_vec), max = max(res_vec))
y <- (4*X + 2) + res_vec
data5 <- cbind(X, y)
plot(data5, col = "red",
     main = "data5: linear function with uniformly distributed residuals")

data <- data5
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data6: y is pure uniform noise on [0, 10] -- completely unrelated to X
X <- seq(from = 0, to = m, length.out = n)
y <- runif(n, min = 0, max = 10)
data6 <- cbind(X, y)
plot(data6, col = "red", main = "data6: uniformly distributed")

data <- data6
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
Why do we always end up with \(\theta_1\approx 0\) for data6?
Let's look at a bigger context of our data and at some possible and impossible linear fits:
# Evaluate the cost of a few hand-picked (theta0, theta1) candidates for data6.
theta_exp <- rbind(
  c(5, 0),         # flat line through the data mean -- the best possible fit
  c(-5000, 1000),  # absurdly steep line
  c(-2, 0),        # flat line below the data
  c(0, 1),         # unit slope through the origin
  c(10, -1)        # negative slope
)
# computeCost() is defined elsewhere in the project.
# vapply replaces the original pattern of growing `cost` with c() in a loop.
cost <- vapply(
  seq_len(nrow(theta_exp)),
  function(i) computeCost(data, theta_exp[i, ]),
  numeric(1)
)
theta_exp <- cbind(theta_exp, cost)
colnames(theta_exp) <- c("t0", "t1", "cost")

# title fixed: "dat6" -> "data6"
plot(data, col = "red", xlim = c(-5, 15), ylim = c(-5, 15),
     main = "data6: uniformly distributed",
     sub = "with some (im)possible linear fits")
# Overlay each candidate line y = t0 + t1 * x on the scatter plot
for (i in seq_len(nrow(theta_exp))) {
  curve(theta_exp[i, 1] + theta_exp[i, 2]*x, col = "blue", add = TRUE)
}
theta_exp <- round(theta_exp, digits = 2)
kable(theta_exp, row.names = FALSE, caption = "cost values for example thetas")
t0 | t1 | cost |
---|---|---|
5 | 0 | 3.68 |
-5000 | 1000 | 4235633.34 |
-2 | 0 | 26.68 |
0 | 1 | 6.84 |
10 | -1 | 8.99 |
# (interactive plot output shown in the rendered HTML version)
# data7: "vertical" data -- x confined to the narrow band [1, 2] while
# y is wide normal noise, so the point cloud looks like a vertical stripe
X <- seq(from = 1, to = 2, length.out = n)
y <- rnorm(n, mean = 0, sd = 40)
data7 <- cbind(X, y)
# title fixed: this is data7, not data6
plot(data7, col = "red", main = "data7: 'vertical data'", xlim = c(-50, 50))

data <- data7
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
Exercises:
1. Look again at data6 and explain why \(\theta_1\approx 0\) gives the best linear fit.
2. Compare the cost values for data2, data4, data5 and data6. Why are data2 and data6 performing comparably well?
3. What happens with data7? Why do we not get a vertical linear fit?