data1: two parallel lines

###################################################
# Graphical parameters: thicker lines, "x" plotting symbol for all plots below
par(lwd = 2, pch = 4)

### Play with data and linear regression
n <- 120               # number of samples
m <- 10                # max value of x (min is zero)
theta_init <- c(0, 0)  # initial (intercept, slope) for gradient descent
iterations <- 2000     # number of gradient-descent steps
alpha <- 0.01          # learning rate -- try different values

# data1: one third of the y values are 4, the rest are 1 (two parallel lines)
X <- seq(from = 0, to = m, length.out = n)
y <- rep(1, times = n)
y[seq(from = 1, to = n, by = 3)] <- 4  # every 3rd point gets y = 4
data1 <- cbind(X, y)
plot(data1, col = "red", main = "data1: 1/3 of all values = 4, rest=1")
data <- data1
# gradientDescent / plotLinearFit / plotCostSurface are defined elsewhere in the project
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

data2: quadratic function

# data2: non-linear (quadratic) distribution, y = (x - m/2)^2
X <- seq(from = 0, to = m, length.out = n)
y <- (X - m / 2)^2
data2 <- cbind(X, y)
# Fixed mis-encoded quote characters in the title (was `...´)
plot(data2, col = "red", main = "data2: quadratic function $y=(x-5)^2$")
data <- data2
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

data3: linear function \(f(x)=2+4x\)

# data3: perfectly linear distribution y = 4x + 2, so residuals = 0
X <- seq(from = 0, to = m, length.out = n)
y <- 4 * X + 2
data3 <- cbind(X, y)
# Fixed mis-encoded, reversed quote characters in the title (was ´...`)
plot(data3, col = "red", main = "data3: linear function $y=4x+2$")
data <- data3
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

data4: linear function \(f(x)=2+4x\) with normally distributed residuals

# data4: linear function y = 2 + 4x with normally distributed residuals
X <- seq(from = 0, to = m, length.out = n)
y <- 2 + 4 * X
res_vec <- rnorm(n, mean = 0, sd = 8)  # N(mu = 0, sigma = 8) noise
y <- y + res_vec
data4 <- cbind(X, y)
# Fixed broken math markup in the title: closing "$" was missing
plot(data4, col = "red",
     main = "data4: linear function with normally distributed residuals, $\\mu=0,\\sigma=8$")
data <- data4
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

data5: linear function \(f(x)=2+4x\) with uniformly distributed residuals

# data5: linear function y = 4x + 2 with uniformly distributed residuals.
# NOTE(review): the runif() call below reads min(res_vec)/max(res_vec) from
# the res_vec created in the data4 section, i.e. the uniform noise is drawn
# over the same range as the earlier normal noise. This block therefore only
# works if the data4 block was run first -- confirm this ordering is intended.
# linear with uniformly distributed residuals
X <- seq(from=0,to=m,length.out=n)
y <- 4*X +2
res_vec <- runif(n, min = min(res_vec), max = max(res_vec))
y <- y + res_vec
data5 <- cbind(X,y)
plot(data5,col="red",main="data5: linear function with uniformly distributed residuals")  
data <- data5
grad_desc <- gradientDescent(data,theta_init,alpha,iterations)
plotLinearFit(data,grad_desc$theta)
plotCostSurface(data,grad_desc)

You must enable Javascript to view this page properly.

data6: uniformly distributed data

# data6: y is pure uniform noise on [0, 10] and carries no relation to x
X <- seq(0, m, length.out = n)
y <- runif(n, min = 0, max = 10)
data6 <- cbind(X, y)
data <- data6
plot(data6, col = "red", main = "data6: uniformly distributed")
# Run gradient descent on the noise and visualize fit and cost surface
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

Why do we always end up with \(\theta_0\approx 0\) for data6?

Let’s look at a bigger context of our data and at some possible and impossible linear fits:

# Example thetas: a mix of plausible and clearly bad linear fits for data6
theta_exp <- matrix(c(5, 0), ncol = 2)         # horizontal line near the data's mean
theta_exp <- rbind(theta_exp, c(-5000, 1000))  # wildly wrong fit
theta_exp <- rbind(theta_exp, c(-2, 0))        # horizontal line below all data
theta_exp <- rbind(theta_exp, c(0, 1))         # rising diagonal through the origin
theta_exp <- rbind(theta_exp, c(10, -1))       # falling diagonal

# Cost of each candidate theta (computeCost is defined elsewhere in the project).
# vapply with a preallocated numeric result replaces growing `cost` with c() in a loop.
cost <- vapply(
  seq_len(nrow(theta_exp)),
  function(i) computeCost(data, theta_exp[i, ]),
  numeric(1)
)
theta_exp <- cbind(theta_exp, cost)
colnames(theta_exp) <- c("t0", "t1", "cost")

# Fixed typo in the title: "dat6" -> "data6"
plot(data, col = "red", xlim = c(-5, 15), ylim = c(-5, 15),
     main = "data6: uniformly distributed",
     sub = "with some (im)possible linear fits")
for (i in seq_len(nrow(theta_exp))) {
  curve(theta_exp[i, 1] + theta_exp[i, 2] * x, col = "blue", add = TRUE)
}
theta_exp <- round(theta_exp, digits = 2)
kable(theta_exp, row.names = FALSE, caption = "cost values for example thetas")
cost values for example thetas
t0 t1 cost
5 0 3.68
-5000 1000 4235633.34
-2 0 26.68
0 1 6.84
10 -1 8.99

You must enable Javascript to view this page properly.

data7: ‘vertical data’

# data7: 'vertical' data -- x confined to [1, 2] while y is spread widely
X <- seq(from = 1, to = 2, length.out = n)
y <- rnorm(n, mean = 0, sd = 40)
data7 <- cbind(X, y)
# Fixed copy-paste bug: the title said "data6" for the data7 plot
plot(data7, col = "red", main = "data7: 'vertical data'", xlim = c(-50, 50))
data <- data7
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)

You must enable Javascript to view this page properly.

Exercise:

  1. Look at the plot for data6 and explain why \(\theta_0\approx 0\) gives the best linear fit.
  2. Compare the final costs of data2, data4, data5 and data6. Why are data2 and data6 performing comparably well?
  3. What is happening in data7? Why do we not get a vertical linear fit?