###################################################
# Graphical parameters: thicker lines (lwd) and cross symbol (pch = 4)
par(lwd = 2, pch = 4)

### Play with data and linear regression
n <- 120                # number of samples
m <- 10                 # max value of x (min is zero)
theta_init <- c(0, 0)   # starting point (intercept, slope) for gradient descent
iterations <- 2000      # number of gradient descent steps
alpha <- 0.01           # learning rate -- try different values

# data1: evenly spaced x grid; every third y value is 4, the rest are 1
X <- seq(from = 0, to = m, length.out = n)
y <- rep(1, times = n)
y[seq(from = 1, to = n, by = 3)] <- 4
data1 <- cbind(X, y)
plot(data1, col = "red", main = "data1: 1/3 of all values = 4, rest=1")

data <- data1
# gradientDescent/plotLinearFit/plotCostSurface are defined elsewhere in the project
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data2: non-linear (quadratic) distribution, y = (x - m/2)^2
X <- seq(from = 0, to = m, length.out = n)
y <- (X - m/2)^2
data2 <- cbind(X, y)
# title fixed: original had a mismatched backtick/acute-accent pair
plot(data2, col = "red", main = "data2: quadratic function y = (x-5)^2")

data <- data2
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data3: exactly linear data, y = 4x + 2, so all residuals are zero
X <- seq(from = 0, to = m, length.out = n)
y <- 4*X + 2
data3 <- cbind(X, y)
# title fixed: original had a mismatched acute-accent/backtick pair
plot(data3, col = "red", main = "data3: linear function y = 4x+2")

data <- data3
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data4: linear trend y = 2 + 4x with normally distributed residuals
X <- seq(from = 0, to = m, length.out = n)
y <- 2 + 4*X
res_vec <- rnorm(n, mean = 0, sd = 8)   # N(mu = 0, sd = 8) noise; reused by the data5 section
y <- y + res_vec
data4 <- cbind(X, y)
# title fixed: original math markup was unterminated (missing closing $)
plot(data4, col = "red",
     main = "data4: linear function with normally distributed residuals, mu=0, sigma=8")

data <- data4
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data5: same linear trend as data4, but with uniformly distributed residuals.
# The uniform noise deliberately spans the same range [min, max] as the normal
# residuals drawn for data4 (res_vec from the previous section is read here),
# so the two noise models are directly comparable.
X <- seq(from = 0, to = m, length.out = n)
res_vec <- runif(n, min = min(res_vec), max = max(res_vec))
y <- (4*X + 2) + res_vec
data5 <- cbind(X, y)
plot(data5, col = "red",
     main = "data5: linear function with uniformly distributed residuals")

data <- data5
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
# data6: y is pure uniform noise on [0, 10] -- completely unrelated to X
X <- seq(from = 0, to = m, length.out = n)
y <- runif(n, min = 0, max = 10)
data6 <- cbind(X, y)
plot(data6, col = "red", main = "data6: uniformly distributed")

data <- data6
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
Why do we always end up with \(\theta_1\approx 0\) for data6?
Let's look at a bigger context of our data and at some possible and impossible linear fits:
# Evaluate the cost of a few hand-picked (theta0, theta1) candidates for data6.
theta_exp <- rbind(
  c(5, 0),         # flat line through the data mean -- the best possible fit
  c(-5000, 1000),  # absurdly steep line
  c(-2, 0),        # flat line below the data
  c(0, 1),         # unit slope through the origin
  c(10, -1)        # negative slope
)
# computeCost() is defined elsewhere in the project.
# vapply replaces the original pattern of growing `cost` with c() in a loop.
cost <- vapply(
  seq_len(nrow(theta_exp)),
  function(i) computeCost(data, theta_exp[i, ]),
  numeric(1)
)
theta_exp <- cbind(theta_exp, cost)
colnames(theta_exp) <- c("t0", "t1", "cost")

# title fixed: "dat6" -> "data6"
plot(data, col = "red", xlim = c(-5, 15), ylim = c(-5, 15),
     main = "data6: uniformly distributed",
     sub = "with some (im)possible linear fits")
# Overlay each candidate line y = t0 + t1 * x on the scatter plot
for (i in seq_len(nrow(theta_exp))) {
  curve(theta_exp[i, 1] + theta_exp[i, 2]*x, col = "blue", add = TRUE)
}
theta_exp <- round(theta_exp, digits = 2)
kable(theta_exp, row.names = FALSE, caption = "cost values for example thetas")
t0 | t1 | cost |
---|---|---|
5 | 0 | 3.68 |
-5000 | 1000 | 4235633.34 |
-2 | 0 | 26.68 |
0 | 1 | 6.84 |
10 | -1 | 8.99 |
# (interactive plot output shown in the rendered HTML version)
# data7: "vertical" data -- x confined to the narrow band [1, 2] while
# y is wide normal noise, so the point cloud looks like a vertical stripe
X <- seq(from = 1, to = 2, length.out = n)
y <- rnorm(n, mean = 0, sd = 40)
data7 <- cbind(X, y)
# title fixed: this is data7, not data6
plot(data7, col = "red", main = "data7: 'vertical data'", xlim = c(-50, 50))

data <- data7
grad_desc <- gradientDescent(data, theta_init, alpha, iterations)
plotLinearFit(data, grad_desc$theta)
plotCostSurface(data, grad_desc)
# (interactive plot output shown in the rendered HTML version)
Exercises:
1. Look again at data6 and explain why \(\theta_1\approx 0\) gives the best linear fit.
2. Compare the cost values for data2, data4, data5 and data6. Why are data2 and data6 performing comparably well?
3. What happens with data7? Why do we not get a vertical linear fit?