# graphical parameter
par(lwd=2,pch=4)
# read data
data <-as.matrix(read.table('ex2data1.txt',
sep=",",
encoding="UTF-8",
header=FALSE
))
m <- dim(data)[1]
n <- dim(data)[2]
X <- data[,1:(n-1)]
y <- data[,n]
# print some data
data10<-data[1:10,]
kable(data10,col.names=c("x1: exam 1","x2: exam 2","y: admission"),caption="First 10 training examples")
x1: exam 1 | x2: exam 2 | y: admission |
---|---|---|
34.62366 | 78.02469 | 0 |
30.28671 | 43.89500 | 0 |
35.84741 | 72.90220 | 0 |
60.18260 | 86.30855 | 1 |
79.03274 | 75.34438 | 1 |
45.08328 | 56.31637 | 0 |
61.10666 | 96.51143 | 1 |
75.02475 | 46.55401 | 1 |
76.09879 | 87.42057 | 1 |
84.43282 | 43.53339 | 1 |
# plot data
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
plot(data_pos[,1],data_pos[,2],ylab="exam 2",xlab="exam 1",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],col="blue",pch=1)
alpha=0.0001
iterations = 1000
theta_init = c(0,0)
testdata <- data[,c(1,3)]
grad_desc <- gradientDescent(testdata,theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotLinearFit(testdata,theta)
abline(h=0.5,col="gray",lwd=1)
abline(v=52,col="gray",lwd=1)
Here you see the results of the first exam together with the admission decisions. Linear regression gives us no way to separate the admitted group from the non-admitted group.
In a classification problem with two classes, y can only take the values 0 and 1. It is therefore undesirable that $h_\Theta(x)$ can take arbitrarily small and arbitrarily large values. We are thus looking for a function with $0 \le h_\Theta(x) \le 1$, which we can interpret as follows: $h_\Theta(x)$ gives the probability that x is assigned to class 1.
Solution: the sigmoid function $\frac{1}{1+e^{-z}}$.
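The plots and functions below assume a sigmoid helper. If it is not defined elsewhere in the document, a minimal definition matching the formula above is:
# sigmoid function: maps any real number to the interval (0, 1)
sigmoid <- function(z){
  1/(1+exp(-z))
}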
# plot sigmoid
curve(sigmoid,xlim=c(-4,4),ylim=c(-0.5,1.5),main="sigmoid function",col="blue",lwd=2)
abline(h=0,col="gray",lwd=1)
abline(h=0.5,col="gray",lwd=1)
abline(h=1,col="gray",lwd=1)
abline(v=0,col="gray",lwd=1)
Our new hypothesis function is $h^{\log}_\Theta(x) = \frac{1}{1+e^{-h_\Theta(x)}}$. For the moment we write $h^{\log}_\Theta(x)$ for the new and $h_\Theta(x)$ for the old hypothesis function (recall that $h_\Theta(\vec{x}) = \theta_0 + \theta_1 x_1 + \ldots + \theta_n x_n$); later we will mostly drop the explicit superscript $\log$.
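In R this hypothesis function is a one-liner. The hlog helper used in later chunks presumably works along these lines; a minimal sketch, assuming X already contains the leading column of ones for the intercept:
# minimal sketch of the logistic hypothesis function h^log_Theta(x)
# assumption: X already contains a leading column of ones (intercept term)
hlogSketch <- function(theta, X){
  sigmoid(X %*% theta)
}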
As a reminder: the cost function is a function of the parameters $\Theta$. It models the deviation of the hypothesis function from the observed data: large deviations lead to high cost. The goal is to choose the parameters of the hypothesis function such that the cost is minimized.
So far we have computed the cost as the sum of the squared deviations:
$$J(\Theta) = \frac{1}{m} \sum_{i=1}^{m} \frac{1}{2}\left(h_\Theta(x^{(i)}) - y^{(i)}\right)^2$$
For $h^{\log}_\Theta(x)$, however, this function is not convex and therefore unsuitable for gradient descent.
Here is a small experiment with a mini training set and two features:
# Let's plot the old cost function for a simple data set
# x y
# 1 1
# 2 0
# 3 1
# 4 0
# 5 0
# Features: x, x^2
y <- c(1,0,1,0,0)
x <- c(1:length(y))
oldcost <- function(x,y,theta0,theta1,theta2){
c <- rep(0,length(y))
for (i in 1:length(y)){
c[i] <- (sigmoid(theta0+theta1*x[i]+theta2*x[i]^2)-y[i])^2
}
return(1/(2*length(y))*sum(c))
}
# initialize J_vals to a matrix of 0's
k <- 10
theta0_vals=seq(-k,k,length.out=100)
theta1_vals=seq(-k,k,length.out=100)
theta2_vals=seq(-k,k,length.out=100)
J_vals = matrix(0,nrow=length(theta0_vals), ncol=length(theta1_vals))
theta2 <- -1
# Fill out J_vals
for (i in 1:length(theta0_vals)){
for (j in 1:length(theta1_vals)){
J_vals[i,j] <- oldcost(x,y,theta0_vals[i], theta1_vals[j],theta2)
}
}
We consider the classification data defined above and the hypothesis function $h^{\log}_\Theta(x) = \frac{1}{1+e^{-(\theta_0 + \theta_1 x + \theta_2 x^2)}}$.
So we have three parameters $\theta_0, \theta_1, \theta_2$. In order to plot the cost function, we fix $\theta_2 = -1$.
# Surface plot
open3d()
color = rev(rainbow(100, start = 0/8, end = 6/8))
persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9,
fog=TRUE,
xlab='t0', ylab='t1',zlab="cost",main="Surface plot of the old cost function")
We see that the cost function is not convex.
Here is another slice through the cost function (this time for $\theta_1 = 0$):
theta1 <- 0
# Fill out J_vals
for (i in 1:length(theta0_vals)){
for (j in 1:length(theta1_vals)){
J_vals[i,j] <- oldcost(x,y,theta0_vals[i], theta1,theta2_vals[j])
}
}
# Surface plot
open3d()
color = rev(rainbow(100, start = 0/8, end = 6/8))
persp3d(theta0_vals, theta2_vals, J_vals,col = color,alpha=0.9,
fog=TRUE,
xlab='t0', ylab='t2',zlab="cost",main="Surface plot of the old cost function")
This slice also shows that the old cost function is not convex when combined with the logistic hypothesis function.
We therefore have to construct a new cost function: it should be convex, and it should still assign high cost to hypotheses that deviate strongly from the observed data.
The following new cost function fulfills these requirements:
$$J(\Theta) = \frac{1}{m}\sum_{i=1}^{m} \mathrm{cost}\left(h_\Theta(x^{(i)}), y^{(i)}\right)$$
with
$$\mathrm{cost}(h_\Theta(x), y) = \begin{cases} -\log(h_\Theta(x)) & \text{if } y = 1 \\ -\log(1 - h_\Theta(x)) & \text{if } y = 0 \end{cases} = -y \log(h_\Theta(x)) - (1-y)\log(1 - h_\Theta(x))$$
# plot cost function
curve(-log(x),xlim=c(0,1),ylim=c(-0.5,5),col="red",main="Plot of function 'cost' (red: y=1, blue: y=0)",xlab="h_Theta(x)",ylab="cost")
curve(-log(1-x),col="blue",add=TRUE)
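The new cost function is easy to implement in vectorized form. The costFunction helper used further below with optim presumably follows the same formula; here is a minimal sketch, assuming X already contains the intercept column and y is a 0/1 vector:
# minimal sketch of the logistic cost J(Theta), vectorized
# assumption: X contains a leading column of ones, y is a 0/1 vector
logisticCostSketch <- function(theta, X, y){
  m <- length(y)
  h <- sigmoid(X %*% theta)
  (1/m) * sum(-y*log(h) - (1-y)*log(1-h))
}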
Some example values of the cost function, presumably for $h_\Theta(x) \in \{0.999, 0.001, 0.5\}$, first with $y = 1$ and then with $y = 0$:
cost = 0.0010005
cost = 6.9077553
cost = 0.6931472
cost = 6.9077553
cost = 0.0010005
cost = 0.6931472
Fitting the parameters $\Theta$ means minimizing $J(\Theta)$.
Aim: $\min_\Theta J(\Theta)$
Gradient Descent:
repeat: $\theta_j := \theta_j - \alpha \frac{\partial}{\partial \theta_j} J(\Theta)$
$$\frac{\partial}{\partial \theta_j} J(\Theta) = \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)}) - y^{(i)}\right) x_j^{(i)}$$
Thus, repeatedly: $\theta_j := \theta_j - \alpha \frac{1}{m}\sum_{i=1}^{m}\left(h_\Theta(x^{(i)}) - y^{(i)}\right) x_j^{(i)}$
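The gradientDescentLog helper applied in the next chunks is assumed to implement exactly this update rule and to return the fields $theta and $cost_vec used below. A minimal sketch under these assumptions:
# minimal sketch of gradient descent for the logistic cost function
# assumption: 'data' has the features in the first columns and y (0/1) in the last column
gradientDescentLogSketch <- function(data, theta, alpha, iterations){
  m <- nrow(data)
  n <- ncol(data)
  X <- cbind(1, data[, 1:(n-1)])   # prepend intercept column
  y <- data[, n]
  cost_vec <- numeric(iterations)
  for (k in 1:iterations){
    h <- sigmoid(X %*% theta)                                 # predictions for current theta
    cost_vec[k] <- (1/m) * sum(-y*log(h) - (1-y)*log(1-h))    # cost for current theta
    theta <- theta - alpha * (1/m) * t(X) %*% (h - y)         # simultaneous update of all theta_j
  }
  list(theta = as.vector(theta), cost_vec = cost_vec)
}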
Let’s apply this to our example data:
alpha=0.002
iterations=500
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
Look at the plot of the cost development. We could try to decrease the learning rate or switch to normalized features. Let's first decrease the learning rate:
alpha=0.001
iterations=500
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
Remember: We should also look at the last 10 cost values:
cost_vec <- grad_desc$cost_vec
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 500 iterations")
last costs |
---|
0.6274363 |
0.6274315 |
0.6274266 |
0.6274218 |
0.6274169 |
0.6274121 |
0.6274073 |
0.6274024 |
0.6273976 |
0.6273928 |
The cost is still decreasing, so we should increase the number of iterations:
alpha=0.001
iterations=30000
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
cost_vec <- grad_desc$cost_vec
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 30000 iterations")
last costs |
---|
0.5154282 |
0.5154253 |
0.5154224 |
0.5154194 |
0.5154165 |
0.5154136 |
0.5154106 |
0.5154077 |
0.5154048 |
0.5154018 |
cost <- last(cost_vec,1)
theta <- grad_desc$theta
The values are still decreasing and our learning rate is already quite small (alpha = 0.001). The current theta is (-1.8475648, 0.0234501, 0.0148901) and the cost is 0.5154018. Here is a plot of the decision boundary:
theta <- grad_desc$theta
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
# decision boundary:
# theta[1] + theta[2]*x1 + theta[3]* x2 =0
# (theta[1] + theta[2]*x1)/(- theta[3]) =x2
plot(data_pos[,1],data_pos[,2],ylab="exam 2",xlab="exam 1",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],col="blue",pch=1)
abline(theta[1]/(-theta[3]),theta[2]/(-theta[3]),col="black")
Let’s try normalized features:
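The featureNormalization helper used below is assumed to return a list with the normalized features (nv) and the per-feature mean (mu) and standard deviation (sigma), since those fields are accessed later; a minimal sketch:
# minimal sketch of z-score feature normalization
# assumption: it returns nv (normalized features), mu (column means) and sigma (column sds),
# matching the fields fN$nv, fN$mu and fN$sigma used below
featureNormalizationSketch <- function(X){
  mu <- colMeans(X)
  sigma <- apply(X, 2, sd)
  nv <- scale(X, center = mu, scale = sigma)   # (x - mu) / sigma, column-wise
  list(nv = nv, mu = mu, sigma = sigma)
}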
alpha=1.8
iterations=500
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
fN <- featureNormalization(X)
dataNorm <- cbind(fN$nv,y)
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
cost_vec <- grad_desc$cost_vec
cost <- last(cost_vec,1)
theta <- grad_desc$theta
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 500 iterations")
last costs |
---|
0.2034981 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
The current theta for the normalized data is (1.7152641, 4.0060604, 3.7372825) and the cost is 0.203498.
theta <- grad_desc$theta
mu <- fN$mu
sigma <- fN$sigma
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
# decision boundary:
# theta[1] + theta[2]*(x1-mu[1])/sigma[1] + theta[3]*(x2-mu[2])/sigma[2] =0
# theta[1] + theta[2]*(x1-mu[1])/sigma[1] = - theta[3]*(x2-mu[2])/sigma[2]
# -sigma[2]/theta[3] *(theta[1] + theta[2]*(x1-mu[1])/sigma[1]) = x2-mu[2]
# mu[2] -sigma[2]/theta[3] *(theta[1] + theta[2]/sigma[1]*(x1-mu[1])) = x2
# mu[2] - sigma[2]/(theta[3])*theta[1] + sigma[2]/(theta[3])*theta[2]/sigma[1]*mu[1]- sigma[2]/(theta[3])*theta[2]/sigma[1]*x1 = x2
intercept <- mu[2] - sigma[2]*theta[1]/theta[3] + sigma[2]*theta[2]*mu[1]/(theta[3]*sigma[1])
slope <- -sigma[2]*theta[2]/(theta[3]*sigma[1])
plot(data_pos[,1],data_pos[,2],ylab="exam 2",xlab="exam 1",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],col="blue",pch=1)
abline(intercept,slope,col="black")
p <- hlog(theta,ones(fN$nv))
acc <- mean(round(p) == y) * 100
Accuracy of our model with respect to our training data: 89 %
It turns out that feature normalization can be crucial for gradient descent on the logistic cost function. Without it we depend very much on guessing good initial theta values and a suitable learning rate.
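To see why, compare the scale of the raw exam scores with the normalized features (the gradient test further below shows the same effect: the gradient component for $\theta_0$ is much smaller than the ones for $\theta_1$ and $\theta_2$). A quick check, reusing the objects defined above:
# compare raw and normalized feature scales
colMeans(X)          # raw exam scores: means on the order of the 30-100 score range
apply(X, 2, sd)      # raw standard deviations, far from 1
colMeans(fN$nv)      # normalized features: means approximately 0
apply(fN$nv, 2, sd)  # normalized standard deviations: 1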
We now use a more advanced optimization algorithm (BFGS via optim) for our regression problem. It is part of the stats package.
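The chunk below relies on the helpers ones, costFunction and gradient. Judging from how they are called, ones presumably prepends an intercept column, while costFunction(X,y) and gradient(X,y) return functions of theta, which is the closure form expected by optim(par, fn, gr, ...). Minimal sketches under those assumptions:
# minimal sketches of the helpers assumed by the following chunk
# ones: presumably prepends a column of ones (intercept) to the feature matrix
onesSketch <- function(X) cbind(1, as.matrix(X))
# costFunction(X, y): returns the logistic cost as a function of theta
costFunctionSketch <- function(X, y){
  m <- length(y)
  function(theta){
    h <- sigmoid(X %*% theta)
    (1/m) * sum(-y*log(h) - (1-y)*log(1-h))
  }
}
# gradient(X, y): returns the gradient of the logistic cost as a function of theta
gradientSketch <- function(X, y){
  m <- length(y)
  function(theta){
    h <- sigmoid(X %*% theta)
    (1/m) * (t(X) %*% (h - y))
  }
}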
library(stats)
theta_init <- c(0,0,0)
m <- dim(data)[1]
n <- dim(data)[2]
X <- ones(data[,1:(n-1)])
y <- data[,n]
# test gradient function
gradient(X,y)(theta_init)
## [,1]
## -0.10000
## V1 -12.00922
## V2 -11.26284
# test costFunction
costFunction(X,y)(theta_init)
## [,1]
## [1,] 0.6931472
# optimized regression
bfgs <- optim(par=theta_init,fn=costFunction(X,y),gr=gradient(X,y),method="BFGS",control = list(maxit = 500))
theta <- bfgs$par
# plot decision boundary
plot(data_pos[,1],data_pos[,2],ylab="exam 2",xlab="exam 1",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],col="blue",pch=1)
abline(theta[1]/(-theta[3]),theta[2]/(-theta[3]),col="black")
p <- hlog(theta,X)
p <- round(p)
acc <- mean(p == y) * 100
Accuracy of our model with respect to our training data: 89 %
Optimal theta values: -25.0894721, 0.2056569, 0.2008902
Final cost value: 0.2034985
Task: Predict probability for a student with score 45 on exam 1 and score 85 on exam 2 to be admitted.
pval <- hlog(theta,c(1,45,85))
The chance that this student is admitted is 77.5694776 %.
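For reference, this is equivalent to plugging the scores directly into the hypothesis function with the theta found above:
# equivalent manual computation of the predicted probability
sigmoid(theta[1] + theta[2]*45 + theta[3]*85)   # roughly 0.776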