# Global graphical parameters: default line width 2 and plotting symbol 4
# (cross) for all subsequent base-graphics plots in this document.
par(lwd=2,pch=4)
# Read the training data (two exam scores + 0/1 admission label)
# from ex2data1.txt as a numeric matrix.
data <- as.matrix(read.table('ex2data1.txt',
                             sep = ",",
                             encoding = "UTF-8",
                             header = FALSE))
m <- dim(data)[1]  # number of training examples
n <- dim(data)[2]  # number of columns (features + label)
# BUG FIX: the original used data[, 1:n-1]; by operator precedence this is
# (1:n) - 1 = 0:(n-1), which only works by accident because a 0 index is
# silently dropped. Parenthesize as done elsewhere in this document.
X <- data[, 1:(n - 1)]  # feature matrix (exam scores)
y <- data[, n]          # label vector (admission 0/1)
# Show the first 10 training examples (kable comes from knitr, loaded elsewhere)
data10 <- data[1:10, ]
kable(data10,
      col.names = c("x1: 1. exam", "x2: 2. exam", "y: admission"),
      caption = "First 10 training examples")
x1: 1. exam | x2: 2. exam | y: admission |
---|---|---|
34.62366 | 78.02469 | 0 |
30.28671 | 43.89500 | 0 |
35.84741 | 72.90220 | 0 |
60.18260 | 86.30855 | 1 |
79.03274 | 75.34438 | 1 |
45.08328 | 56.31637 | 0 |
61.10666 | 96.51143 | 1 |
75.02475 | 46.55401 | 1 |
76.09879 | 87.42057 | 1 |
84.43282 | 43.53339 | 1 |
# Plot the data: admitted (y=1) as red crosses, rejected (y=0) as blue circles
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
plot(data_pos[,1],data_pos[,2],ylab="2. exam",xlab="1. exam",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],ylab="2. exam",xlab="1. exam",col="blue",pch=1)
# Linear regression on exam 1 only (columns 1 and 3) to demonstrate that a
# linear fit cannot separate the two classes.
alpha=0.0001        # learning rate
iterations = 1000   # number of gradient-descent steps
theta_init = c(0,0) # initial parameter vector (intercept, slope)
testdata <- data[,c(1,3)]
# gradientDescent(), plotCostDev() and plotLinearFit() are linear-regression
# helpers defined elsewhere in this document.
grad_desc <- gradientDescent(testdata,theta_init,alpha,iterations)
plotCostDev(grad_desc)
theta <- grad_desc$theta
plotLinearFit(testdata,theta)
# Reference lines: the h(x)=0.5 threshold and the x-value where the fitted
# line crosses it (~52, read off the fit -- TODO confirm).
abline(h=0.5,col="gray",lwd=1)
abline(v=52,col="gray",lwd=1)
Hier sehen Sie die Ergebnisse der 1. Prüfung und der Zulassungen. Die lineare Regression liefert keine Möglichkeit, die zugelassene von der nicht zugelassenen Gruppe zu trennen.
In einem Klassifikationsproblem mit zwei Klassen kann y nur die Werte 0 und 1 annehmen. Es ist daher unerwünscht, wenn \(h_{\Theta}(x)\) beliebig kleine und beliebig große Werte annehmen kann. Wir suchen daher eine Funktion mit \[0\leq h_{\Theta}(x) \leq 1\]
die wir wie folgt interpretieren können: \(h_{\Theta}(x)\) gibt die Wahrscheinlichkeit an, dass \(x\) die Klasse \(1\) zugewiesen bekommt.
Lösung: sigmoid Funktion \(\frac{1}{1+e^{-z}}\)
# Plot the sigmoid function 1/(1+exp(-z)) with reference lines at its
# asymptotes (0 and 1), its midpoint (0.5) and at z=0.
# sigmoid() is defined elsewhere in this document.
curve(sigmoid,xlim=c(-4,4),ylim=c(-0.5,1.5),main="sigmoid function",col="blue",lwd=2)
abline(h=0,col="gray",lwd=1)
abline(h=0.5,col="gray",lwd=1)
abline(h=1,col="gray",lwd=1)
abline(v=0,col="gray",lwd=1)
Unsere neue Hypothesenfunktion ist \[h^{log}_{\Theta}(x)=\frac{1}{1+e^{-h_{\Theta}(x)}}\] Wir verwenden vorübergehend \(h^{log}_{\Theta}(x)\) für die neue und \(h_{\Theta}(x)\) für die alte Hypothesenfunktion (zur Erinnerung: \(h_{\Theta}(\vec{x})=\theta_0+\theta_1x_1+\ldots+\theta_nx_n\)), später werden wir auf die explizite Angabe des Superskripts \(log\) zumeist verzichten.
Zur Erinnerung: Die Kostenfunktion ist eine Funktion der Parameter \(\Theta\). Sie modelliert die Abweichung der Hypothesenfunktion von den beobachteten Daten: hohe Abweichungen führen zu hohen Kosten. Das Ziel ist es, die Parameter der Hypothesenfunktion so zu wählen, dass die Kosten minimiert werden.
Bisher haben wir die Kosten über die Summe der Quadrate der Abweichungen berechnet:
\[ J(\Theta) = \frac{1}{m} \sum_{i=1}^m \frac{1}{2} (h_{\Theta}(x^{(i)})-y^{(i)})^2\] Diese Funktion ist für \(h^{log}_{\Theta}(x)\) allerdings nicht konvex und somit für das Gradientenverfahren ungeeignet.
Hier eine kleine Spielerei, mit einer Minitrainingsmenge und zwei Merkmalen:
# Let's plot the old cost function for a simple data set
# x y
# 1 1
# 2 0
# 3 1
# 4 0
# 5 0
# Features: x, x^2
y <- c(1, 0, 1, 0, 0)
# IDIOM FIX: seq_along(y) instead of c(1:length(y)) -- same values (1..5),
# but safe for empty vectors and the conventional R form.
x <- seq_along(y)
# Squared-error ("old") cost of the logistic hypothesis
#   h(x) = sigmoid(theta0 + theta1*x + theta2*x^2)
# over the observations (x, y): (1/(2m)) * sum((h(x_i) - y_i)^2).
# Relies on sigmoid() defined elsewhere in this document; sigmoid must be
# vectorized (it is used with curve() above, which requires that).
#
# FIXES vs. original: vectorized instead of an element-wise loop, and no
# longer shadows base::c with a local variable named `c`.
oldcost <- function(x, y, theta0, theta1, theta2) {
  h <- sigmoid(theta0 + theta1 * x + theta2 * x^2)
  sum((h - y)^2) / (2 * length(y))
}
# Initialize J_vals to a matrix of 0's: cost over a (theta0, theta1) grid.
k <- 10
theta0_vals <- seq(-k, k, length.out = 100)
theta1_vals <- seq(-k, k, length.out = 100)
theta2_vals <- seq(-k, k, length.out = 100)
J_vals <- matrix(0, nrow = length(theta0_vals), ncol = length(theta1_vals))
theta2 <- -1  # fix theta2 so the cost can be plotted over (theta0, theta1)
# Fill out J_vals: cost at every grid point.
# IDIOM FIX: seq_along() instead of 1:length(), and <- instead of =.
for (i in seq_along(theta0_vals)) {
  for (j in seq_along(theta1_vals)) {
    J_vals[i, j] <- oldcost(x, y, theta0_vals[i], theta1_vals[j], theta2)
  }
}
Wir betrachten folgende Klassifikationsdaten:
und die Hypothesenfunktion \[h^{log}_{\Theta}(x)=\frac{1}{1+e^{-(\theta_0+\theta_1x+\theta_2x^2)}}\]
Wir haben also drei Parameter \(\theta_0, \theta_1, \theta_2\). Um die Kostenfunktion plotten zu können, fixieren wir \(\theta_2=-1\).
# Surface plot of the cost over (theta0, theta1) with theta2 fixed at -1.
# open3d()/persp3d() come from the rgl package (loaded elsewhere); the
# "## wgl / ## 1" lines are the echoed device id from open3d().
open3d()
## wgl
## 1
color = rev(rainbow(100, start = 0/8, end = 6/8))
persp3d(theta0_vals, theta1_vals, J_vals,col = color,alpha=0.9,
fog=TRUE,
xlab='t0', ylab='t1',zlab="cost",main="Contour plot of old cost function")
# NOTE(review): persp3d draws a surface, yet main says "Contour plot" --
# confirm the intended title.
You must enable Javascript to view this page properly.
Wir sehen, dass die Kostenfunktion nicht konvex ist.
Hier noch einmal ein anderer Schnitt durch die Kostenfunktion (für \(\theta_1=0\)):
# Another slice through the cost surface: fix theta1 = 0 and vary
# (theta0, theta2). Reuses the J_vals matrix allocated above.
theta1 <- 0
# Fill out J_vals.
# FIX: the inner loop indexes theta2_vals, so iterate over theta2_vals
# (the original looped over theta1_vals; same length, but misleading).
for (i in seq_along(theta0_vals)) {
  for (j in seq_along(theta2_vals)) {
    J_vals[i, j] <- oldcost(x, y, theta0_vals[i], theta1, theta2_vals[j])
  }
}
# Surface plot of the theta1 = 0 slice over (theta0, theta2).
# open3d()/persp3d() come from the rgl package (loaded elsewhere).
open3d()
## wgl
## 3
color <- rev(rainbow(100, start = 0/8, end = 6/8))
# BUG FIX: this slice varies theta2 (theta1 is fixed at 0), so the second
# coordinate axis is theta2, not theta1. theta1_vals and theta2_vals hold
# identical values, so the rendered surface is unchanged -- only the
# coordinates/labels are now truthful.
persp3d(theta0_vals, theta2_vals, J_vals, col = color, alpha = 0.9,
        fog = TRUE,
        xlab = 't0', ylab = 't2', zlab = "cost",
        main = "Surface plot of old cost function (theta1 = 0)")
You must enable Javascript to view this page properly.
Auch dieser Schnitt zeigt, dass die alte Kostenfunktion im Falle der logistischen Hypothesenfunktion nicht konvex ist.
Wir müssen also eine neue Kostenfunktion konstruieren, die die folgenden Bedingungen erfüllt:
Die folgende neue Kostenfunktion erfüllt diese Anforderungen: \[ J(\Theta) = \frac{1}{m} \sum_{i=1}^m \text{cost}(h_{\Theta}(x^{(i)}),y^{(i)})\] mit
\[\begin{eqnarray} \text{cost}(h_{\Theta}(x),y) &=& \begin{cases} -\log(h_{\Theta}(x)) & \text{ if } y=1\\ -\log(1-h_{\Theta}(x)) & \text{ if } y=0 \end{cases}\\ &=& -y\log(h_{\Theta}(x)) - (1-y)\log(1-h_{\Theta}(x)) \end{eqnarray}\]# plot cost function
# Plot the two branches of the per-example cost term for h in (0,1):
# -log(h) (red, the y=1 case) and -log(1-h) (blue, the y=0 case).
curve(-log(x),xlim=c(0,1),ylim=c(-0.5,5),col="red",main="Plot of function 'cost' (red: y=1, blue: y=0)",xlab="h_Theta(x)",ylab="cost")
curve(-log(1-x),col="blue",add=TRUE)
cost = 0.0010005
cost = 6.9077553
cost = 0.6931472
cost = 6.9077553
cost = 0.0010005
cost = 0.6931472

Fitting parameters \(\Theta\) means minimizing \(J(\Theta)\).
Aim: \(min_{\Theta} J(\Theta)\)
Gradient Descent:
repeat: \(\theta_j := \theta_j -\alpha \frac{\delta}{\delta\theta_j} J(\Theta)\)
\(\frac{\delta}{\delta\theta_j} J(\Theta) = \frac{1}{m}\sum_{i=1}^m (h_{\Theta}(x^{(i)})-y^{(i)})x_j^{(i)}\)
Thus, repeatedly: \(\theta_j := \theta_j -\alpha \frac{1}{m}\sum_{i=1}^m (h_{\Theta}(x^{(i)})-y^{(i)})x_j^{(i)}\)
Let’s apply this to our example data:
# Logistic gradient descent on the raw (unnormalized) features.
alpha=0.002          # learning rate
iterations=500       # number of gradient-descent steps
theta_init=c(0,0,0)  # initial parameters (intercept + 2 features)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data     # NOT actually normalized here -- the raw data
# gradientDescentLog() is defined elsewhere in this document.
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
Look at the plot of the cost development function. We could try to decrease the learning rate further or to switch to normalized features. Let’s first decrease the learning rate:
# Same run with a smaller learning rate.
alpha=0.001          # reduced learning rate
iterations=500
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data     # still the raw, unnormalized data
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
Remember: We should also look at the last 10 cost values:
# Inspect the last 10 cost values to judge convergence.
cost_vec <- grad_desc$cost_vec
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 500 iterations")
last costs |
---|
0.6274363 |
0.6274315 |
0.6274266 |
0.6274218 |
0.6274169 |
0.6274121 |
0.6274073 |
0.6274024 |
0.6273976 |
0.6273928 |
The cost is still decreasing; we should increase the number of iterations:
# Same learning rate, but many more iterations.
alpha=0.001
iterations=30000
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
dataNorm <- data  # still the raw, unnormalized data
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
cost_vec <- grad_desc$cost_vec
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 30000 iterations")
last costs |
---|
0.5154282 |
0.5154253 |
0.5154224 |
0.5154194 |
0.5154165 |
0.5154136 |
0.5154106 |
0.5154077 |
0.5154048 |
0.5154018 |
# Final cost and parameter vector of this run (reported in the text below).
cost <- last(cost_vec,1)
theta <- grad_desc$theta
The values are still decreasing and our learning rate is already pretty small (alpha =
0.001). The current theta
is -1.8475648, 0.0234501, 0.0148901 and the cost is 0.5154018. Here is a plot of the decision boundary:
theta <- grad_desc$theta
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
# Decision boundary: the set of points where the linear score is zero,
#   theta[1] + theta[2]*x1 + theta[3]*x2 = 0
# solved for x2:
#   x2 = (theta[1] + theta[2]*x1)/(-theta[3])
# i.e. intercept = -theta[1]/theta[3], slope = -theta[2]/theta[3].
plot(data_pos[,1],data_pos[,2],ylab="2. exam",xlab="1. exam",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],ylab="2. exam",xlab="1. exam",col="blue",pch=1)
abline(theta[1]/(-theta[3]),theta[2]/(-theta[3]),col="black")
Let’s try normalized features:
# Logistic gradient descent on NORMALIZED features: a much larger learning
# rate now converges within 500 iterations.
alpha=1.8
iterations=500
theta_init=c(0,0,0)
n <- dim(data)[2]
y <- data[,n]
X <- data[,1:(n-1)]
# featureNormalization() (defined elsewhere) returns the normalized values
# (nv) plus the per-feature location (mu) and scale (sigma) used.
fN <- featureNormalization(X)
dataNorm <- cbind(fN$nv,y)
grad_desc <- gradientDescentLog(dataNorm,theta_init,alpha,iterations)
plotCostDev(grad_desc)
cost_vec <- grad_desc$cost_vec
cost <- last(cost_vec,1)
theta <- grad_desc$theta
kable(last(cost_vec,10),col.names=c("last costs"),caption="after 500 iterations")
last costs |
---|
0.2034981 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
0.2034980 |
The current theta
for the normalized data is 1.7152641, 4.0060604, 3.7372825 and the cost is 0.203498.
theta <- grad_desc$theta
mu <- fN$mu       # per-feature location used by the normalization
sigma <- fN$sigma # per-feature scale used by the normalization
data_pos <- data[which(data[,3]==1),]
data_neg <- data[which(data[,3]==0),]
# Decision boundary mapped back to the ORIGINAL feature space. Solve
#   theta[1] + theta[2]*(x1-mu[1])/sigma[1] + theta[3]*(x2-mu[2])/sigma[2] = 0
# for x2, step by step:
# theta[1] + theta[2]*(x1-mu[1])/sigma[1] = - theta[3]*(x2-mu[2])/sigma[2]
# -sigma[2]/theta[3] *(theta[1] + theta[2]*(x1-mu[1])/sigma[1]) = x2-mu[2]
# mu[2] -sigma[2]/theta[3] *(theta[1] + theta[2]/sigma[1]*(x1-mu[1])) = x2
# mu[2] - sigma[2]/(theta[3])*theta[1] + sigma[2]/(theta[3])*theta[2]/sigma[1]*mu[1]- sigma[2]/(theta[3])*theta[2]/sigma[1]*x1 = x2
intercept <- mu[2] - sigma[2]*theta[1]/theta[3] + sigma[2]*theta[2]*mu[1]/(theta[3]*sigma[1])
slope <- -sigma[2]*theta[2]/(theta[3]*sigma[1])
plot(data_pos[,1],data_pos[,2],ylab="2. exam",xlab="1. exam",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],ylab="2. exam",xlab="1. exam",col="blue",pch=1)
abline(intercept,slope,col="black")
# Training-set accuracy: predicted class = round(h), compared against y.
# ones() (defined elsewhere) presumably prepends the intercept column of 1s
# to the normalized features -- TODO confirm against its definition.
p <- hlog(theta,ones(fN$nv))
acc <- mean(round(p) == y) * 100
Accuracy of our model with respect to our training data: 89 %
It turns out that feature normalization can be very crucial for logistic gradient descent. Otherwise, we are very much dependent on guessing a good initial theta value.
We are now using advanced algorithms for our regression problem. They are part of the library stats
.
# Optimize the logistic cost with a quasi-Newton method (BFGS) via optim().
# NOTE(review): stats is attached by default in R; this library() call is a
# no-op kept for explicitness.
library(stats)
theta_init <- c(0,0,0)
m <- dim(data)[1]
n <- dim(data)[2]
# ones() (defined elsewhere) presumably prepends the intercept column -- TODO confirm.
X <- ones(data[,1:(n-1)])
y <- data[,n]
# Sanity-check the gradient closure at theta = 0 (gradient() is a
# higher-order function defined elsewhere: gradient(X,y) returns a
# function of theta).
gradient(X,y)(theta_init)
## [,1]
## -0.10000
## V1 -12.00922
## V2 -11.26284
# Sanity-check the cost closure at theta = 0; -log(0.5) = 0.6931472.
costFunction(X,y)(theta_init)
## [,1]
## [1,] 0.6931472
# Optimized regression: minimize the cost with BFGS, supplying the
# analytic gradient.
bfgs <- optim(par=theta_init,fn=costFunction(X,y),gr=gradient(X,y),method="BFGS",control = list(maxit = 500))
theta <- bfgs$par
# Plot the decision boundary theta[1] + theta[2]*x1 + theta[3]*x2 = 0
# (intercept = -theta[1]/theta[3], slope = -theta[2]/theta[3]).
plot(data_pos[,1],data_pos[,2],ylab="2. exam",xlab="1. exam",col="red",pch=4,xlim=c(30,100),ylim=c(30,100))
points(data_neg[,1],data_neg[,2],ylab="2. exam",xlab="1. exam",col="blue",pch=1)
abline(theta[1]/(-theta[3]),theta[2]/(-theta[3]),col="black")
# Training-set accuracy of the BFGS solution.
p <- hlog(theta,X)
p <- round(p)
acc <- mean(p == y) * 100
Accuracy of our model with respect to our training data: 89 %
Optimal theta value: -25.0894721, 0.2056569, 0.2008902
final cost value: 0.2034985
Task: Predict probability for a student with score 45 on exam 1 and score 85 on exam 2 to be admitted.
# Predict the admission probability for exam scores (45, 85); the leading 1
# is the intercept feature.
# BUG FIX: hlog takes (theta, X) -- see the earlier calls hlog(theta, X) and
# hlog(theta, ones(...)). The original swapped the arguments, which only
# yielded the right number because the inner product of two plain vectors
# is symmetric in R.
pval <- hlog(theta, c(1, 45, 85))
The chance that the student is admitted is 77.5694776 %