Let’s build a simple logistic regression classifier from scratch.
First, we load our standard libraries.
# Tibbles!
library(tidyverse)
# Used for mvrnorm() when generating clusters (note: MASS masks dplyr::select)
library(MASS)
For testing purposes, let’s include our data generator from Session 5:
generate_lda_clusters = function(classes, dim, num_samples) {
  # generate a non-degenerate covariance matrix
  repeat {
    A = matrix(runif(dim^2), dim, dim)
    cov_mat = A %*% t(A)
    if (det(cov_mat) != 0)
      break
  }
  # generate class means
  means = matrix(1.5 * runif(dim * classes), dim, classes)
  # initialize data frame
  xvals = matrix(numeric(dim * classes * num_samples), num_samples * classes, dim)
  y = matrix(numeric(classes * num_samples), classes * num_samples, 1)
  t = as_tibble(cbind(xvals, y))   # as.tibble() is deprecated; use as_tibble()
  colnames(t) = c(paste0('x', 1:dim), 'y')
  # fill data frame: num_samples draws per class from N(mean_c, cov_mat)
  for (ix in 0:(num_samples * classes - 1)) {
    c = (ix %/% num_samples) + 1
    t[(ix + 1), ] = c(mvrnorm(1, means[, c], cov_mat), c)
  }
  # return the data along with the true parameters
  output = list(data = t, means = t(means), covariance = cov_mat)
  return(output)
}
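As a quick sanity check of the generator (a toy call of my own, not part of the pipeline below), a small example should give 300 rows of 2-d points plus a label column, along with the true means used to generate them:
out = generate_lda_clusters(classes = 3, dim = 2, num_samples = 100)
dim(out$data)   # 300 rows (3 classes x 100 samples), 3 columns (x1, x2, y)
out$means       # one row of true means per class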
Now I’m going to add some convenience functions: bias() appends an intercept column, OHE() builds a one-hot encoder, Clipper() keeps values in a numerically safe range, and Scaler() standardizes features to zero mean and unit variance.
# Append a column of ones to a data matrix, acting as the intercept/bias term
bias = function(x) {
  num_times = dim(x)[1]
  as.matrix(cbind(x, rep(1, num_times)))
}
# One-hot encoder factory: given a label vector, returns a function that
# maps label vectors to one-hot matrices
OHE = function(x) {
  # get list of unique entries
  values = unique(sort(x))
  num_values = length(values)
  encoder = function(z) {
    # find index
    ix = match(z, values)
    # initialize zero vector
    v = numeric(num_values)
    v[ix] = 1
    return(v)
  }
  # vectorized encoder
  function(y) {
    matrix(unlist(map(y, encoder)), length(y), num_values, byrow = TRUE)
  }
}
# Clip every entry of a matrix into [exp(-30), exp(30)] so the softmax
# normalization never divides by zero or overflows
Clipper = function(x) {
  as.matrix(apply(x, c(1, 2), function(z) max(min(z, exp(30)), exp(-30))))
}
# Standardizer factory: learns per-feature means and variances from x and
# returns a function that standardizes new data using those statistics
Scaler = function(x) {
  num_samples = dim(x)[1]
  num_features = dim(x)[2]
  # calculate per-feature means and variances
  means = apply(x, 2, mean)
  vars = apply(x, 2, var)
  function(z) {
    if (dim(z)[2] != num_features) stop("Attempt to transform data with the incorrect number of features")
    num_to_transform = dim(z)[1]
    w = matrix(numeric(num_features * num_to_transform), num_to_transform, num_features)
    for (col in seq(num_features)) {
      for (row in seq(num_to_transform)) {
        w[[row, col]] = (z[[row, col]] - means[[col]]) / sqrt(vars[[col]])
      }
    }
    return(w)
  }
}
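Here is a quick illustration of how these fit together (toy numbers of my own, not part of the pipeline):
m = matrix(c(1, 2, 3, 4, 10, 20, 30, 40), 4, 2)
sc_demo = Scaler(m)          # closure holding m's column means and variances
head(bias(sc_demo(m)))       # standardized columns plus a trailing column of ones
enc_demo = OHE(c(1, 2, 3, 2))
enc_demo(c(3, 1))            # 2 x 3 matrix: rows (0,0,1) and (1,0,0)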
I’m going to try to implement this as an ultralightweight “object”, by which I just mean an environment: the constructor function sets up the state and methods, then returns its own environment. This loses some of the functionality of R6 classes, but is syntactically simpler.
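Before the classifier itself, here is a minimal sketch of the pattern (a hypothetical Counter, not part of the classifier): state and closures are defined inside the constructor, and environment() makes them reachable with $.
Counter = function() {
  n = 0
  increment = function() {
    n <<- n + 1    # <<- updates n in the enclosing environment
    n
  }
  environment()
}
cnt = Counter()
cnt$increment()   # 1
cnt$increment()   # 2
cnt$n             # 2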
LogisticClassifier = function(tr_data, tr_labels, prior_std_dev = 0.1) {
  num_samples = dim(tr_data)[1]
  num_features = dim(tr_data)[2]
  num_classes = dim(tr_labels)[2]
  training_log = tibble(num_samples = c(0), accuracy = c(0), loss = c(0))
  # initialize the weight matrix with small random values
  beta = matrix(rnorm(num_features * num_classes) * prior_std_dev, num_features, num_classes)
  # shuffle rows
  shuffle = sample.int(num_samples)
  tr_data = tr_data[shuffle, ]
  tr_labels = tr_labels[shuffle, ]
  train_counter = 0
  # softmax probabilities for a batch of rows
  probs = function(x) {
    batch_size = dim(x)[1]
    prbs = Clipper(exp(x %*% beta))
    for (row in seq(batch_size))
      prbs[row, ] = prbs[row, ] / sum(prbs[row, ])
    return(as.matrix(prbs))
  }
  # one step of minibatch gradient ascent on the log-likelihood
  train_on_batch = function(batch_size = 128, learning_rate = 0.1) {
    to_select = ((train_counter + seq(0, batch_size - 1)) %% num_samples) + 1
    data = as.matrix(tr_data[to_select, ])
    labels = as.matrix(tr_labels[to_select, ])
    multiplicand = probs(data)
    beta <<- beta + (t(data) %*% (labels - multiplicand)) * learning_rate / batch_size
    train_counter <<- train_counter + batch_size
    # log batch accuracy and cross-entropy loss
    acc_counter = 0
    error = 0
    for (i in seq(batch_size)) {
      if (which.max(multiplicand[i, ]) == which.max(labels[i, ])) {
        acc_counter = acc_counter + 1
      }
      error = error - log(sum(multiplicand[i, ] * labels[i, ]))
    }
    acc_counter = acc_counter / batch_size
    error = error / batch_size
    training_log <<- add_row(training_log, num_samples = train_counter, accuracy = acc_counter, loss = error)
  }
  # return the environment so fields and methods are accessible with $
  environment()
}
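The update in train_on_batch is ordinary minibatch gradient ascent: for softmax regression with one-hot labels Y and predicted probabilities P, the gradient of the average log-likelihood with respect to beta is t(X) %*% (Y - P) / batch_size, which is exactly the term added (scaled by the learning rate). As an optional sanity check (toy data of my own, not part of the experiments below), a well-separated two-class problem should reach accuracy near 1 within a few hundred batches:
set.seed(1)
toy_x = rbind(matrix(rnorm(100, mean = -2), 50, 2),
              matrix(rnorm(100, mean =  2), 50, 2))
toy_labels = rep(1:2, each = 50)
toy_y = OHE(toy_labels)(toy_labels)
toy_clf = LogisticClassifier(bias(toy_x), toy_y)
for (i in 1:200) {
  toy_clf$train_on_batch(batch_size = 20, learning_rate = 1.0)
}
tail(toy_clf$training_log, 1)   # accuracy should be close to 1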
Test it out. Generate our data:
df = generate_lda_clusters(10,50,1000)
xvals = df$data[,1:50]
yvals = df$data[,51]
shuffle = sample.int(nrow(xvals))
See how we do: scale the features, add the bias column, one-hot encode the labels, and split into 9,000 training and 1,000 test points.
sc = Scaler(xvals)
tc = bias(sc(xvals))
ohe = OHE(yvals$y)
ty = ohe(yvals$y)
x_train = tc[shuffle[1:9000],]
y_train = ty[shuffle[1:9000],]
x_test = tc[shuffle[9001:10000],]
y_test = ty[shuffle[9001:10000],]
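# (Assumed sanity check, not in the original:) confirm the split shapes --
# 50 scaled features plus a bias column, and one one-hot column per class
dim(x_train)   # expect 9000 x 51
dim(y_train)   # expect 9000 x 10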
logc = LogisticClassifier(x_train, y_train)
for(i in 1:4000) {
  logc$train_on_batch(batch_size = 64, learning_rate = 1.0)
}
Now evaluate on the held-out test data.
preds = logc$probs(x_test)
acc = 0
for(i in seq(nrow(preds)))
  if(which.max(preds[i,]) == which.max(y_test[i,]))
    acc = acc + 1/nrow(preds)
print(acc)
## [1] 0.968
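As an aside, the same accuracy can be computed without the explicit loop (equivalent, just more idiomatic R):
mean(apply(preds, 1, which.max) == apply(y_test, 1, which.max))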
library(ggplot2)
ggplot(data = logc$training_log) +
  geom_smooth(aes(x = num_samples, y = accuracy), color = "Green", alpha = 0.5)
ggplot(data = logc$training_log) +
  geom_smooth(aes(x = num_samples, y = loss), color = "Blue", alpha = 0.5)
Okay, on to the MNIST digits:
library(keras)
mnist = dataset_mnist()
to_sample = 1:5000
x_train = mnist$train$x
x_train = array_reshape(x_train, c(nrow(x_train), 784))
# fit PCA on the first 5000 training images, keeping 50 components,
# then project the full training set onto those components
pca = prcomp(x_train[to_sample,], rank. = 50)
xvals = predict(pca, x_train)
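# (Assumed sanity check, not in the original:) confirm the projection shapes
dim(pca$rotation)   # expect 784 x 50: one loading vector per retained component
dim(xvals)          # expect 60000 x 50: every training image projected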
sc = Scaler(xvals)
tc = bias(sc(xvals))
ohe = OHE(as.matrix(mnist$train$y))
ty = ohe(mnist$train$y)
mnistc = LogisticClassifier(tc, ty)
for(i in 1:500) {
  mnistc$train_on_batch(batch_size = 64, learning_rate = 1.0)
}
Again, evaluate on the held-out test data.
to_test = 1:10000
x_test = mnist$test$x
x_test = array_reshape(x_test, c(nrow(x_test),784))
ttt = predict(pca, x_test[to_test,])
preds = mnistc$probs(bias(sc(ttt)))
acc = 0
y_test = ohe(mnist$test$y)
for(i in seq(nrow(preds)))
  if(which.max(preds[i,]) == which.max(y_test[i,]))
    acc = acc + 1
print(acc/nrow(preds))
## [1] 0.8995
ggplot(data = mnistc$training_log) +
  geom_smooth(aes(x = num_samples, y = accuracy), color = "Green", alpha = 0.5)
ggplot(data = mnistc$training_log) +
  geom_smooth(aes(x = num_samples, y = loss), color = "Blue", alpha = 0.5)