Write a function in R that takes in a vector of discrete values and produces the corresponding one-hot encodings.
library(tidyverse)
#Function takes in data and returns a function which performs one hot encoding with respect to that data
OHE = function(x) {
  #Get list of unique entries (sort for convenience)
  values = unique(sort(x))
  num_values = length(values)
  #Encoder for a single entry
  encoder = function(z){
    #find index of z among the known values
    #(an entry that was not present in x gives NA here, and the assignment below fails)
    ix = match(z, values)
    #initialize zero vector and flag the matching position
    v = numeric(num_values)
    v[ix] = 1
    return(v)
  }
  #vectorized encoder: returns a one-column list matrix of indicator vectors
  function(y) {
    matrix(map(y, encoder))
  }
}
Test this on some data:
Y = c(1,2,3)
encoder = OHE(Y)
A=encoder(Y)
A[,]
## [[1]]
## [1] 1 0 0
##
## [[2]]
## [1] 0 1 0
##
## [[3]]
## [1] 0 0 1
Z = c('c','e','e','a','b')
encoder2 = OHE(Z)
B = encoder2(Z)
B[,]
## [[1]]
## [1] 0 0 1 0
##
## [[2]]
## [1] 0 0 0 1
##
## [[3]]
## [1] 0 0 0 1
##
## [[4]]
## [1] 1 0 0 0
##
## [[5]]
## [1] 0 1 0 0
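The encoder stores its results in a one-column list matrix, so each element is an indicator vector rather than a row of a numeric matrix. If a plain numeric matrix (one row per input value) is more convenient, the list can simply be row-bound; this is a small convenience on top of the function above, not part of it:
#row-bind the indicator vectors into a (number of inputs) x (number of values) matrix
B_matrix = do.call(rbind, B[,])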
Write an LDA classifier from scratch.
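As a quick reminder of what we are implementing: LDA models each class k as a Gaussian with its own mean mu_k but a single covariance matrix Sigma shared by all classes, so the class-conditional density is

p(x | k) = exp( -(x - mu_k)' Sigma^{-1} (x - mu_k) / 2 ) / sqrt( det(2*pi*Sigma) )

With equal class priors (the data we generate below is balanced), the predicted class is simply the k with the largest density; the exponent and denominator variables in the code compute exactly these two pieces.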
LDAClassifier = function(x,y) {
  num_features = dim(x)[2]
  num_samples = dim(x)[1]
  #build df (note: naming it t shadows base::t, but R still finds the transpose function when t(v) is called below)
  t = as_tibble(cbind(x, y))
  colnames(t) = c(paste0('x', 1:num_features), 'y')
  #calculate group means
  means = t %>%
    group_by(y) %>%
    summarize_all(.funs = c(mean))
  num_classes = dim(means)[1]
  #calculate the pooled within-class covariance matrix
  cov_mat = matrix(numeric(num_features*num_features), num_features, num_features)
  for(ix in 1:num_samples) {
    #get the sample as a row vector
    v = as.matrix(t[ix, 1:num_features])
    #center it with its class mean (this indexing assumes the class labels are 1..K)
    v = v - as.matrix(means[t[[ix, (num_features+1)]], 2:(num_features+1)])
    cov_mat = cov_mat + t(v) %*% v
  }
  #the unbiased pooled estimate divides by n - K (number of classes)
  cov_mat = cov_mat / (num_samples - num_classes)
  prec_mat = solve(cov_mat)
  denominator = (det(2*pi*cov_mat))^(0.5)
  #class-conditional densities for a single observation
  probs = function(x) {
    ve = matrix(x, ncol = num_features, nrow = 1)
    prbs = numeric(num_classes)
    for(c in 1:num_classes) {
      mu = as.matrix(means[c, 2:(num_features+1)], ncol = num_features, nrow = 1)
      vec = ve - mu
      exponent = - vec %*% prec_mat %*% t(vec) / 2
      prbs[c] = exp(exponent)/denominator
    }
    return(prbs)
  }
  #predict the class with the largest density (equal priors assumed)
  prediction = function(x) {
    which.max(probs(x))
  }
  return(list(probs = probs, prediction = prediction, means = means[,2:(num_features+1)], covariance = cov_mat))
}
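One thing to note: probs returns the class-conditional densities rather than posterior probabilities, and class priors are never used (harmless for the balanced data we generate below). Under equal priors, dividing the densities by their sum gives the posteriors; a tiny illustrative helper (the name posterior and the fitted-model argument m are just placeholders, not part of the classifier):
#hypothetical helper: normalize the class densities into posteriors, assuming equal priors
posterior = function(m, x) {
  p = m$probs(x)
  p / sum(p)
}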
Let’s test this on some data.
First build a data generator:
#need multivariate Gaussian
library(MASS)
generate_lda_clusters = function(classes, dim, num_samples) {
  # generate a non-degenerate covariance matrix
  # (A %*% t(A) is symmetric positive semi-definite; redraw until it is invertible)
  repeat{
    A = matrix(runif(dim^2), dim, dim)
    cov_mat = A %*% t(A)
    if(det(cov_mat) != 0)
      break
  }
  # generate means
  means = matrix(1.5*runif(dim*classes), dim, classes)
  #initialize data frame
  xvals = matrix(numeric(dim*classes*num_samples), num_samples*classes, dim)
  y = matrix(numeric(classes*num_samples), classes*num_samples, 1)
  # name the columns before converting, so as_tibble does not have to repair names
  xy = cbind(xvals, y)
  colnames(xy) = c(paste0('x', 1:dim), 'y')
  t = as_tibble(xy)
  # Fill data frame: num_samples draws per class, all sharing cov_mat
  # (the loop variable c shadows base::c, but the call c(...) still finds the function)
  for(ix in 0:(num_samples*classes-1)) {
    c = (ix %/% num_samples) + 1
    t[(ix+1),] = c(mvrnorm(1, means[,c], cov_mat), c)
  }
  # Generate output
  output = list(data = t, means = t(means), covariance = cov_mat)
  return(output)
}
Now let’s check on our data:
df = generate_lda_clusters(3,4,100)
xvals = df$data[,1:4]
yvals = df$data[,5]
ldac = LDAClassifier(xvals,yvals)
Compare means:
df$means
## [,1] [,2] [,3] [,4]
## [1,] 0.5101013 0.978513 0.9386442 0.4911195
## [2,] 0.0278312 1.206325 0.9372495 0.4147086
## [3,] 0.4433162 1.289720 0.4502362 0.6973826
ldac$means
## # A tibble: 3 x 4
## x1 x2 x3 x4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.38972562 0.8821986 0.8613887 0.3436112
## 2 0.01451382 1.1908754 0.9342209 0.4288744
## 3 0.45178670 1.2878750 0.4848533 0.7750743
mean((ldac$means-df$means)^2)
## [1] 0.00495244
That looks okay, although ramping up the number of samples would help confirm it.
How about the covariance matrices:
df$covariance
## [,1] [,2] [,3] [,4]
## [1,] 1.8753805 1.5801485 1.0815625 0.8991559
## [2,] 1.5801485 1.3360393 0.9036398 0.7217824
## [3,] 1.0815625 0.9036398 0.6561120 0.5328054
## [4,] 0.8991559 0.7217824 0.5328054 0.9273198
ldac$covariance
## x1 x2 x3 x4
## x1 1.8890856 1.5905602 1.1071007 0.8717394
## x2 1.5905602 1.3439494 0.9241713 0.6989443
## x3 1.1071007 0.9241713 0.6823120 0.5211170
## x4 0.8717394 0.6989443 0.5211170 0.9083977
mean((df$covariance-ldac$covariance)^2)
## [1] 0.0004049324
Okay. Let’s test the predictions.
preds = apply(xvals, 1, ldac$prediction)
mean(preds == yvals)
## [1] 1
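As a sanity check, MASS (already loaded above for mvrnorm) ships a reference lda implementation; a quick illustrative comparison on the same data, not part of the from-scratch classifier:
#fit the reference implementation and compare its training predictions to the labels
fit = lda(as.matrix(xvals), grouping = yvals$y)
ref_preds = predict(fit, as.matrix(xvals))$class
mean(as.integer(as.character(ref_preds)) == yvals$y)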
Let’s train the model on a subset of the data now.
df = generate_lda_clusters(4,5,400)
xvals = df$data[,1:5]
yvals = df$data[,6]
#shuffle our data
num_samples = dim(xvals)[1]
perm = sample(1:num_samples)
xvals = xvals[perm,]
yvals = yvals[perm,]
#split it into training and test sets
split = (num_samples * 9) %/% 10
tr_xvals = xvals[1:split,]
test_xvals = xvals[(split+1):num_samples, ]
tr_yvals = yvals[1:split,]
test_yvals = yvals[(split+1):num_samples,]
#Fit our classifier
ldac = LDAClassifier(tr_xvals, tr_yvals)
preds = apply(test_xvals, 1, ldac$prediction)
mean(preds == test_yvals)
## [1] 0.98125
Okay, onto QDA:
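The only change from LDA is that the shared-covariance assumption is dropped: each class k now gets its own covariance matrix Sigma_k, so the class-conditional density becomes

p(x | k) = exp( -(x - mu_k)' Sigma_k^{-1} (x - mu_k) / 2 ) / sqrt( det(2*pi*Sigma_k) )

and we need to estimate one covariance (and precision) matrix per class.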
QDAClassifier = function(x,y) {
  num_features = dim(x)[2]
  num_samples = dim(x)[1]
  #build df (as in LDAClassifier, t shadows base::t but t(v) still works)
  t = as_tibble(cbind(x, y))
  colnames(t) = c(paste0('x', 1:num_features), 'y')
  #calculate group means
  means = t %>%
    group_by(y) %>%
    summarize_all(.funs = c(mean))
  num_classes = dim(means)[1]
  #calculate one covariance (and precision) matrix per class
  cov_mats = array(numeric(num_classes*num_features^2), dim = c(num_classes, num_features, num_features))
  prec_mats = array(dim = c(num_classes, num_features, num_features))
  denominators = numeric(num_classes)
  for(c in 1:num_classes) {
    #get the samples in the class
    class_samples = t %>%
      filter(y == c) %>%
      dplyr::select(-y)
    num_in_class = dim(class_samples)[1]
    for(ix in 1:num_in_class) {
      #get the sample as a row vector
      v = as.matrix(class_samples[ix,])
      #center it with the class mean
      v = v - as.matrix(means[c, 2:(num_features+1)])
      cov_mats[c,,] = cov_mats[c,,] + t(v) %*% v
    }
    cov_mats[c,,] = cov_mats[c,,] / (num_in_class - 1)
    prec_mats[c,,] = solve(cov_mats[c,,])
    denominators[c] = (det(2*pi*cov_mats[c,,]))^(0.5)
  }
  #class-conditional densities for a single observation
  probs = function(x) {
    ve = matrix(x, ncol = num_features, nrow = 1)
    prbs = numeric(num_classes)
    for(c in 1:num_classes) {
      mu = as.matrix(means[c, 2:(num_features+1)], ncol = num_features, nrow = 1)
      vec = ve - mu
      exponent = - vec %*% prec_mats[c,,] %*% t(vec) / 2
      prbs[c] = exp(exponent)/denominators[c]
    }
    return(prbs)
  }
  #predict the class with the largest density (equal priors assumed)
  prediction = function(x) {
    which.max(probs(x))
  }
  return(list(probs = probs, prediction = prediction, means = means[,2:(num_features+1)], covariance = cov_mats))
}
Generate synthetic data:
generate_qda_clusters = function(classes, dim, num_samples) {
  # generate one non-degenerate covariance matrix per class
  cov_mats = array(dim = c(classes, dim, dim))
  for(c in 1:classes) {
    repeat{
      A = matrix(runif(dim^2), dim, dim)
      cov_mat = A %*% t(A)
      if(det(cov_mat) != 0) {
        cov_mats[c,,] = cov_mat
        break
      }
    }
  }
  # generate means
  means = matrix(1.5*runif(dim*classes), dim, classes)
  #initialize data frame
  xvals = matrix(numeric(dim*classes*num_samples), num_samples*classes, dim)
  y = matrix(numeric(classes*num_samples), classes*num_samples, 1)
  # name the columns before converting, so as_tibble does not have to repair names
  xy = cbind(xvals, y)
  colnames(xy) = c(paste0('x', 1:dim), 'y')
  t = as_tibble(xy)
  # Fill data frame: num_samples draws per class, each class with its own covariance
  for(ix in 0:(num_samples*classes-1)) {
    c = (ix %/% num_samples) + 1
    t[(ix+1),] = c(mvrnorm(1, means[,c], cov_mats[c,,]), c)
  }
  # Generate output
  output = list(data = t, means = t(means), covariance = cov_mats)
  return(output)
}
Let’s see if that worked:
df = generate_qda_clusters(3,4,100)
xvals = df$data[,1:4]
yvals = df$data[,5]
qdac = QDAClassifier(xvals,yvals)
Compare means:
qdac$means
## # A tibble: 3 x 4
## x1 x2 x3 x4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.08260264 0.4301034 0.1457182 0.8632887
## 2 1.62192712 1.0937153 0.6649225 0.2333357
## 3 0.99022041 0.4829438 1.2868795 0.5151341
df$means
## [,1] [,2] [,3] [,4]
## [1,] 0.2111792 0.5984161 0.3451419 0.8686134
## [2,] 1.4986965 1.0632256 0.5463299 0.1748167
## [3,] 0.7605555 0.1218101 0.8682180 0.2659771
mean((qdac$means - df$means)^2)
## [1] 0.04489864
Okay, now the covariances:
qdac$covariance
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] 0.7020239 1.1557111 1.116209 0.2763056
## [2,] 1.1591497 0.8529673 1.013734 0.4396454
## [3,] 1.2527756 1.5844386 1.282907 0.9630064
##
## , , 2
##
## [,1] [,2] [,3] [,4]
## [1,] 1.1557111 2.4098661 2.279968 0.6962249
## [2,] 0.8529673 0.9934195 1.062523 0.7691141
## [3,] 1.5844386 2.3884430 2.228916 1.4798311
##
## , , 3
##
## [,1] [,2] [,3] [,4]
## [1,] 1.116209 2.279968 2.420025 0.7316594
## [2,] 1.013734 1.062523 1.365315 1.0462900
## [3,] 1.282907 2.228916 2.928235 2.1355284
##
## , , 4
##
## [,1] [,2] [,3] [,4]
## [1,] 0.2763056 0.6962249 0.7316594 0.604256
## [2,] 0.4396454 0.7691141 1.0462900 1.091922
## [3,] 0.9630064 1.4798311 2.1355284 1.962096
df$covariance
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] 0.6185236 0.9856408 0.9222466 0.2425210
## [2,] 1.3226684 1.0053535 1.2490927 0.6222098
## [3,] 1.2128638 1.4290042 1.0692735 0.8469089
##
## , , 2
##
## [,1] [,2] [,3] [,4]
## [1,] 0.9856408 2.027609 1.894024 0.6192662
## [2,] 1.0053535 1.101553 1.235157 0.8816886
## [3,] 1.4290042 2.092696 1.891997 1.2578949
##
## , , 3
##
## [,1] [,2] [,3] [,4]
## [1,] 0.9222466 1.894024 2.014706 0.6885483
## [2,] 1.2490927 1.235157 1.638747 1.2208347
## [3,] 1.0692735 1.891997 2.520242 1.8370461
##
## , , 4
##
## [,1] [,2] [,3] [,4]
## [1,] 0.2425210 0.6192662 0.6885483 0.5889582
## [2,] 0.6222098 0.8816886 1.2208347 1.1779506
## [3,] 0.8469089 1.2578949 1.8370461 1.7613251
mean((qdac$covariance-df$covariance)^2)
## [1] 0.04631098
Great, those estimates look okay. How are the predictions?
df = generate_qda_clusters(2,5,400)
xvals = df$data[,1:5]
yvals = df$data[,6]
#shuffle our data
num_samples = dim(xvals)[1]
perm = sample(1:num_samples)
xvals = xvals[perm,]
yvals = yvals[perm,]
#split it into training and test sets
split = (num_samples * 9) %/% 10
tr_xvals = xvals[1:split,]
test_xvals = xvals[(split+1):num_samples, ]
tr_yvals = yvals[1:split,]
test_yvals = yvals[(split+1):num_samples,]
#Fit our classifier
qdac = QDAClassifier(tr_xvals, tr_yvals)
tr_preds = as.matrix(apply(tr_xvals, 1, qdac$prediction))
test_preds = as.matrix(apply(test_xvals,1,qdac$prediction))
# Compare with LDA
ldac = LDAClassifier(tr_xvals, tr_yvals)
tr_preds2 = as.matrix(apply(tr_xvals, 1, ldac$prediction))
test_preds2 = as.matrix(apply(test_xvals,1,ldac$prediction))
Let’s see how we did:
output = tibble(LDA = c(mean(tr_preds2 == tr_yvals), mean(test_preds2 == test_yvals)), QDA = c(mean(tr_preds == tr_yvals), mean(test_preds == test_yvals)))
output
## # A tibble: 2 x 2
## LDA QDA
## <dbl> <dbl>
## 1 0.8166667 0.9819444
## 2 0.8500000 0.9500000
As expected, QDA does noticeably better here: the generator gives each class its own covariance matrix, so LDA’s shared-covariance assumption is violated.
Digits. For this we will first apply a PCA dimension reduction to our data. This both makes the problem more tractable and removes the degenerate pixel dimensions (many MNIST pixels, particularly near the border, are always zero), which would otherwise make the covariance matrices singular.
library(keras)
mnist = dataset_mnist()
to_sample = 1:10000
x_train = mnist$train$x
#flatten the 28x28 images into length-784 vectors
x_train = array_reshape(x_train, c(nrow(x_train),784))
#PCA dimension reduction: keep the first 30 principal components
pca = prcomp(x_train[to_sample,], rank.=30)
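#(aside, not part of the original analysis) rough check of how much of the total
#pixel variance the 30 retained components capture, computed from the scores
sum(apply(pca$x, 2, var)) / sum(apply(x_train[to_sample,], 2, var))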
tt = pca$x
#shift the labels from 0-9 to 1-10, since the classifiers expect class labels 1..K
y_train = mnist$train$y+1
ldac = LDAClassifier(tt, y_train[to_sample])
qdac = QDAClassifier(tt, y_train[to_sample])
tr_preds = apply(tt, 1, ldac$prediction)
tr_preds2 = apply(tt, 1, qdac$prediction)
lda_tr = mean(tr_preds == y_train[to_sample])
qda_tr = mean(tr_preds2 == y_train[to_sample])
Try this on the test data:
to_test = 1:10000
x_test = mnist$test$x
x_test = array_reshape(x_test, c(nrow(x_test),784))
#project the test images onto the principal components learned from the training data
ttt = predict(pca, x_test[to_test,])
y_test = mnist$test$y+1
test_preds = apply(ttt, 1, ldac$prediction)
test_preds2 = apply(ttt, 1, qdac$prediction)
lda_test = mean(test_preds == y_test[to_test])
qda_test = mean(test_preds2 == y_test[to_test])
output = tibble(LDA = c(lda_tr, lda_test), QDA = c(qda_tr, qda_test))
output
## # A tibble: 2 x 2
## LDA QDA
## <dbl> <dbl>
## 1 0.8551 0.9654
## 2 0.8556 0.9566
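For reference, the same PCA features can also be fed to MASS::qda; an illustrative cross-check, again not part of the from-scratch implementation:
#reference QDA fit on the 30 principal components
ref_qda = qda(tt, grouping = y_train[to_sample])
ref_preds = predict(ref_qda, ttt)$class
mean(as.integer(as.character(ref_preds)) == y_test[to_test])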