R Lab

Write a function in R that takes a vector of discrete values and produces the corresponding one-hot encodings.

library(tidyverse)
# Takes in data and returns a function that performs one-hot encoding with respect to the levels seen in that data
OHE = function(x) {
  #Get list of unique entries (sort for convenience)
  values = unique(sort(x))
  num_values = length(values)
  encoder = function(z){
    #find index
    ix = match(z,values)
    #initialize zero vector
    v = numeric(num_values)
    v[ix]=1
    return(v)
  }
  
  #vectorized encoder: returns a one-column list-matrix with one encoding per input element
  function(y) {
    matrix(map(y,encoder))
  }
}

Test this on some data:

Y = c(1,2,3)
encoder = OHE(Y)
A=encoder(Y)
A[,]
## [[1]]
## [1] 1 0 0
## 
## [[2]]
## [1] 0 1 0
## 
## [[3]]
## [1] 0 0 1
Z = c('c','e','e','a','b')
encoder2 = OHE(Z)
B = encoder2(Z)
B[,]
## [[1]]
## [1] 0 0 1 0
## 
## [[2]]
## [1] 0 0 0 1
## 
## [[3]]
## [1] 0 0 0 1
## 
## [[4]]
## [1] 1 0 0 0
## 
## [[5]]
## [1] 0 1 0 0
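
Note that map() returns a list, so the encoder above hands back a one-column list-matrix rather than a numeric matrix. If a plain numeric matrix (one row per input element) is preferred, something like the following sketch would work; the name OHE_matrix is not part of the lab, just an illustration:

# Sketch of a variant that returns a numeric matrix, one row per input element.
# Column names record which level each indicator column corresponds to.
OHE_matrix = function(x) {
  values = sort(unique(x))
  num_values = length(values)
  function(y) {
    ix = match(y, values)   # column index of each input's level
    out = matrix(0, nrow = length(y), ncol = num_values,
                 dimnames = list(NULL, as.character(values)))
    out[cbind(seq_along(y), ix)] = 1   # one indicator per row
    out
  }
}
encoder3 = OHE_matrix(Z)
encoder3(Z)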

Write an LDA classifier from scratch.

LDAClassifier = function(x,y) {
  num_features = dim(x)[2]
  num_samples = dim(x)[1]
  
  #build df
  t = as.tibble(cbind(x,y))
  colnames(t)=c(paste0('x',1:num_features), 'y')
  
  #calculate group means
  means = t %>%
    group_by(y) %>%
    summarize_all(.funs = c(mean))
  num_classes = dim(means)[1]
  
  #calculate covariance matrix
  cov_mat = matrix(numeric(num_features*num_features),num_features,num_features)
  for(ix in 1:num_samples) {
    #get vector
    v = as.matrix(t[ix,1:num_features])
    #center it
    v = v-as.matrix(means[t[[ix,(num_features+1)]],2:(num_features+1)])
    cov_mat = cov_mat + t(v) %*% v
  }
  # pooled within-class covariance (unbiased estimate: divide by N minus the number of classes)
  cov_mat = cov_mat / (num_samples - num_classes)
  prec_mat = solve(cov_mat)
  denominator = (det(2*pi*cov_mat))^(0.5)
  
  # Gaussian density of a single observation under each class
  # (call it row by row, e.g. via apply, to score a matrix of observations)
  probs = function(x) {
    ve = matrix(x, ncol = num_features, nrow = 1)
    prbs = numeric(num_classes)
    for(c in 1:num_classes) {
      mu = as.matrix(means[c, 2:(num_features+1)])
      vec = ve - mu
      exponent = - vec %*% prec_mat %*% t(vec) / 2
      prbs[c] = exp(exponent) / denominator
    }
    return(prbs)
  }
  
  prediction = function(x) {
    # index of the class with the highest density
    which.max(probs(x))
  }
  
  return(list(probs = probs, prediction = prediction, means = means[,2:(num_features+1)], covariance = cov_mat))
}
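
Note that probs() returns the class-conditional Gaussian densities rather than posterior probabilities. With balanced classes (as in the synthetic data generated below) the argmax is the same either way, but posteriors can be recovered by normalizing; a small sketch, assuming equal class priors:

# Sketch (not part of the classifier): convert class-conditional densities
# into posterior probabilities under equal class priors.
posterior = function(x, classifier) {
  d = classifier$probs(x)
  d / sum(d)
}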

Let’s test this on some data.

First build a data generator:

#need multivariate Gaussian
library(MASS)
generate_lda_clusters = function(classes, dim, num_samples) {
  # generate a non-degenerate covariance matrix
  repeat{
    A = matrix(runif(dim^2),dim,dim)  
    cov_mat = A%*%t(A)
    if(det(cov_mat)!=0)
      break
  }
  # generate means
  means = matrix(1.5*runif(dim*classes),dim,classes)
  
  #initialize data frame
  xvals = matrix(numeric(dim*classes*num_samples), num_samples*classes, dim)
  y = matrix(numeric(classes*num_samples),classes*num_samples, 1)
  t = as.tibble(cbind(xvals,y))
  colnames(t)=c(paste0('x',1:dim),'y')
  
  # Fill data frame
  for(ix in 0:(num_samples*classes-1)) {
      c = (ix %/% num_samples)+1
      t[(ix+1),]=c(mvrnorm(1,means[,c],cov_mat),c)
  }
  # Generate output
  output = list(data = t, means = t(means), covariance = cov_mat)
  return(output)
}

Now let’s check on our data:

df = generate_lda_clusters(3,4,100)
xvals = df$data[,1:4]
yvals = df$data[,5]
ldac = LDAClassifier(xvals,yvals)

Compare means:

df$means
##           [,1]     [,2]      [,3]      [,4]
## [1,] 0.5101013 0.978513 0.9386442 0.4911195
## [2,] 0.0278312 1.206325 0.9372495 0.4147086
## [3,] 0.4433162 1.289720 0.4502362 0.6973826
ldac$means
## # A tibble: 3 x 4
##           x1        x2        x3        x4
##        <dbl>     <dbl>     <dbl>     <dbl>
## 1 0.38972562 0.8821986 0.8613887 0.3436112
## 2 0.01451382 1.1908754 0.9342209 0.4288744
## 3 0.45178670 1.2878750 0.4848533 0.7750743
mean((ldac$means-df$means)^2)
## [1] 0.00495244

That looks okay, although increasing the number of samples would help confirm it.

How about the covariance matrices:

df$covariance
##           [,1]      [,2]      [,3]      [,4]
## [1,] 1.8753805 1.5801485 1.0815625 0.8991559
## [2,] 1.5801485 1.3360393 0.9036398 0.7217824
## [3,] 1.0815625 0.9036398 0.6561120 0.5328054
## [4,] 0.8991559 0.7217824 0.5328054 0.9273198
ldac$covariance
##           x1        x2        x3        x4
## x1 1.8890856 1.5905602 1.1071007 0.8717394
## x2 1.5905602 1.3439494 0.9241713 0.6989443
## x3 1.1071007 0.9241713 0.6823120 0.5211170
## x4 0.8717394 0.6989443 0.5211170 0.9083977
mean((df$covariance-ldac$covariance)^2)
## [1] 0.0004049324

Okay. Let’s test the predictions.

preds = apply(xvals, 1, ldac$prediction)
mean(preds == yvals)
## [1] 1

Let’s train the model on a subset of the data now.

df = generate_lda_clusters(4,5,400)
xvals = df$data[,1:5]
yvals = df$data[,6]
#shuffle our data
num_samples = dim(xvals)[1]
perm = sample(1:num_samples)
xvals = xvals[perm,]
yvals = yvals[perm,]

#split it into training and test sets
split = (num_samples * 9) %/% 10
tr_xvals = xvals[1:split,]
test_xvals = xvals[(split+1):num_samples, ]
tr_yvals = yvals[1:split,]
test_yvals = yvals[(split+1):num_samples,]

#Fit our classifier
ldac = LDAClassifier(tr_xvals, tr_yvals)
preds = apply(test_xvals, 1, ldac$prediction)
mean(preds == test_yvals)
## [1] 0.98125
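
As an optional sanity check (not part of the original lab), the same split can be fed to MASS::lda, which should produce a very similar test accuracy:

# Sketch: fit MASS::lda on the same training split and score the test split
fit = MASS::lda(x = as.matrix(tr_xvals), grouping = tr_yvals$y)
mass_preds = predict(fit, as.matrix(test_xvals))$class
mean(mass_preds == test_yvals$y)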

Okay, onto QDA:

QDAClassifier = function(x,y) {
  num_features = dim(x)[2]
  num_samples = dim(x)[1]
  
  #build df
  t = as.tibble(cbind(x,y))
  colnames(t)=c(paste0('x',1:num_features), 'y')
  
  #calculate group means
  means = t %>%
    group_by(y) %>%
    summarize_all(.funs = c(mean))
  num_classes = dim(means)[1]
  
  #calculate covariance matrices
  cov_mats = array(numeric(num_classes*num_features^2), dim = c(num_classes, num_features, num_features))
  prec_mats = array(dim = c(num_classes, num_features, num_features))
  denominators = numeric(num_classes)
  for(c in 1:num_classes) {
    #get the samples in the class
    class_samples = t %>%
      filter(y == c) %>%
      dplyr::select(-y)
    
    num_in_class = dim(class_samples)[1]
    
    for(ix in 1:num_in_class) {
      #get vector
      v = as.matrix(class_samples[ix,])
      #center it
      v = v-as.matrix(means[c,2:(num_features+1)])
      cov_mats[c,,] = cov_mats[c,,] + t(v) %*% v
      
    }
    cov_mats[c,,] = cov_mats[c,,] / (num_in_class-1)
    prec_mats[c,,] = solve(cov_mats[c,,])
    denominators[c] = (det(2*pi*cov_mats[c,,]))^(0.5)
  }
  
  
  # Gaussian density of a single observation under each class,
  # using that class's own covariance matrix
  probs = function(x) {
    ve = matrix(x, ncol = num_features, nrow = 1)
    prbs = numeric(num_classes)
    for(c in 1:num_classes) {
      mu = as.matrix(means[c, 2:(num_features+1)])
      vec = ve - mu
      exponent = - vec %*% prec_mats[c,,] %*% t(vec) / 2
      prbs[c] = exp(exponent) / denominators[c]
    }
    return(prbs)
  }
  
  prediction = function(x) {
    # index of the class with the highest density
    which.max(probs(x))
  }
  
  return(list(probs = probs, prediction = prediction, means = means[,2:(num_features+1)], covariance = cov_mats))
}

Generate synthetic data:

generate_qda_clusters = function(classes, dim, num_samples) {

  # generate non-degenerate covariance matrices
  # initialize matrices
  cov_mats = array(dim = c(classes, dim, dim))
  for(c in 1:classes) {
    repeat{
      A = matrix(runif(dim^2),dim,dim)  
      cov_mat = A%*%t(A)
      if(det(cov_mat)!=0) {
        cov_mats[c,,]=cov_mat
        break
      }
    }
  }
  # generate means
  means = matrix(1.5*runif(dim*classes),dim,classes)
  
  #initialize data frame
  xvals = matrix(numeric(dim*classes*num_samples), num_samples*classes, dim)
  y = matrix(numeric(classes*num_samples),classes*num_samples, 1)
  t = as.tibble(cbind(xvals,y))
  colnames(t)=c(paste0('x',1:dim),'y')
  
  # Fill data frame
  for(ix in 0:(num_samples*classes-1)) {
      c = (ix %/% num_samples)+1
      t[(ix+1),]=c(mvrnorm(1,means[,c],cov_mats[c,,]),c)
  }
  # Generate output
  output = list(data = t, means = t(means), covariance = cov_mats)
  return(output)
}

Let’s see if that worked:

df = generate_qda_clusters(3,4,100)
xvals = df$data[,1:4]
yvals = df$data[,5]
qdac = QDAClassifier(xvals,yvals)

Compare means:

qdac$means
## # A tibble: 3 x 4
##           x1        x2        x3        x4
##        <dbl>     <dbl>     <dbl>     <dbl>
## 1 0.08260264 0.4301034 0.1457182 0.8632887
## 2 1.62192712 1.0937153 0.6649225 0.2333357
## 3 0.99022041 0.4829438 1.2868795 0.5151341
df$means
##           [,1]      [,2]      [,3]      [,4]
## [1,] 0.2111792 0.5984161 0.3451419 0.8686134
## [2,] 1.4986965 1.0632256 0.5463299 0.1748167
## [3,] 0.7605555 0.1218101 0.8682180 0.2659771
mean((qdac$means - df$means)^2)
## [1] 0.04489864

Okay, now the covariance matrices:

qdac$covariance
## , , 1
## 
##           [,1]      [,2]     [,3]      [,4]
## [1,] 0.7020239 1.1557111 1.116209 0.2763056
## [2,] 1.1591497 0.8529673 1.013734 0.4396454
## [3,] 1.2527756 1.5844386 1.282907 0.9630064
## 
## , , 2
## 
##           [,1]      [,2]     [,3]      [,4]
## [1,] 1.1557111 2.4098661 2.279968 0.6962249
## [2,] 0.8529673 0.9934195 1.062523 0.7691141
## [3,] 1.5844386 2.3884430 2.228916 1.4798311
## 
## , , 3
## 
##          [,1]     [,2]     [,3]      [,4]
## [1,] 1.116209 2.279968 2.420025 0.7316594
## [2,] 1.013734 1.062523 1.365315 1.0462900
## [3,] 1.282907 2.228916 2.928235 2.1355284
## 
## , , 4
## 
##           [,1]      [,2]      [,3]     [,4]
## [1,] 0.2763056 0.6962249 0.7316594 0.604256
## [2,] 0.4396454 0.7691141 1.0462900 1.091922
## [3,] 0.9630064 1.4798311 2.1355284 1.962096
df$covariance
## , , 1
## 
##           [,1]      [,2]      [,3]      [,4]
## [1,] 0.6185236 0.9856408 0.9222466 0.2425210
## [2,] 1.3226684 1.0053535 1.2490927 0.6222098
## [3,] 1.2128638 1.4290042 1.0692735 0.8469089
## 
## , , 2
## 
##           [,1]     [,2]     [,3]      [,4]
## [1,] 0.9856408 2.027609 1.894024 0.6192662
## [2,] 1.0053535 1.101553 1.235157 0.8816886
## [3,] 1.4290042 2.092696 1.891997 1.2578949
## 
## , , 3
## 
##           [,1]     [,2]     [,3]      [,4]
## [1,] 0.9222466 1.894024 2.014706 0.6885483
## [2,] 1.2490927 1.235157 1.638747 1.2208347
## [3,] 1.0692735 1.891997 2.520242 1.8370461
## 
## , , 4
## 
##           [,1]      [,2]      [,3]      [,4]
## [1,] 0.2425210 0.6192662 0.6885483 0.5889582
## [2,] 0.6222098 0.8816886 1.2208347 1.1779506
## [3,] 0.8469089 1.2578949 1.8370461 1.7613251
mean((qdac$covariance-df$covariance)^2)
## [1] 0.04631098

Great, those estimates look okay. How are the predictions?

df = generate_qda_clusters(2,5,400)
xvals = df$data[,1:5]
yvals = df$data[,6]
#shuffle our data
num_samples = dim(xvals)[1]
perm = sample(1:num_samples)
xvals = xvals[perm,]
yvals = yvals[perm,]

#split it into training and test sets
split = (num_samples * 9) %/% 10
tr_xvals = xvals[1:split,]
test_xvals = xvals[(split+1):num_samples, ]
tr_yvals = yvals[1:split,]
test_yvals = yvals[(split+1):num_samples,]

#Fit our classifier
qdac = QDAClassifier(tr_xvals, tr_yvals)
tr_preds = as.matrix(apply(tr_xvals, 1, qdac$prediction))
test_preds = as.matrix(apply(test_xvals,1,qdac$prediction))

# Compare with LDA
ldac = LDAClassifier(tr_xvals, tr_yvals)
tr_preds2 = as.matrix(apply(tr_xvals, 1, ldac$prediction))
test_preds2 = as.matrix(apply(test_xvals,1,ldac$prediction))

Let’s see how we did:

output = tibble(LDA = c(mean(tr_preds2 == tr_yvals), mean(test_preds2 == test_yvals)), QDA = c(mean(tr_preds == tr_yvals), mean(test_preds == test_yvals)))
output
## # A tibble: 2 x 2
##         LDA       QDA
##       <dbl>     <dbl>
## 1 0.8166667 0.9819444
## 2 0.8500000 0.9500000
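
Again as an optional cross-check (not in the original lab), MASS::qda has the same interface, so its test accuracy can be compared directly with the table above:

# Sketch: MASS::qda on the same train/test split
mass_qda = MASS::qda(x = as.matrix(tr_xvals), grouping = tr_yvals$y)
mean(predict(mass_qda, as.matrix(test_xvals))$class == test_yvals$y)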

Digits. For this we will first apply a PCA dimension reduction to our data. This both makes the problem more tractable and removes the nearly constant pixel directions that would otherwise make the estimated covariance matrices singular.

library(keras)
mnist = dataset_mnist()
to_sample = 1:10000
x_train = mnist$train$x
x_train = array_reshape(x_train, c(nrow(x_train),784))
pca = prcomp(x_train[to_sample,], rank.=30)
tt = pca$x
y_train = mnist$train$y+1
ldac = LDAClassifier(tt, y_train[to_sample])
qdac = QDAClassifier(tt, y_train[to_sample])
tr_preds = apply(tt, 1, ldac$prediction)
tr_preds2 = apply(tt, 1, qdac$prediction)
lda_tr = mean(tr_preds == y_train[to_sample])
qda_tr = mean(tr_preds2 == y_train[to_sample])
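
As a quick side check (not part of the original pipeline), we can look at how much of the pixel variance the 30 retained components capture; prcomp keeps the standard deviations of all components even when rank. truncates the rotation:

# Proportion of total variance explained by the first 30 principal components
sum(pca$sdev[1:30]^2) / sum(pca$sdev^2)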

Try this on the test data:

to_test = 1:10000
x_test = mnist$test$x
x_test = array_reshape(x_test, c(nrow(x_test),784))
ttt = predict(pca, x_test[to_test,])
y_test = mnist$test$y+1
test_preds = apply(ttt, 1, ldac$prediction)
test_preds2 = apply(ttt, 1, qdac$prediction)
lda_test = mean(test_preds == y_test[to_test])
qda_test = mean(test_preds2 == y_test[to_test])
output = tibble(LDA = c(lda_tr, lda_test), QDA = c(qda_tr, qda_test))
output
## # A tibble: 2 x 2
##      LDA    QDA
##    <dbl>  <dbl>
## 1 0.8551 0.9654
## 2 0.8556 0.9566
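
A natural follow-up (not in the original writeup) is to see which digits the QDA model confuses on the test set, via a simple confusion matrix:

# Cross-tabulate QDA test predictions against the true labels (both coded 1-10)
table(predicted = test_preds2, actual = y_test[to_test])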