First the exercise:
Now let us load our standard libraries.
import numpy as np
import pandas as pd
Let us load the credit card dataset and extract a small dataframe of numerical features to test on.
# Read the raw credit-card data set and take a first look at it.
big_df = pd.read_csv("UCI_Credit_Card.csv")
big_df.head()
# Comparing these two lengths shows whether any row contains a missing value.
len(big_df)
len(big_df.dropna())

# Drop the `ID` column, then separate the target column from the features.
df = big_df.drop(columns=['ID'])
labels = df['default.payment.next.month']
df.drop(columns='default.payment.next.month', inplace=True)

# Split by row order: the first `num_samples` rows train, the remainder test.
num_samples = 25000
train_x, test_x = df[:num_samples], df[num_samples:]
train_y, test_y = labels[:num_samples], labels[num_samples:]
test_x.head()
train_y.head()
Now let us write our transformation class, which turns each numerical feature into boolean "value >= quantile threshold" features.
class bin_transformer(object):
    """Turn numeric columns into boolean ``value >= quantile`` indicator columns.

    The thresholds are the per-column quantiles of the frame passed to the
    constructor, taken at ``num_quantiles - 1`` evenly spaced levels running
    from ``1/num_quantiles`` to ``1 - 1/num_quantiles``.
    """

    def __init__(self, df, num_quantiles=2):
        """Compute and store the quantile thresholds of every column of ``df``.

        Args:
            df: DataFrame whose columns supply the thresholds.
            num_quantiles: Number of bins; yields ``num_quantiles - 1`` levels.
        """
        levels = np.linspace(
            1. / num_quantiles, 1. - 1. / num_quantiles, num_quantiles - 1
        )
        # Rows are indexed by quantile level, columns by feature name.
        self.quantiles = df.quantile(levels)

    @staticmethod
    def _threshold_fn(col_name, threshold):
        """Return a callable computing ``x[col_name] >= threshold``.

        A factory is used so that each callable captures its own column and
        threshold. A lambda created directly inside the loops would look the
        loop variables up at call time and only ever see their final values.
        """
        return lambda x: x[col_name] >= threshold

    def transform(self, df):
        """Build the indicator columns for ``df``.

        Args:
            df: DataFrame containing the columns to binarise. Each of its
                columns must also exist in the stored quantile table.

        Returns:
            A tuple ``(new, fns)``. ``new`` is a DataFrame with one boolean
            column per (column, quantile level) pair, named
            ``col_name + str(level)``. ``fns`` maps each of those names to
            ``(col_name, fn)``, where ``fn(frame)`` recomputes that indicator
            from a frame holding the original column.
        """
        new = pd.DataFrame()
        fns = {}
        for col_name in df.axes[1]:
            for level, threshold in self.quantiles[col_name].items():
                name = col_name + str(level)
                new[name] = (df[col_name] >= threshold)
                fns[name] = (col_name, self._threshold_fn(col_name, threshold))
        return new, fns
# Fit the thresholds with num_quantiles=5, then binarise both splits with them.
# NOTE(review): `df` still contains the held-out test rows, so the thresholds
# see test data — consider fitting on `train_x` only.
transformer = bin_transformer(df, num_quantiles=5)
train_x_t, tr_fns = transformer.transform(train_x)
test_x_t, test_fns = transformer.transform(test_x)
train_x_t.head(5)
tr_fns
Now let us build some simple loss functions for 1d labels.
def bdd_cross_entropy(pred, label):
    """Return the bounded binary cross-entropy of ``pred`` against ``label``.

    Computes ``-mean(label*log(pred) + (1-label)*log(1-pred))``. A tiny
    constant is added inside each logarithm so that predictions of exactly
    0 or 1 give a large finite value rather than ``inf``.

    The ``(1 - label)`` term was previously missing, which left predictions
    on negative examples unpenalised (always predicting 1 scored best).

    Args:
        pred: Predicted probability of the positive class; a scalar or an
            array-like that broadcasts against ``label``.
        label: Array-like of 0/1 targets.

    Returns:
        The mean cross-entropy as a float.
    """
    eps = 10 ** (-20)
    positive_term = label * np.log(pred + eps)
    negative_term = (1 - label) * np.log(1 - pred + eps)
    return -np.mean(positive_term + negative_term)
def MSE(pred, label):
    """Return the mean of the squared differences between ``pred`` and ``label``.

    ``pred`` may be a scalar or anything that broadcasts against ``label``.
    """
    residual = pred - label
    squared = residual ** 2
    return np.mean(squared)
def acc(pred, label):
    """Return the fraction of entries where the thresholded prediction matches.

    A prediction counts as positive when ``pred >= 0.5``; a label counts as
    positive when it equals 1. The result is the mean agreement of the two.
    """
    predicted_positive = pred >= 0.5
    actual_positive = label == 1
    agreement = predicted_positive == actual_positive
    return np.mean(agreement)
Now let us define the `find_split` function, which picks the single boolean feature whose split gives the lowest weighted loss.
def find_split(x, y, loss, verbose=False):
    """Pick the column of ``x`` whose two-way split of ``y`` has the lowest loss.

    Each column of ``x`` is used as a boolean mask. Both sides of the split are
    predicted by the mean of their own labels, and the two losses are averaged
    with weights proportional to the side sizes. A column is selected only if
    its weighted loss is strictly below the loss of predicting ``mean(y)``
    for every row.

    Args:
        x: DataFrame of boolean columns (each one is negated with ``~``).
        y: Labels, indexable by the boolean columns of ``x``.
        loss: Callable ``loss(pred, label)`` returning a number to minimise.
        verbose: When truthy, print the improvement over the baseline for
            every column.

    Returns:
        ``(column_name, loss_value)``; ``column_name`` is ``None`` when no
        column beats the baseline, in which case ``loss_value`` is the
        baseline loss.
    """
    total = len(x)
    base_loss = loss(np.mean(y), y)
    best_col, best_loss = None, base_loss

    for col_name in x.axes[1]:
        inside = x[col_name]
        n_inside = np.sum(inside)
        n_outside = total - n_inside

        y_inside = y[inside]
        y_outside = y[~inside]
        mean_inside = np.mean(y_inside)
        mean_outside = np.mean(y_outside)

        # Size-weighted average of the per-side losses.
        weighted = (n_inside * loss(mean_inside, y_inside)
                    + n_outside * loss(mean_outside, y_outside))
        split_loss = weighted / total

        if verbose:
            print("Column {0} split has improved loss {1}".format(col_name, base_loss - split_loss))

        # A NaN loss (e.g. from an empty side) fails this comparison and is skipped.
        if split_loss < best_loss:
            best_col, best_loss = col_name, split_loss

    return best_col, best_loss
# Best single split under each loss; only the MSE run prints per-column details.
find_split(train_x_t, train_y, MSE, verbose=True)
find_split(train_x_t, train_y, bdd_cross_entropy, verbose=False)
# NOTE(review): find_split minimises its loss, but `acc` is higher-is-better —
# confirm this call is intended.
find_split(train_x_t, train_y, acc, verbose=False)

# Mean label on each side of two of the candidate splits.
train_y[train_x_t['PAY_00.8']].mean()
train_y[~train_x_t['PAY_00.8']].mean()
train_y[train_x_t['AGE0.2']].mean()
train_y[~train_x_t['AGE0.2']].mean()