This notebook implements entity embeddings for categorical variables, following the paper "Entity Embeddings of Categorical Variables": https://arxiv.org/pdf/1604.06737.pdf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Kaggle Tabular Playground Series (Aug 2022) training data.
df_train = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv', low_memory=False)
df_train.head(10)
df_train.columns
# Inspect missing values per column (several measurement columns have NAs).
df_train.isna().sum()
# Target column: binary product-failure indicator.
dep_var = 'failure'
# 'id' is a row identifier with no predictive value.
df_train = df_train.drop('id', axis=1)
# Split columns into continuous vs. categorical names; columns with more than
# max_card distinct values are treated as continuous.
cont,cat = cont_cat_split(df_train, max_card=9000, dep_var=dep_var)
cont, cat
# Random train/validation split (fastai default fraction), seeded for reproducibility.
splits = RandomSplitter(seed=42)(df_train)
# Preprocessing for the neural net: integer-code categoricals, fill missing
# values (FillMissing also adds _na indicator columns), normalize continuous.
procs_nn = [Categorify, FillMissing, Normalize]
# Cast the target to float so the tabular learner treats it as regression.
df_train = df_train.astype({dep_var: np.float16})
to_nn = TabularPandas(df_train, procs_nn, cat, cont,
splits=splits, y_names=dep_var)
# Build DataLoaders with batch size 1024.
dls = to_nn.dataloaders(1024)
len(to_nn.train), len(to_nn.valid)
# Sanity check: target range should be [0, 1].
to_nn.train.y.min(), to_nn.train.y.max()
# Fix: roc_auc_score is used below but never imported — the top-of-file
# import only pulls from sklearn.ensemble, and the fastai star import does
# not export the sklearn function. Import it explicitly here.
from sklearn.metrics import roc_auc_score

# Regression-style net on the binary target: the output is squeezed into
# [0, 1] via y_range (sigmoid) and trained with MSE, so the prediction can
# be read as a failure probability and scored with ROC AUC.
learn = tabular_learner(dls, y_range=(0,1),layers=[500,250], n_out=1, loss_func=F.mse_loss)
# Learning-rate finder (diagnostic plot only; lr below is chosen manually).
learn.lr_find()
# Train 10 epochs with one-cycle scheduling at lr=1e-3.
learn.fit_one_cycle(10, 1e-3)
# Predictions on the validation set.
preds,targs = learn.get_preds()
roc_auc_score(targs, preds)
def r_mse(pred, y):
    """Root-mean-squared error between `pred` and `y`, rounded to 6 decimals."""
    err = pred - y
    return round(math.sqrt((err * err).mean()), 6)
def m_rmse(m, xs, y):
    """Rounded RMSE of model `m`'s predictions on `xs` against targets `y`."""
    preds = m.predict(xs)
    return r_mse(preds, y)
# Peek at the processed (integer-coded, normalized) training features.
to_nn.train.xs
# Pull out feature frames and targets for the random-forest baseline.
train_split, valid_split = to_nn.train, to_nn.valid
xs, y = train_split.xs, train_split.y
valid_xs, valid_y = valid_split.xs, valid_split.y
def rf(xs, y, n_estimators=100, max_samples=21_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit and return a RandomForestRegressor on (xs, y) with OOB scoring.

    Bug fix: **kwargs were accepted but silently dropped; they are now
    forwarded to RandomForestRegressor (e.g. random_state=..., max_depth=...).
    """
    return RandomForestRegressor(
        n_jobs=-1, n_estimators=n_estimators, max_samples=max_samples,
        max_features=max_features, min_samples_leaf=min_samples_leaf,
        oob_score=True, **kwargs,
    ).fit(xs, y)
# Baseline random forest on the same processed features the net used.
m = rf(xs, y);
# Training RMSE (optimistic)...
m_rmse(m, xs, y)
# ...vs. validation RMSE.
m_rmse(m, valid_xs, valid_y)
# Preprocessing for the tree models: no Normalize (trees are scale-invariant).
procs = [Categorify, FillMissing]
# Positional 5th arg is y_names (the target column).
to = TabularPandas(df_train, procs, cat, cont, dep_var, splits = splits)
# Categorical names used when extracting embeddings below.
# NOTE(review): add_embeds assumes cat_x[i] lines up with learn.embeds[i];
# FillMissing can add extra _na categorical columns to the learner's
# embedding list — verify the ordering matches.
cat_x = cat
# Inspect the embedding matrix of the first categorical variable.
learn.embeds[0].weight
def add_embeds(learn, x, cat_names=None):
    """Replace each categorical column of `x` with the learner's trained
    embedding vectors for that column.

    Parameters
    ----------
    learn : fastai tabular learner; `learn.embeds[i]` must be the embedding
        module for the i-th name in `cat_names`.
    x : DataFrame of integer-coded features (e.g. `to.train.xs`).
    cat_names : ordered list of categorical column names; defaults to the
        module-level `cat_x` for backward compatibility.

    NOTE(review): assumes cat_names[i] aligns with learn.embeds[i];
    FillMissing-generated `_na` columns may break this — verify.

    Returns a new DataFrame; the input frame is not modified.
    """
    if cat_names is None:
        cat_names = cat_x
    x = x.copy()
    for i, name in enumerate(cat_names):  # renamed: `cat` shadowed the module-level list
        emb = learn.embeds[i]
        codes = torch.tensor(x[name].values, dtype=torch.int64)  # CPU tensor of category codes
        # Look up embedding rows without tracking gradients: the original fed a
        # grad-requiring tensor to pd.DataFrame, which raises
        # "Can't call numpy() on Tensor that requires grad".
        with torch.no_grad():
            emb_data = emb(codes).cpu().numpy()
        emb_cols = [f'{name}_{j}' for j in range(emb_data.shape[1])]
        emb_df = pd.DataFrame(emb_data, index=x.index, columns=emb_cols)
        x = x.drop(columns=name)
        x = x.join(emb_df)
    return x
# Replace categorical codes with learned embedding vectors for the tree models.
emb_xs = add_embeds(learn, to.train.xs)
emb_valid_xs = add_embeds(learn, to.valid.xs)
# Full (train + valid) embedded frame, used for cross-validation below.
emb_xs_full = add_embeds(learn, to.xs)
emb_xs_full
# Compare shapes before/after: embedding columns expand the feature count.
to.train.xs.head(2), to.train.xs.shape
emb_xs.head(2), emb_xs.shape
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
def RF_wrapper(Xt, yt, Xv, yv, fold=-1):
    """Train a random forest on one CV fold and report RMSE.

    Returns (model, oof, score): the fitted model, out-of-fold predictions
    on Xv, and the validation RMSE.
    """
    # Announce the fold *before* the (slow) fit — the original printed it
    # only after training had already finished.
    print(f'Training fold {fold}...')
    model = RandomForestRegressor(n_jobs=-1, n_estimators=100,
                                  max_samples=15000, max_features=0.5,
                                  min_samples_leaf=5, oob_score=False).fit(Xt, yt)
    score_train = np.sqrt(mean_squared_error(model.predict(Xt), yt))
    oof = model.predict(Xv)
    score = np.sqrt(mean_squared_error(oof, yv))
    # Label fixed: sqrt(MSE) on raw targets is RMSE, not RMSLE.
    print(f'Fold {fold}: training RMSE: {score_train}, validation RMSE: {score}\n')
    return model, oof, score
def perform_CV(model_wrap, xs, ys, n_splits=3, random_state=None):
    """K-fold cross-validation driver.

    Parameters
    ----------
    model_wrap : callable(Xt, yt, Xv, yv, fold) -> (model, oof, score)
    xs : DataFrame of features (rows selected positionally via .iloc)
    ys : positionally indexable targets — NOTE(review): `ys[idx]` is
        label-based for a pandas Series; confirm callers pass a
        default-indexed target or an array.
    n_splits : number of folds.
    random_state : seed for the shuffled KFold. Default None preserves the
        original (non-reproducible) behavior; pass an int for repeatability.

    Returns (models, scores, oof_total).
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = []
    scores = []
    oof_total = np.zeros(xs.shape[0])
    for fold, (train_idx, val_idx) in enumerate(kf.split(xs), start=1):
        Xt = xs.iloc[train_idx]
        yt = ys[train_idx]
        Xv, yv = xs.iloc[val_idx], ys[val_idx]
        model, oof, score = model_wrap(Xt, yt, Xv, yv, fold)
        models.append(model)
        scores.append(score)
        oof_total[val_idx] = oof
    print('Training completed.')
    # Labels fixed: these are RMSE values (no log transform), not RMSLE.
    print(f'> Mean RMSE across folds: {np.mean(scores)}, std: {np.std(scores)}')
    print(f'> OOF RMSE: {np.sqrt(mean_squared_error(ys, oof_total))}')
    return models, scores, oof_total
# Target and feature columns for CV on the raw (non-embedded) features.
y_x = to["failure"]
# Feature names: categorical columns first, then continuous — same order
# the original append loops produced.
p = list(to.cat_names) + list(to.cont_names)
to_x = to[p]
%%time
# 3-fold CV with random forests on the integer-coded (non-embedded) features.
n_splits = 3
models, _, _ = perform_CV(RF_wrapper, to_x, y_x, n_splits=n_splits)
# Per-fold feature importances (features as rows after transpose),
# averaged across folds and plotted.
fold_labels = [f'Fold {i}' for i in range(1, n_splits + 1)]
imp_rows = [m.feature_importances_ for m in models]
importance = pd.DataFrame(imp_rows, columns=p, index=fold_labels).T
importance['Average importance'] = importance.mean(axis=1)
importance.sort_values(by='Average importance', ascending=False, inplace=True)
plt.figure(figsize=(10, 7))
sns.barplot(x='Average importance', y=importance.index, data=importance);
%%time
# Same CV, now on features where categoricals are replaced by embeddings.
models_emb, _, _ = perform_CV(RF_wrapper, emb_xs_full, y_x, n_splits=n_splits)
# Columns now include the expanded embedding dimensions (name_0, name_1, ...).
emb_xs.columns
# Importances for the embedding-based forests; keep only the top 20 features.
fold_index = [f'Fold {i}' for i in range(1, n_splits + 1)]
importance = pd.DataFrame(
    [m.feature_importances_ for m in models_emb],
    columns=emb_xs.columns,
    index=fold_index,
).T
importance['Average importance'] = importance.mean(axis=1)
importance = importance.sort_values(by='Average importance', ascending=False).head(20)
plt.figure(figsize=(10, 7))
sns.barplot(x='Average importance', y=importance.index, data=importance);