Problem Statement: Given a list of collision events and their measured properties, predict whether a τ → 3μ decay occurred in each collision. This decay is assumed under current physics to be (practically) unobservable, so the goal of this competition is to detect τ → 3μ occurring more frequently than the current scientific understanding allows.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# List every file shipped with the competition under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Standard library
import math
import time  # needed by the random-forest timing loop below (was missing)

# Third-party: core data stack and plotting
import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from IPython.display import display

# scikit-learn
from sklearn import metrics
import sklearn.metrics as metrics  # rebinds the same name; kept for compatibility
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              RandomForestRegressor)
from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, f1_score, precision_recall_curve,
                             precision_score, recall_score, roc_auc_score,
                             roc_curve)
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
# Widen numpy / pandas display so the wide physics feature tables print on one line.
np.set_printoptions(linewidth=140)
pd.set_option('display.width', 140)

# Load the competition data (zipped CSVs), indexed by event id.
path = '/kaggle/input/flavours-of-physics-kernels-only/'
train = pd.read_csv(path + 'training.csv.zip', index_col='id')
test = pd.read_csv(path + 'test.csv.zip', index_col='id')

# Quick exploratory look: sample rows, shapes, missing values, summary stats.
train.head(3)
train.shape, test.shape
train.isna().sum()
train.describe()
# All numeric candidate features: decay geometry/kinematics plus isolation and
# track-quality variables for the three muon candidates p0/p1/p2.
all_num_cols = [
    'LifeTime', 'dira', 'FlightDistance', 'FlightDistanceError', 'IP', 'IPSig',
    'VertexChi2', 'pt', 'DOCAone', 'DOCAtwo', 'DOCAthree', 'IP_p0p2', 'IP_p1p2',
    'isolationa', 'isolationb', 'isolationc', 'isolationd', 'isolatione',
    'isolationf', 'iso', 'CDF1', 'CDF2', 'CDF3', 'ISO_SumBDT', 'p0_IsoBDT',
    'p1_IsoBDT', 'p2_IsoBDT', 'p0_track_Chi2Dof', 'p1_track_Chi2Dof',
    'p2_track_Chi2Dof', 'p0_IP', 'p1_IP', 'p2_IP', 'p0_IPSig', 'p1_IPSig',
    'p2_IPSig', 'p0_pt', 'p1_pt', 'p2_pt', 'p0_p', 'p1_p', 'p2_p', 'p0_eta',
    'p1_eta', 'p2_eta', 'SPDhits']

# Target column ('signal') and raw feature matrix.
y = train["signal"]
x = train[all_num_cols]

# Standardise every feature to zero mean / unit variance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(x), columns=all_num_cols)
X_scaled.var()
X_scaled.head()

# Hold out 25% of the training rows for validation.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
train.shape, x.shape, X_train.shape, X_val.shape
print("Random Forest")
rf = RandomForestClassifier(n_estimators=100, random_state=11)
rf.fit(X_train, y_train)
def rf_feat_importance(m, df):
    """Return a DataFrame pairing *df*'s column names ('cols') with the fitted
    model *m*'s feature importances ('imp'), sorted most-important first."""
    table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return table.sort_values('imp', ascending=False)
# Rank all features by forest importance and display the ten strongest.
fi = rf_feat_importance(rf, X_train)
fi.sort_values("imp", ascending=False)[:10]
def plot_fi(fi):
    """Horizontal bar chart of a feature-importance table ('cols' vs 'imp')."""
    return fi.plot('cols', 'imp', 'barh', figsize=(12, 7), legend=False)

# Visualise the twenty most important features.
plot_fi(fi[:20]);
selected_cols = ["IPSig", "IP", "dira", "SPDhits", "VertexChi2", "p0_track_Chi2Dof", "ISO_SumBDT", "iso", "LifeTime", "p0_IsoBDT", "p0_IPSig", "isolatione" ]
X_train = X_train[selected_cols]
X_train.shape
X_train.hist(bins=30, figsize=(20, 14))
y_train.hist()
X_train['SPDhits'] = np.log(X_train['SPDhits'])
X_train['p0_IPSig'] = np.log(X_train['p0_IPSig'])
X_train["SPDhits"].hist(bins=30, figsize=(5, 3))
X_train["p0_IPSig"].hist(bins=30, figsize=(5, 3))
plt.figure(figsize=(15,10))
sns.heatmap(X_train.corr(), annot=True, cbar=False, fmt='.1f', cmap='summer_r')
plt.show()
def r_mse(pred, y):
    """Root-mean-squared error of *pred* against *y*, rounded to 6 decimals."""
    mse = ((pred - y) ** 2).mean()
    return round(math.sqrt(mse), 6)
def con_matrix(y_val, val_preds_bin):
    """Plot a row-normalised confusion matrix for binary predictions.

    y_val         -- true labels (assumes 0 = background, 1 = signal, per the
                     'signal' target column — TODO confirm against the data docs)
    val_preds_bin -- hard 0/1 predictions
    """
    matrix = confusion_matrix(y_val, val_preds_bin)
    # Normalise each row so cells show the fraction of that true class.
    matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]
    # Build the plot
    plt.figure(figsize=(8,5))
    sns.set(font_scale=1.4)
    sns.heatmap(matrix, annot=True, annot_kws={'size':10},
                cmap=plt.cm.Greens, linewidths=0.2)
    # Axes follow sorted label order [0, 1], i.e. background first.
    # Fixed: labels were previously ['Signal', 'Background'], which swapped
    # the two classes on both axes.
    class_names = ['Background', 'Signal']
    tick_marks = np.arange(len(class_names))
    tick_marks2 = tick_marks + 0.5
    plt.xticks(tick_marks, class_names, rotation=0)
    plt.yticks(tick_marks2, class_names, rotation=0)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.title('Confusion Matrix for Ensemble Model')
    plt.show()
def metric_report(y_val, pred_val, model_name="Random_Forest"):
    """Print accuracy, precision, recall and F1 for hard 0/1 predictions.

    y_val      -- true labels
    pred_val   -- predicted labels
    model_name -- label included in each printed line. Defaults to
                  "Random_Forest" for backward compatibility, but the report
                  is also called for the XGBoost, logistic-regression and
                  ensemble predictions, where the old hard-coded label was
                  misleading — pass the real model name there.
    """
    print(f"Accuracy {model_name}:", accuracy_score(y_val, pred_val))
    print(f"Precision {model_name}:", precision_score(y_val, pred_val))
    print(f"Recall {model_name}:", recall_score(y_val, pred_val))
    print(f"F1 Score {model_name}:", f1_score(y_val, pred_val))
def roc_curvePlot(y_val, val_preds):
    """Plot signal efficiency (TPR) against background rejection (1 - FPR),
    annotated with the ROC AUC of *val_preds* versus *y_val*."""
    fp_rate, tp_rate, _thresholds = metrics.roc_curve(y_val, val_preds)
    area = metrics.auc(fp_rate, tp_rate)
    import matplotlib.pyplot as plt
    plt.style.use('default')
    plt.title('ROC curve')
    # Physics convention: efficiency on x, rejection on y.
    plt.plot(tp_rate, 1 - fp_rate, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [1, 0], 'r--')  # diagonal = random classifier
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('Signal efficiency (TPR)')
    plt.ylabel('Background rejection (1-FPR)')
    plt.show()
# Sanity baseline: a single decision-tree regressor on the selected features.
m = DecisionTreeRegressor()
m.fit(X_train, y_train);

# Restrict the validation set to the same selected columns.
X_val = X_val[selected_cols]
X_val.shape

# RMSE of the tree's raw predictions on the validation set.
pred_val = m.predict(X_val)
y_val.shape, pred_val.shape
r_mse(pred_val, y_val)
print("Random Forest")
trees = [10, 50, 100, 150, 200,250, 300,350, 400,450, 500, 550, 600, 650]
err = []
tm = []
for t in trees:
start = time.time()
print("Preparing trees: ", t)
rf = RandomForestClassifier(n_estimators=t, random_state=11)
rf.fit(X_train, y_train)
pred_val_rf = rf.predict(X_val)
err.append(r_mse(pred_val_rf, y_val))
end = time.time()
tm.append(end-start)
print(f" For Tree with depth {i}, it took {end - start} second")
plt.plot(trees, err)
plt.plot(trees, tm)
plt.plot([r_mse(pred_val[:i+1].mean(0), y_val) for i in range(200)]);
metric_report(y_val, pred_val_rf)
confusion_matrix(y_val, pred_val_rf)
con_matrix(y_val, pred_val_rf)
print("XGBoost")
params = {"objective": "binary:logistic",
"base_Score": 0.5,
"eta": 0.3,
"max_depth": 5,
"min_child_weight": 3,
"silent": 1,
"subsample": 0.7,
"colsample_bytree": 0.7,
"seed": 1}
num_trees=200
gbm = xgb.train(params, xgb.DMatrix(X_train, y_train), num_trees)
pred_gbm = gbm.predict(xgb.DMatrix(X_val))
r_mse(pred_gbm, y_val)
pred_gbm
pred_gbm_bin =[]
for val in pred_gbm:
if val > 0.5:
pred_gbm_bin.append(1)
else:
pred_gbm_bin.append(0)
metric_report(y_val, pred_gbm_bin)
confusion_matrix(y_val, pred_gbm_bin)
# Third model: plain logistic regression on the same selected features.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
pred_lr = clf.predict(X_val)
metric_report(y_val, pred_lr)

# Ensemble: average the three models' hard 0/1 votes.
val_preds = (pred_lr + pred_gbm_bin + pred_val_rf) / 3
val_rmse = r_mse(val_preds, y_val)
val_rmse
np.unique(val_preds)

# Majority vote: at least two of the three models must predict "signal".
val_preds_bin = [1 if val >= 0.5 else 0 for val in val_preds]
metric_report(y_val, val_preds_bin)

# classification_report assigns target_names in sorted-label order [0, 1].
# Fixed: names were previously ['signal', 'Background'], which labelled
# class 0 as signal — the 'signal' target is 1 for the signal class.
target_names = ['Background', 'Signal']
print(classification_report(y_val, val_preds_bin, target_names=target_names))
con_matrix(y_val, val_preds_bin)
roc_curvePlot(y_val, val_preds_bin)
# Build the test-set feature matrix with the same selected columns.
X_test = test[selected_cols]
cols = X_test.columns
# NOTE(review): the training pipeline scaled ALL 46 columns and then applied
# log transforms to SPDhits/p0_IPSig; re-fitting a scaler on only these 12
# test columns does not reproduce that pipeline, so train and test features
# are not on identical scales. A fresh local scaler is used here (instead of
# calling sc.fit_transform) so the training-set scaler is at least not
# clobbered; the pipeline mismatch itself still needs a proper fix.
test_scaler = StandardScaler()
X_scaled_test = pd.DataFrame(test_scaler.fit_transform(X_test), columns=cols)

# Fixed: the original called xgb.predict(...) — `xgb` is the xgboost MODULE
# (no predict attribute, AttributeError). The trained booster is `gbm`, and
# its predict() consumes a DMatrix.
test_probs = gbm.predict(xgb.DMatrix(X_scaled_test))
test_probs
np.unique(test_probs)
test.shape

# Kaggle submission file: event id plus predicted signal probability.
p = pd.read_csv(path + 'test.csv.zip')
df_submit = pd.DataFrame()
df_submit["id"] = p["id"]
df_submit["prediction"] = test_probs
df_submit.to_csv('submission.csv', index=False, sep=',')