Credit Card Fraud Detection
__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (precision_score, recall_score, confusion_matrix,
                             classification_report, log_loss)
from sklearn.preprocessing import StandardScaler
from sklearn.utils import compute_class_weight
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import BaseLogger, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
import seaborn as sns
# Reading dataset
input_data = pd.read_csv("creditcard.csv")
input_data = input_data.drop(['Time', 'Amount'], axis=1)  # Testing shows that keeping the `Time` column decreases model accuracy
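An alternative worth testing, not used in the runs below: keep `Amount` but standardize it so it sits on the same scale as the PCA components V1-V28 (a minimal sketch; `alt_data` is a hypothetical name).
alt_data = pd.read_csv("creditcard.csv").drop(['Time'], axis=1)
alt_data['Amount'] = StandardScaler().fit_transform(alt_data[['Amount']])  # rescale instead of dropping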
# Setting labels and features
y = input_data['Class']
X = input_data.drop(['Class'], axis=1)
# Shuffle and split the data into training and testing subsets
X_training, X_testing, y_training, y_testing = train_test_split(X, y, test_size = 0.4, random_state = 0)
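With only 492 fraud rows out of 284,807, the fraud rate in each subset can drift under a purely random split. A stratified variant (a sketch, not what was run here; `X_tr` etc. are hypothetical names) keeps the class ratio fixed in both subsets:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4, random_state=0, stratify=y)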
# Classifying with Logistic Regression
lm_classifier = LogisticRegression()
lm_classifier.fit(X_training, y_training)  # fit on the training subset only; fitting on all of X would leak the test rows
y_pred = lm_classifier.predict(X_training)  # in-sample predictions; a held-out check follows below
lm_classifier.score(X_testing, y_testing)  # test-set accuracy (misleading alone: an all-negative model already scores ~99.8% here)
precision = precision_score(y_training, y_pred, average='binary')
recall = recall_score(y_training, y_pred, average='binary')
cm = confusion_matrix(y_training, y_pred)
df_cm = pd.DataFrame(cm, index=['Actual Neg', 'Actual Pos'])  # rows of a confusion matrix are the actual classes
df_cm.columns = ['Predicted Neg', 'Predicted Pos']
print("\n Precision: ", precision, "\n Recall: ", recall)
sns.heatmap(df_cm, annot=True, fmt="d")
Precision: 0.868544600939 Recall: 0.631399317406
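The precision and recall above are computed on the training rows. A fairer check scores the held-out split (a minimal sketch reusing the fitted `lm_classifier`; the numbers will differ from the in-sample ones printed above):
y_pred_test = lm_classifier.predict(X_testing)
print("Test precision:", precision_score(y_testing, y_pred_test, average='binary'))
print("Test recall:   ", recall_score(y_testing, y_pred_test, average='binary'))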
# Classifying with Random Forest
rf_classifier = RandomForestClassifier()
rf_classifier = rf_classifier.fit(X_training, y_training)
y_pred = rf_classifier.predict(X_training)  # in-sample predictions again; near-perfect scores are expected from a forest on its own training data
rf_classifier.score(X_testing, y_testing)  # test-set accuracy
precision = precision_score(y_training, y_pred, average='binary')
recall = recall_score(y_training, y_pred, average='binary')
cm3 = confusion_matrix(y_training, y_pred)
df_cm3 = pd.DataFrame(cm3, index=['Actual Neg', 'Actual Pos'])
df_cm3.columns = ['Predicted Neg', 'Predicted Pos']
print("\n Precision: ", precision, "\n Recall: ", recall)
sns.heatmap(df_cm3, annot=True, fmt="d")
Precision: 0.996402877698 Recall: 0.945392491468
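The forest above uses default settings. One hedged variant for this skewed data (not what produced the numbers above) is to weight classes inversely to their frequency:
rf_balanced = RandomForestClassifier(class_weight='balanced', random_state=0)  # hypothetical variant
rf_balanced.fit(X_training, y_training)
print("Balanced-RF test recall:", recall_score(y_testing, rf_balanced.predict(X_testing)))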
# Re-Importing and normalizing data with StandardScaler
input_data = pd.read_csv('creditcard.csv')
input_data.iloc[:, 1:29] = StandardScaler().fit_transform(input_data.iloc[:, 1:29])  # scales V1-V28 (note: fit on all rows, so the test folds influence the scaling)
data_matrix = input_data.values  # .as_matrix() was removed from pandas; .values is the replacement
X = data_matrix[:, 1:29]  # the 28 PCA components V1-V28
Y = data_matrix[:, 30]    # the Class label
class_weights = dict(zip([0, 1], compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=Y)))
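For reference, 'balanced' weighting assigns each class w_c = n_samples / (n_classes * n_c), so the rare fraud class is weighted up heavily. A quick manual check (assuming the dataset's usual 492 fraud rows):
n = len(Y)
w0 = n / (2 * (Y == 0).sum())  # ~0.50 for legitimate transactions
w1 = n / (2 * (Y == 1).sum())  # ~289 for fraud: each fraud example counts ~289x in the loss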
# k-fold cross-validation
seed = 3
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
cvscores = []
predictions = np.empty(len(Y))
predictions[:] = np.nan
proba = np.empty([len(Y), kfold.n_splits])
proba[:] = np.nan
k = 0
for train, test in kfold.split(X, Y):
    # Simple feed-forward net: 28 -> 22 -> 1 with dropout between layers
    model = Sequential()
    model.add(Dense(28, input_dim=28))
    model.add(Dropout(0.2))
    model.add(Dense(22))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    bl = BaseLogger()
    cp = ModelCheckpoint(filepath="checkpoint.hdf5", verbose=1, save_best_only=True)
    es = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=8, verbose=0, mode='auto')
    rlop = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=8, min_lr=0.001)
    tb = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    model.fit(X[train], Y[train], batch_size=1000, epochs=100, verbose=0, shuffle=True,
              validation_data=(X[test], Y[test]), class_weight=class_weights,
              callbacks=[bl, cp, es, rlop, tb])  # `nb_epoch` was renamed `epochs` in Keras 2
    # Current iteration score
    scores = model.evaluate(X[test], Y[test], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    cvscores.append(scores[1] * 100)
    # Storing the probabilities predicted for this fold's *training* rows
    proba[train, k] = model.predict_proba(X[train]).flatten()
    k += 1
[Training log, condensed: in each of the three folds, val_loss improves over the first few epochs and then plateaus until early stopping fires. Best checkpoints per fold: val_loss 0.09540, 0.10580, 0.10872; held-out binary_accuracy 97.42%, 97.58%, 97.53%.]
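Note that the storage line in the loop writes each fold's training rows into `proba`, so the averaged predictions below mix in in-sample scores. An out-of-fold variant (a sketch, not what was run) would replace that line so each transaction is scored once, by the model that never trained on it:
proba[test, k] = model.predict_proba(X[test]).flatten()  # hypothetical out-of-fold storage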
pred = (np.nanmean(proba, axis=1) > 0.5).astype(int)  # average the stored fold probabilities, then threshold at 0.5
print(classification_report(Y, pred))
# Cross-tabulate actual labels against predictions
pd.crosstab(Y, pred, rownames=['Actual values'], colnames=['Predictions'])
             precision    recall  f1-score   support

        0.0       1.00      0.98      0.99    284315
        1.0       0.06      0.92      0.12       492

avg / total       1.00      0.98      0.99    284807
Predictions | 0 | 1 |
---|---|---|
Actual 0.0 | 277752 | 6563 |
Actual 1.0 | 40 | 452 |
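At the default 0.5 cutoff the network recovers 452 of the 492 frauds (92% recall) at the cost of 6,563 false alarms (~6% precision). If false positives are expensive, the threshold can be swept over the averaged probabilities (a hedged sketch using the arrays defined above):
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(Y, np.nanmean(proba, axis=1))
for p, r, t in zip(prec[::20], rec[::20], np.append(thr, 1.0)[::20]):
    print("threshold %.2f -> precision %.2f, recall %.2f" % (t, p, r))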