Credit Card Fraud Detection
# Clear the workspace (force, no confirmation prompt)
%reset -f
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Load the credit card transactions dataset
data = pd.read_csv("/Users/marin/Documents/TEACHING/2526/M2-GLM-HAX912X/TP/creditcard.csv")
data.head()
# Features = every column except the fraud label 'Class'
X = data.loc[:, data.columns != 'Class']
y = data['Class']
# Class counts: frauds are a tiny minority
y.value_counts()
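The counts above show a severe imbalance: frauds are a tiny fraction of all transactions. A quick sketch to quantify the fraud rate directly:

# Fraction of fraudulent transactions (Class == 1) in the full dataset
print(f"Fraud rate: {y.mean():.5f}")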
# Standardize every feature (zero mean, unit variance) and keep float32
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X).astype(np.float32)
# Undersampling: keep every fraud and an equal-size random sample of normals
number_records_fraud = len(data[y == 1])
fraud_indices = data[y == 1].index
normal_indices = data[y == 0].index
# No seed is set here, so the undersample changes from run to run
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
under_sample_indices = np.concatenate((fraud_indices, random_normal_indices))
under_sample_data = data.iloc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data['Class']
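Note that X_undersample is taken from the raw data frame, so it keeps the original feature scale (X itself was standardized above). A possible variant, not used below, is to reuse the fitted scaler on the undersampled matrix:

# Optional variant (not used below): put the undersampled features
# on the same standardized scale as X
X_undersample_scaled = scaler.transform(X_undersample).astype(np.float32)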
# 70/30 split of the full (imbalanced) data and of the undersampled data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2003)
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=2005)
print("Train:", pd.Series(y_train).value_counts().to_dict())
print("Test:", pd.Series(y_test).value_counts().to_dict())
print("Train undersample:", pd.Series(y_train_undersample).value_counts().to_dict())
print("Test undersample:", pd.Series(y_test_undersample).value_counts().to_dict())
# Baseline: unweighted logistic regression on the full (imbalanced) training set
logit = LogisticRegression(penalty=None)
logit.fit(X_train, y_train)
y_chap = logit.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
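With such imbalance, raw accuracy is misleading; recall on the fraud class is the number to watch. A sketch using sklearn.metrics on the predictions above:

from sklearn.metrics import classification_report, recall_score
# Per-class precision/recall; recall on class 1 is the fraction of frauds caught
print(classification_report(y_test, y_chap, digits=4))
print("Fraud recall:", recall_score(y_test, y_chap))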
# Same model with class_weight="balanced": errors on the rare class cost more
logit = LogisticRegression(penalty=None, class_weight="balanced")
logit.fit(X_train,y_train)
y_chap = logit.predict(X_test)
table = pd.crosstab(y_test,y_chap)
print(table)
# Logistic regression trained on the undersampled data; these features were
# left unscaled, so we use newton-cg with a large max_iter to ensure convergence
logit = LogisticRegression(penalty=None, max_iter=5000, solver="newton-cg")
logit.fit(X_train_undersample,y_train_undersample)
y_chap_undersample = logit.predict(X_test_undersample)
table = pd.crosstab(y_test_undersample,y_chap_undersample)
print(table)
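The confusion table depends on the implicit 0.5 threshold; a threshold-free summary such as the ROC AUC can complement it. A sketch on the undersampled test set:

from sklearn.metrics import roc_auc_score
# Threshold-free ranking quality of the undersample-trained model
proba = logit.predict_proba(X_test_undersample)[:, 1]
print("ROC AUC (undersampled test):", roc_auc_score(y_test_undersample, proba))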
from sklearn.ensemble import RandomForestClassifier
from math import sqrt
# Random forest on the full training set; max_features ≈ sqrt of the 30 predictors
forest = RandomForestClassifier(n_estimators=100, max_features=int(sqrt(30)), oob_score=True, n_jobs=-1)
forest.fit(X_train,y_train)
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test,y_chap)
print(table)
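oob_score=True was requested above but never read back; the out-of-bag estimate gives a quick generalization check without touching the test set, and the fitted forest also exposes feature importances. A sketch:

# Out-of-bag accuracy estimate and the most important features
print("OOB score:", forest.oob_score_)
importances = pd.Series(forest.feature_importances_,
                        index=data.columns[data.columns != 'Class'])
print(importances.sort_values(ascending=False).head(10))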
# Same forest, this time trained on the undersampled data
forest = RandomForestClassifier(n_estimators=100, max_features=int(sqrt(30)), oob_score=True, n_jobs=-1)
forest.fit(X_train_undersample,y_train_undersample)
y_chap = forest.predict(X_test_undersample)
table = pd.crosstab(y_test_undersample,y_chap)
print(table)
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
# Use Apple's MPS backend when available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Tensors for PyTorch: float32 features, targets as an (n, 1) column
X_train_t = torch.from_numpy(X_train.astype(np.float32))
y_train_t = torch.from_numpy(y_train.values.astype(np.float32)).unsqueeze(1)
X_test_t = torch.from_numpy(X_test.astype(np.float32))
y_test_t = torch.from_numpy(y_test.values.astype(np.float32)).unsqueeze(1)
# Move the test tensors to the device once and for all
X_test_t = X_test_t.to(device)
y_test_t = y_test_t.to(device)
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True, drop_last=False)
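A quick sanity check on what the loader yields (a sketch): each batch should be a (batch_size, n_features) float matrix with a matching (batch_size, 1) target column.

# Inspect one batch: shapes and the fraction of frauds it contains
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape, yb.mean().item())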
# Reweight the positive class in the loss: pos_weight = (# negatives) / (# positives)
n_pos = (y_train == 1).sum()
n_neg = (y_train == 0).sum()
pos_weight_value = torch.tensor([n_neg / n_pos], dtype=torch.float32, device=device)
pos_weight_value.item()
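pos_weight rescales only the positive-class term of the binary cross-entropy, loss = -[pos_weight * y * log sigma(z) + (1 - y) * log(1 - sigma(z))], so a fraud missed costs pos_weight times more than a normal transaction misclassified. A small sanity check of that behaviour (illustrative values only):

# With pos_weight = w, the loss on a positive example is w times the unweighted one
z = torch.zeros(1); t = torch.ones(1); w = torch.tensor([10.0])
weighted = nn.BCEWithLogitsLoss(pos_weight=w)(z, t)
unweighted = nn.BCEWithLogitsLoss()(z, t)
print(weighted.item(), 10 * unweighted.item())  # both ≈ 10 * log(2)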
def train_model(model, epochs=50, lr=1e-3, weight_decay=0.0):
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Weighted binary cross-entropy computed on the logits
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_value.to(device))
    for epoch in range(1, epochs + 1):
        model.train()
        running = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device, dtype=torch.float32)
            yb = yb.to(device, dtype=torch.float32)
            opt.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            opt.step()
            running += loss.item() * xb.size(0)
        train_loss = running / len(train_loader.dataset)
        # Report the (weighted) test loss at the first epoch, every 5th, and the last
        if epoch == 1 or epoch % 5 == 0 or epoch == epochs:
            model.eval()
            with torch.no_grad():
                logits = model(X_test_t)
                test_loss = criterion(logits, y_test_t).item()
            print(f"Epoch {epoch:02d} | Train loss = {train_loss:.5f} | Test loss = {test_loss:.5f}")
    return model
# Logistic regression in torch = a single linear layer (one weight per feature plus a bias)
logreg_torch = nn.Linear(X_train.shape[1], 1)
sum(p.numel() for p in logreg_torch.parameters())
train_model(logreg_torch, epochs=50, lr=1e-3, weight_decay=0.0)
# Confusion table on the test set at the 0.5 probability threshold
logreg_torch.eval()
with torch.no_grad():
    logits = logreg_torch(X_test_t)
    y_pred = (torch.sigmoid(logits).squeeze(1) >= 0.5).int()
y_pred = y_pred.detach().cpu().numpy().ravel()
table = pd.crosstab(y_test, pd.Series(y_pred, index=y_test.index))
print(table)
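Because pos_weight shifts the trained scores, 0.5 is not necessarily the best cut-off. A sketch sweeping a few thresholds to see the precision/recall trade-off:

from sklearn.metrics import precision_score, recall_score
with torch.no_grad():
    proba = torch.sigmoid(logreg_torch(X_test_t)).squeeze(1).cpu().numpy()
for thr in (0.3, 0.5, 0.7, 0.9):
    pred = (proba >= thr).astype(int)
    print(f"thr={thr}: precision={precision_score(y_test, pred, zero_division=0):.3f}, "
          f"recall={recall_score(y_test, pred):.3f}")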
# MLP with three hidden layers (128 → 64 → 32) and GELU activations
mlp = nn.Sequential(
    nn.Linear(X_train.shape[1], 128),
    nn.GELU(),
    nn.Linear(128, 64),
    nn.GELU(),
    nn.Linear(64, 32),
    nn.GELU(),
    nn.Linear(32, 1)
)
sum(p.numel() for p in mlp.parameters())
train_model(mlp, epochs=80, lr=1e-3, weight_decay=1e-5)
# Confusion table for the MLP at the 0.5 threshold
mlp.eval()
with torch.no_grad():
    logits = mlp(X_test_t)
    y_pred = (torch.sigmoid(logits).squeeze(1) >= 0.5).int()
y_pred = y_pred.detach().cpu().numpy().ravel()
table = pd.crosstab(y_test, pd.Series(y_pred, index=y_test.index))
print(table)
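Beyond the confusion table, the ROC AUC lets the MLP be compared with the linear model on the same test set, independently of the threshold. A sketch:

from sklearn.metrics import roc_auc_score
with torch.no_grad():
    proba_mlp = torch.sigmoid(mlp(X_test_t)).squeeze(1).cpu().numpy()
    proba_lin = torch.sigmoid(logreg_torch(X_test_t)).squeeze(1).cpu().numpy()
print("MLP AUC:", roc_auc_score(y_test, proba_mlp))
print("Linear AUC:", roc_auc_score(y_test, proba_lin))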