# %reset
# Random-forest classification on an undersampled version of creditcard.csv.
#
# Steps performed below:
#   1. Load creditcard.csv and tabulate the 'Class' column.
#   2. Standardise every non-'Class' column and make a 70/30 train/test split
#      of the full data set.
#   3. Build a balanced subset: every row with Class == 1 plus an equally
#      sized random draw (without replacement) of rows with Class == 0.
#   4. Split that subset 70/30, fit a 1000-tree random forest on the training
#      part and cross-tabulate true vs. predicted labels on the test part.
#
# NOTE(review): bare expressions such as `data.shape` or `table` only show
# something when the file is run interactively (IPython/notebook style);
# they are kept as-is.

import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Remember the starting directory, then move to the folder holding the data.
path = os.getcwd()
os.chdir('/Users/marin/TEACHING/2324/M2-GLM-HAX912X/TP')

data = pd.read_csv("creditcard.csv")
data.shape

# Class frequencies, ordered by class label.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = data['Class'].value_counts(sort=True).sort_index()
count_classes

# Features are all columns except the target 'Class'.
X = data.loc[:, data.columns != 'Class']
y = data['Class']

# Standardise the features (scaler is fitted on the full data set).
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1974
)

print("")
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

# --- Undersampling -------------------------------------------------------
# Index labels of the Class == 1 rows and of the Class == 0 rows.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index

# Draw as many Class == 0 rows as there are Class == 1 rows.
# np.random.choice already returns an ndarray, so no extra np.array() wrap.
# NOTE(review): this draw is not seeded, so the subset changes between runs.
random_normal_indices = np.random.choice(
    normal_indices, number_records_fraud, replace=False
)

under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The indices above are index *labels* (taken from .index), so select with
# .loc rather than the positional .iloc; both agree on the default
# RangeIndex produced by read_csv, but only .loc is correct in general.
# Note that the subset is taken from `data`, i.e. the unscaled columns.
under_sample_data = data.loc[under_sample_indices, :]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data['Class']

(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=1981
)

print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print(
    "Total number of transactions: ",
    len(X_train_undersample) + len(X_test_undersample),
)

# --- Random forest -------------------------------------------------------
# max_features is floor(sqrt(number of feature columns)); it is derived from
# the data instead of the previously hard-coded 30.
forest = RandomForestClassifier(
    n_estimators=1000,
    max_features=int(sqrt(X_undersample.shape[1])),
    oob_score=True,
    n_jobs=-1,
)
forest = forest.fit(X_train_undersample, y_train_undersample)

# Predicted labels on the held-out part of the undersampled data.
y_chap = forest.predict(X_test_undersample)

# Confusion table: rows = true class, columns = predicted class.
table = pd.crosstab(y_test_undersample, y_chap)
table