# %reset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the credit card fraud data set
data = pd.read_csv("/Users/jmm/TEACHING/2122/M2-GLM-HAX912X/TP/creditcard.csv")
print(data.head())

# Class distribution: the data set is highly imbalanced (very few frauds)
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Features / target, then standardise the features
X = data.loc[:, data.columns != 'Class']
y = data['Class']
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# Undersampling: keep all frauds and draw the same number of normal transactions
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data['Class']
# Apply the same standardisation as for the full data set, so that a model
# trained on the undersampled data can also be evaluated on X_test
X_undersample = scaler.transform(X_undersample)

# Train/test split on the full (imbalanced) data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1974)
print("")
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

# Train/test split on the undersampled (balanced) data
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=1981)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample) + len(X_test_undersample))

# Logistic regression trained on the full (imbalanced) training set
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)
y_chap = logit.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
# Recall (sensitivity) on the fraud class: detected frauds / actual frauds
print(table.values[1, 1] / y_test.sum())

# Logistic regression trained on the undersampled (balanced) training set
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train_undersample, y_train_undersample)
y_chap_undersample = logit.predict(X_test_undersample)
table = pd.crosstab(y_test_undersample, y_chap_undersample)
print(table)
print(y_test_undersample.sum())
print(table.values[1, 1] / y_test_undersample.sum())

# Random forest trained on the full (imbalanced) training set
from sklearn.ensemble import RandomForestClassifier
from math import sqrt

forest = RandomForestClassifier(n_estimators=1000, max_features=int(sqrt(30)),
                                oob_score=True, n_jobs=-1)
forest = forest.fit(X_train, y_train)
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
print(table.values[1, 1] / y_test.sum())

# Random forest trained on the undersampled data, evaluated on the full
# (imbalanced) test set
forest = RandomForestClassifier(n_estimators=1000, max_features=int(sqrt(30)),
                                oob_score=True, n_jobs=-1)
forest = forest.fit(X_train_undersample, y_train_undersample)
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
print(table.values[1, 1] / y_test.sum())
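
# Optional cross-check (not part of the original TP): a minimal sketch that
# verifies the hand-computed recall with sklearn.metrics, assuming `y_test` and
# `y_chap` still hold the labels and predictions of the last fitted model.
from sklearn.metrics import confusion_matrix, recall_score, classification_report

print(confusion_matrix(y_test, y_chap))            # same counts as pd.crosstab
print(recall_score(y_test, y_chap))                # recall on the fraud class (label 1)
print(classification_report(y_test, y_chap, digits=3))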