# %reset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the credit card fraud data set
data = pd.read_csv("/Users/jmm/TEACHING/2122/M2-GLM-HAX912X/TP/creditcard.csv")
print(data.head())

# Class distribution: the data set is highly imbalanced (very few frauds)
count_classes = data['Class'].value_counts().sort_index()
count_classes.plot(kind='bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# Features / target, then standardise the features
X = data.loc[:, data.columns != 'Class']
y = data['Class']
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# Undersampling: keep all frauds and draw the same number of normal transactions
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
normal_indices = data[data.Class == 0].index
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data['Class']
# Apply the same standardisation as for the full data set, so that a model
# trained on the undersampled data can also be evaluated on X_test
X_undersample = scaler.transform(X_undersample)

# Train/test split on the full (imbalanced) data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1974)
print("")
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train) + len(X_test))

# Train/test split on the undersampled (balanced) data
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=1981)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample) + len(X_test_undersample))

# Logistic regression trained on the full (imbalanced) training set
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)
y_chap = logit.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
# Recall (sensitivity) on the fraud class: detected frauds / actual frauds
print(table.values[1, 1] / y_test.sum())

# Logistic regression trained on the undersampled (balanced) training set
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train_undersample, y_train_undersample)
y_chap_undersample = logit.predict(X_test_undersample)
table = pd.crosstab(y_test_undersample, y_chap_undersample)
print(table)
print(y_test_undersample.sum())
print(table.values[1, 1] / y_test_undersample.sum())

# Random forest trained on the full (imbalanced) training set
from sklearn.ensemble import RandomForestClassifier
from math import sqrt

forest = RandomForestClassifier(n_estimators=1000, max_features=int(sqrt(30)),
                                oob_score=True, n_jobs=-1)
forest = forest.fit(X_train, y_train)
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
print(table.values[1, 1] / y_test.sum())

# Random forest trained on the undersampled data, evaluated on the full
# (imbalanced) test set
forest = RandomForestClassifier(n_estimators=1000, max_features=int(sqrt(30)),
                                oob_score=True, n_jobs=-1)
forest = forest.fit(X_train_undersample, y_train_undersample)
y_chap = forest.predict(X_test)
table = pd.crosstab(y_test, y_chap)
print(table)
print(y_test.sum())
print(table.values[1, 1] / y_test.sum())
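
# Optional cross-check (not part of the original TP): a minimal sketch that
# verifies the hand-computed recall with sklearn.metrics, assuming `y_test` and
# `y_chap` still hold the labels and predictions of the last fitted model.
from sklearn.metrics import confusion_matrix, recall_score, classification_report

print(confusion_matrix(y_test, y_chap))            # same counts as pd.crosstab
print(recall_score(y_test, y_chap))                # recall on the fraud class (label 1)
print(classification_report(y_test, y_chap, digits=3))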