# %reset
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from math import sqrt

# load the digits dataset and display the first eight images with their labels
digits = datasets.load_digits()
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:8]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
plt.show()

X = digits.data
y = digits.target
target_name = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# quick exploratory look at the data
import seaborn as sn

df = pd.DataFrame(X)
# correlation heatmaps for the first and second halves of the 64 pixel features
corrMatrix = df.iloc[:, 0:32].corr()
sn.heatmap(corrMatrix)
plt.show()
corrMatrix = df.iloc[:, 32:64].corr()
sn.heatmap(corrMatrix)
plt.show()

# total intensity of each pixel feature
for i in range(0, 64):
    print('the sum for variable', i, 'is', df.iloc[:, i].sum())

# pixels that are almost always off (column sum <= 20)
for i in range(0, 64):
    if df.iloc[:, i].sum() <= 20:
        print(i)

# drop those near-constant pixel columns
X = np.delete(X, (0, 8, 16, 24, 31, 32, 39, 40, 48, 56), 1)
print(X.shape)
df = pd.DataFrame(X)

# count pairs of remaining features with |correlation| > 0.8 (X now has 54 columns)
compteur = 0
for i in range(0, 53):
    for j in range(i + 1, 54):
        if abs(df.iloc[:, [i, j]].corr().iloc[0, 1]) > 0.8:
            compteur = compteur + 1
            print(i, j)
print(compteur)

# drop two of the highly correlated columns and save the cleaned data
X = np.delete(X, (0, 1), 1)
print(X.shape)
df = pd.DataFrame(X)
df.to_csv('images.csv')

# split the data into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=14)

# cross-validation scheme
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
Valid_croisee = RepeatedKFold(n_splits=10, n_repeats=10)

# linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy test: %.3f %%' % (lda.score(X_test, y_test) * 100.0))
scores = cross_val_score(lda, X_train, y_train, cv=Valid_croisee)
print('Accuracy cross val: %.3f %%' % (scores.mean() * 100.0))

# quadratic discriminant analysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
print('Accuracy test: %.3f %%' % (qda.score(X_test, y_test) * 100.0))
scores = cross_val_score(qda, X_train, y_train, cv=Valid_croisee)
print('Accuracy cross val: %.3f %%' % (scores.mean() * 100.0))

# multilayer perceptron
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(2000, 2000), activation='logistic')
mlp = mlp.fit(X_train, y_train)
print('Accuracy test: %.3f %%' % (mlp.score(X_test, y_test) * 100.0))

# support vector machine with a grid search over kernel and C
from sklearn.svm import SVC
parameters = {'kernel': ('linear', 'rbf'), 'C': range(1, 15)}
svm = GridSearchCV(SVC(), parameters, cv=Valid_croisee, n_jobs=-1)
svm = svm.fit(X_train, y_train)
print(svm.best_params_)
print(svm.score(X_test, y_test))
y_chap = svm.predict(X_test)
print(confusion_matrix(y_chap, y_test))

# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn = knn.fit(X_train, y_train)
print('Accuracy test: %.3f %%' % (knn.score(X_test, y_test) * 100.0))

# grid search over the number of neighbours
parameters = {'n_neighbors': range(1, 15)}
knn = GridSearchCV(KNeighborsClassifier(), parameters, cv=Valid_croisee, n_jobs=-1)
knn = knn.fit(X_train, y_train)
print(knn.best_params_)
print('Accuracy test: %.3f %%' % (knn.score(X_test, y_test) * 100.0))
y_chap = knn.predict(X_test)
print(confusion_matrix(y_chap, y_test))
# decision tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree = tree.fit(X_train, y_train)
print(tree.score(X_test, y_test))
y_chap = tree.predict(X_test)
print(confusion_matrix(y_chap, y_test))

# random forest (X has 52 columns after the two deletion steps, hence sqrt(52) for max_features)
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=50000, max_features=int(sqrt(52)), oob_score=True, n_jobs=-1)
forest = forest.fit(X_train, y_train)
print('Accuracy oob: %.3f %%' % (forest.oob_score_ * 100.0))
print('Accuracy test: %.3f %%' % (forest.score(X_test, y_test) * 100.0))
y_chap = forest.predict(X_test)
print(confusion_matrix(y_chap, y_test))

# grid search over max_features for a smaller forest
parameters = {'max_features': range(1, 20)}
rf = GridSearchCV(RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1), parameters, cv=Valid_croisee, n_jobs=-1)
rf = rf.fit(X_train, y_train)
print(rf.best_params_)
print(rf.score(X_test, y_test))
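
# A minimal follow-up sketch (not part of the original analysis): once every model
# above has been fitted, their test accuracies can be compared side by side.
# It only reuses objects already defined in this script (lda, qda, mlp, svm, knn,
# tree, forest, rf); the dictionary of display names is an arbitrary choice here.
models = {'LDA': lda, 'QDA': qda, 'MLP': mlp, 'SVM (grid search)': svm,
          'kNN (grid search)': knn, 'Decision tree': tree,
          'Random forest': forest, 'Random forest (grid search)': rf}
for name, model in models.items():
    # every object exposes score(), including the fitted GridSearchCV wrappers
    print('%-28s test accuracy: %.3f %%' % (name, model.score(X_test, y_test) * 100.0))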