# Titanic survival classification script.
#
# Reads "train.csv" and "test.csv", derives a `title` feature from `Name`,
# imputes missing Fare / Age / Embarked values, then fits four models in turn
# (logistic regression, CART, random forest, SVM). Every model's predictions
# overwrite "submission.csv". The "submission score" comments are the values
# recorded by the original author for each submission.

library(stringr)
library(plyr)
library(rpart)
library(rpart.plot)
library(party)
library(caret)        # train() / trainControl() used for the CART tuning
library(randomForest)
library(e1071)

# Helpers ----

# Titles that are merged into the two "noble" groups.
female_noble_names <- c("Dona", "Lady", "Mlle", "Mme", "Ms", "the Countess")
male_noble_names <- c(
  "Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Rev", "Sir"
)

# Return, for each name, the text that starts two characters after the first
# comma and ends just before the first period, with the noble titles listed
# above collapsed into "female_noble" / "male_noble".
# `name` may be a character vector or a factor.
extract_title <- function(name) {
  name <- as.character(name)
  title <- str_sub(
    name,
    str_locate(name, ",")[, 1] + 2,
    str_locate(name, "\\.")[, 1] - 1
  )
  title[title %in% female_noble_names] <- "female_noble"
  title[title %in% male_noble_names] <- "male_noble"
  title
}

# Write a two-column (PassengerId, Survived) table as a comma-separated file
# with a header row and no quoting. Returns its input invisibly.
write_submission <- function(submission, file = "submission.csv") {
  write.table(
    submission,
    file = file, sep = ",", quote = FALSE,
    col.names = c("PassengerId", "Survived"), row.names = FALSE
  )
  invisible(submission)
}

# Data loading ----

# stringsAsFactors is set explicitly: the factor comparisons and the
# randomForest / model.matrix calls below need the categorical columns as
# factors, and read.csv() no longer does that by default since R 4.0.0.
test <- read.csv(file = "test.csv", sep = ",", stringsAsFactors = TRUE)
test$Pclass <- ordered(test$Pclass)
# Same level set as train$Embarked so both data sets encode it identically.
test$Embarked <- factor(test$Embarked, levels = c("C", "Q", "S"))

train <- read.csv(file = "train.csv", sep = ",", stringsAsFactors = TRUE)
train$Survived <- factor(train$Survived)
train$Pclass <- ordered(train$Pclass)
# Any value other than C / Q / S (e.g. an empty string) becomes NA here and is
# imputed further down.
train$Embarked <- factor(train$Embarked, levels = c("C", "Q", "S"))

# Title feature ----

train_title <- extract_title(train$Name)
test_title <- extract_title(test$Name)

# Use one shared level set so that factor comparisons between the two data
# sets and predict() on `test` never fail on mismatched levels.
title_levels <- sort(unique(c(train_title, test_title)))
train$title <- factor(train_title, levels = title_levels)
test$title <- factor(test_title, levels = title_levels)

# Drop unused columns by position.
# NOTE(review): presumably PassengerId, Name, Ticket and Cabin for train, and
# Name, Ticket and Cabin for test (PassengerId is needed for the submission) --
# confirm against the CSV headers.
train <- train[, -c(1, 4, 9, 11)]
test <- test[, -c(3, 8, 10)]

summary(train)
summary(test)

# Missing-value imputation ----

# Pool the predictors of both sets (first column of each is dropped) to get
# more stable medians. This is a snapshot taken before any imputation.
data.all <- rbind(train[, -1], test[, -1])

# Fare: median fare of passengers with the same class and sex.
for (j in which(is.na(test$Fare))) {
  same_group <- data.all$Pclass == test$Pclass[j] &
    data.all$Sex == test$Sex[j]
  test$Fare[j] <- median(data.all$Fare[same_group], na.rm = TRUE)
}

# Age: median age of passengers with the same title.
for (j in which(is.na(test$Age))) {
  test$Age[j] <- median(
    data.all$Age[data.all$title == test$title[j]],
    na.rm = TRUE
  )
}
for (j in which(is.na(train$Age))) {
  train$Age[j] <- median(
    data.all$Age[data.all$title == train$title[j]],
    na.rm = TRUE
  )
}

# Embarked: inspect the groups, then assign "C" to the rows with a missing
# port (the original script hard-coded rows 62 and 830).
ddply(data.all, ~Embarked, summarize, median(Age, na.rm = TRUE))
missing_embarked <- which(is.na(train$Embarked))
train$Fare[missing_embarked]
train$Embarked[missing_embarked] <- "C"

# Logistic regression ----

model.logit <- glm(Survived ~ . + Sex * Pclass, data = train,
                   family = "binomial")
anova(model.logit, test = "Chisq")
# Round the predicted probability to obtain a 0 / 1 class.
res.logit <- round(predict(model.logit, test, type = "response"))
kaggle.submit <- cbind(test$PassengerId, res.logit)
write_submission(kaggle.submit)
# submission score 0.77990

# Same model after backward stepwise selection.
model.logit <- step(model.logit, direction = "backward")
anova(model.logit, test = "Chisq")
res.logit <- round(predict(model.logit, test, type = "response"))
kaggle.submit <- cbind(test$PassengerId, res.logit)
write_submission(kaggle.submit)
# submission score 0.77990

# CART ----

model.cart <- rpart(Survived ~ ., data = train)
rpart.plot(model.cart, type = 2, extra = 1)
printcp(model.cart)

# Choose the complexity parameter by repeated 10-fold cross-validation.
# `caret::` is spelled out because `train` is also the name of the data frame.
cv.cart <- caret::train(
  Survived ~ .,
  data = train, method = "rpart", metric = "Accuracy",
  trControl = caret::trainControl(
    method = "repeatedcv", repeats = 50, number = 10
  ),
  tuneGrid = data.frame(cp = seq(0.01, 0.05, length = 30))
)
model.cart <- prune(model.cart, cp = cv.cart$bestTune$cp)
rpart.plot(model.cart)
res.cart <- predict(model.cart, test, type = "class")
# as.numeric() on the factor gives 1 / 2; subtract 1 to get 0 / 1.
kaggle.submit <- cbind(test$PassengerId, as.numeric(res.cart) - 1)
write_submission(kaggle.submit)
# submission score 0.79426

# Random forest ----

model.rf <- randomForest(Survived ~ ., data = train, ntree = 500)
varImpPlot(model.rf)
plot(model.rf)
kaggle.submit <- cbind(
  test$PassengerId,
  as.numeric(predict(model.rf, test)) - 1
)
write_submission(kaggle.submit)
# submission score 0.77990

# SVM ----

# The SVM is fitted on an explicit design matrix (no intercept column).
x_train <- model.matrix(Survived ~ . - 1, data = train)
calibration <- tune.svm(x_train, train$Survived,
                        gamma = seq(0.001, 1, by = 0.01))
model.svm <- svm(x_train, train$Survived,
                 gamma = calibration$best.parameters$gamma)

# Placeholder response so the same formula can build the test design matrix;
# its value is never used as a predictor.
test$Survived <- rep(2, dim(test)[1])
x_test <- model.matrix(Survived ~ . - 1 - PassengerId, data = test)
kaggle.submit <- cbind(
  test$PassengerId,
  as.numeric(predict(model.svm, x_test)) - 1
)
write_submission(kaggle.submit)
# submission score 0.80383