# Set seed for reproducibility
set.seed(345678)

# Initialize parameters
n <- 50                      # Sample size
x1 <- rnorm(n)               # Standard normal predictors
x2 <- rnorm(n)
x3 <- rnorm(n)
u <- rnorm(n, 0, sqrt(2))    # Noise with mean 0 and standard deviation sqrt(2)
d <- 30                      # Number of additional (irrelevant) variables
z <- matrix(runif(n * d), n) # Matrix of uniform random variables

# Response variable defined by a linear relationship plus noise
y <- 1 + 0.5 * x1 + 2 * x2 - 0.2 * x3 + u

# Create a data frame with the response and the predictors
app <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3, z = z)
names(app)[-(1:4)] <- paste0("z", 1:d) # Rename the z variables as z1 to z30

# Scale the predictors (excluding the response variable y)
xapp.scale <- scale(app[, -1])
app.scale <- data.frame(y = y, xapp.scale) # Scaled version of the training data

# Generate the test set
ntest <- 1000 # Size of the test set
x1test <- rnorm(ntest)
x2test <- rnorm(ntest)
x3test <- rnorm(ntest)
utest <- rnorm(ntest, 0, sqrt(2)) # Noise for the test set
ztest <- matrix(runif(ntest * d), ntest)
test <- data.frame(x1 = x1test, x2 = x2test, x3 = x3test, z = ztest)
names(test)[-(1:3)] <- paste0("z", 1:d) # Rename the z variables in the test set

# Define the true y for the test set
ytest <- 1 + 0.5 * x1test + 2 * x2test - 0.2 * x3test + utest

# Scale the test data with the centering and scaling computed on the training data
test.scale <- data.frame(scale(test,
                               center = attr(xapp.scale, "scaled:center"),
                               scale  = attr(xapp.scale, "scaled:scale")))

# Fit a linear model on the original data
model1 <- lm(y ~ ., data = app)
yhat <- predict(model1, test) # Predict on the test set
sqrt(mean((yhat - ytest)^2))  # RMSE for model1

# Fit a linear model on the scaled data
model1.scale <- lm(y ~ ., data = app.scale)
yhat <- predict(model1.scale, test.scale) # Predict on the scaled test set
sqrt(mean((yhat - ytest)^2))              # RMSE for the scaled model

# Fit a linear model with only x1, x2, x3 as predictors (without the z variables)
model2.scale <- lm(y ~ x1 + x2 + x3, data = app.scale)
yhat <- predict(model2.scale, test.scale) # Predict on the scaled test set
sqrt(mean((yhat - ytest)^2))              # RMSE for the reduced model

# Compute "oracle" predictions (knowing the true model)
yoracle <- 1 + 0.5 * test$x1 + 2 * test$x2 - 0.2 * test$x3
sqrt(mean((yoracle - ytest)^2)) # RMSE for the oracle model

# Load the libraries for regularized regression
library(glmnet)
library(plotmo)

# Fit the Lasso path
model.lasso <- glmnet(xapp.scale, y, family = "gaussian", nlambda = 50, alpha = 1)
model.lasso                          # Display the model
plot_glmnet(model.lasso, label = 10) # Plot the coefficient paths

# Predict with the Lasso model (one column of predictions per lambda value)
yhat <- predict(model.lasso, as.matrix(test.scale))
dim(yhat) # Check the dimensions of the predictions

# Load caret for model tuning
library(caret)

# Tune the Lasso penalty by repeated cross-validation
model.lasso.tune <- train(
  xapp.scale, y,
  method = "glmnet",
  metric = "RMSE",
  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 100),
  tuneGrid = data.frame(alpha = 1, lambda = model.lasso$lambda)
)
lambda_opt_lasso <- as.numeric(model.lasso.tune$bestTune$lambda) # Optimal lambda
model.lasso.tune$results[model.lasso.tune$results$lambda == lambda_opt_lasso, ] # Best result

# Coefficients of the model refitted at the optimal lambda
glmnet(xapp.scale, y, family = "gaussian", lambda = lambda_opt_lasso, alpha = 1)$beta
plot_glmnet(model.lasso, label = 5, s = lambda_opt_lasso) # Mark the optimal lambda on the path plot

# Predict with the tuned Lasso model
yhat <- predict(model.lasso.tune, as.matrix(test.scale))
length(yhat)                 # Check the length of the predictions
sqrt(mean((yhat - ytest)^2)) # RMSE for the tuned Lasso model
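# As a cross-check on the caret tuning above, glmnet's built-in cv.glmnet()
# selects lambda by cross-validation directly. This is a minimal optional
# sketch, not part of the original analysis; with a single 5-fold split its
# lambda.min need not match the repeated-CV optimum found by caret.
cv.lasso <- cv.glmnet(xapp.scale, y, family = "gaussian", alpha = 1, nfolds = 5)
cv.lasso$lambda.min # Lambda minimizing the cross-validated error
cv.lasso$lambda.1se # Largest lambda within one standard error of the minimum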
# Fit the Ridge regression path
model.ridge <- glmnet(xapp.scale, y, family = "gaussian", nlambda = 50, alpha = 0)
model.ridge                         # Display the Ridge model
plot_glmnet(model.ridge, label = 5) # Plot the Ridge coefficient paths

# Tune the Ridge penalty by repeated cross-validation
model.ridge.tune <- train(
  xapp.scale, y,
  method = "glmnet",
  metric = "RMSE",
  trControl = trainControl(method = "repeatedcv", number = 5, repeats = 100),
  tuneGrid = data.frame(alpha = 0, lambda = model.ridge$lambda)
)
lambda_opt_ridge <- as.numeric(model.ridge.tune$bestTune$lambda) # Optimal lambda for Ridge
model.ridge.tune$results[model.ridge.tune$results$lambda == lambda_opt_ridge, ] # Best result

# Coefficients of the model refitted at the optimal lambda
glmnet(xapp.scale, y, family = "gaussian", lambda = lambda_opt_ridge, alpha = 0)$beta
plot_glmnet(model.ridge, label = 5, s = lambda_opt_ridge) # Mark the optimal lambda on the path plot

# Predict with the tuned Ridge model
yhat <- predict(model.ridge.tune, test.scale)
length(yhat)                 # Check the length of the predictions
sqrt(mean((yhat - ytest)^2)) # RMSE for the tuned Ridge model
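# Optional wrap-up: a small helper so the RMSE formula is not repeated when
# comparing the models side by side. The helper and the calls below are a
# sketch added for convenience, not part of the original script; predictions
# are recomputed so nothing depends on yhat being overwritten above.
rmse <- function(pred, obs) sqrt(mean((pred - obs)^2))
rmse(predict(model.lasso.tune, as.matrix(test.scale)), ytest) # Tuned Lasso
rmse(predict(model.ridge.tune, test.scale), ytest)            # Tuned Ridge
rmse(yoracle, ytest)                                          # Oracle benchmark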