library(rrBLUP) library(BGLR) source("http://zzlab.net/GAPIT/GAPIT.library.R") source("http://zzlab.net/GAPIT/gapit_functions.txt") source("FUNCTIONS_lab11.R") # Read in the data myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) row.names(myGD) = myGD[,1] myGD_mat<-myGD[,-1] myGM <- read.table("http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", header = T, stringsAsFactors = F, sep = "\t") myY <- read.table("http://zzlab.net/GAPIT/data/mdp_traits.txt", header = T, stringsAsFactors = F, sep = "\t") ###### Data Formatting ###### { # Remove monomorphic markers maf <- calc_maf_apply(myGD_mat, encoding = c(0, 1, 2)) mono_indices <- which(maf == 0) taxa <- row.names(myGD) myGD_mat = myGD_mat[,-mono_indices] myGM = myGM[-mono_indices,] # Subset a single phenotype by taxa myY <- myY[,1:2] taxa_match <- match(myGD$taxa, myY$Taxa) myY <- myY[taxa_match,] # Split data into training and test sets myCV_sets <- make_CV_sets(response = myY$EarHT, predictors = myGD[,-1], k = 5) } ###### MAS modeling with GWAS (BLINK) ###### # Perform GWAS with BLINK MAS_results <- list() for (i in 1:length(myCV_sets)) { train = which(myCV_sets[[i]]) test = which(!myCV_sets[[i]]) # Split the data into training and testing sets myGD_train <- cbind(taxa[train], myGD_mat[train,]) myY_train <- myY[train,] myY_test <- myY[test,2] # Train a BLINK model BLINK_results <- GAPIT(Y = myY_train, GD = myGD_train, GM = myGM, model = "BLINK")$GWAS sig_markers <- which(BLINK_results$P.value <= 0.05/length(BLINK_results$P.value)) MAS_mat <- myGD_mat[train,sig_markers] GLM_data <- cbind(myY_train[2], MAS_mat) names(GLM_data)[1] <- "Y" MAS_model <- lm(Y ~ ., data = GLM_data) # Predicting with the model test_MAS_mat <- myGD_mat[test,sig_markers] test_Y_pred <- predict(MAS_model, test_MAS_mat) # Calculate an accuracy acc <- cor(test_Y_pred, myY_test, use = "pairwise.complete") results <- list(BLINK_results, MAS_model, test_Y_pred, acc) names(results) <- c("BLINK_results", "MAS_model", "predictions", "acc") MAS_results[[i]] <- results } ###### GS modeling with training data ###### rrBLUP_results <- list() for (i in 1:length(myCV_sets)) { train = which(myCV_sets[[i]]) test = which(!myCV_sets[[i]]) myGD_train <- as.matrix(myGD_mat[train,]) myGD_test <- as.matrix(myGD_mat[test,]) myY_train <- myY[train,2] myY_test <- myY[test,2] rrBLUP_model <- mixed.solve(y = myY_train, Z = myGD_train) pred_effects <- myGD_test %*% rrBLUP_model$u predictions <- c(pred_effects) + c(rrBLUP_model$beta) acc <- cor(predictions, myY_test, use = "pairwise.complete") results <- list(rrBLUP_model, predictions, acc) names(results) <- c("rrBLUP_model", "predictions", "acc") rrBLUP_results[[i]] <- results } ###### Compare Accuracies ###### BLINK_MAS_acc <- c(NULL) rrBLUP_acc <- c(NULL) for (i in 1:length(myCV_sets)) { BLINK_MAS_acc[i] <- MAS_results[[i]]$acc rrBLUP_acc[i] <- rrBLUP_results[[i]]$acc } ###### Boxplot of CV-accuracies ###### accuracy <- c(BLINK_MAS_acc, rrBLUP_acc) method <- c(rep("BLINK_MAS", length(BLINK_MAS_acc)), rep("rrBLUP", length(rrBLUP_acc))) plot_data <- data.frame(accuracy, method) cv_acc_plot <- ggplot(plot_data, aes(x = method, y = accuracy)) + geom_boxplot() + labs(title = "5-fold Cross-Validation accuracy of GS methods: GAPIT Demo data", x = "Method", y = "Accuracy (r)")