library(rrBLUP)
library(BGLR)

source("http://zzlab.net/GAPIT/GAPIT.library.R")
source("http://zzlab.net/GAPIT/gapit_functions.txt")
source("FUNCTIONS_lab11.R")

# Read in the data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
row.names(myGD) = myGD[,1]
myGD_mat<-myGD[,-1]

myGM <- read.table("http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", header = T, stringsAsFactors = F, sep = "\t")
myY <- read.table("http://zzlab.net/GAPIT/data/mdp_traits.txt", header = T, stringsAsFactors = F, sep = "\t")

###### Data Formatting ######
{
  # Remove monomorphic markers
  maf <- calc_maf_apply(myGD_mat, encoding = c(0, 1, 2))
  mono_indices <- which(maf == 0)
  taxa <- row.names(myGD)
  myGD_mat = myGD_mat[,-mono_indices]
  myGM = myGM[-mono_indices,]
  
  # Subset a single phenotype by taxa
  myY <- myY[,1:2]
  taxa_match <- match(myGD$taxa, myY$Taxa)
  myY <- myY[taxa_match,]
  
  # Split data into training and test sets
  myCV_sets <- make_CV_sets(response = myY$EarHT,
                            predictors = myGD[,-1],
                            k = 5)
}


###### MAS modeling with GWAS (BLINK) ######
# Perform GWAS with BLINK
MAS_results <- list()
for (i in 1:length(myCV_sets))
{
  train = which(myCV_sets[[i]])
  test = which(!myCV_sets[[i]])
  
  # Split the data into training and testing sets
  myGD_train <- cbind(taxa[train], myGD_mat[train,])
  myY_train <- myY[train,]
  myY_test <- myY[test,2]
  
  # Train a BLINK model
  BLINK_results <- GAPIT(Y = myY_train,
                   GD = myGD_train,
                   GM = myGM,
                   model = "BLINK")$GWAS
  
  sig_markers <- which(BLINK_results$P.value <= 0.05/length(BLINK_results$P.value))
  
  MAS_mat <- myGD_mat[train,sig_markers]
  
  GLM_data <- cbind(myY_train[2], MAS_mat)
  names(GLM_data)[1] <- "Y"
  MAS_model <- lm(Y ~ .,  
                  data = GLM_data)
  
  
  # Predicting with the model
  test_MAS_mat <- myGD_mat[test,sig_markers]
  test_Y_pred <- predict(MAS_model, test_MAS_mat)
  
  # Calculate an accuracy
  acc <- cor(test_Y_pred, myY_test, use = "pairwise.complete")
  results <- list(BLINK_results, MAS_model, test_Y_pred, acc)
  names(results) <- c("BLINK_results", "MAS_model", "predictions", "acc")
  
  MAS_results[[i]] <- results
  }


###### GS modeling with training data ######
rrBLUP_results <- list()
for (i in 1:length(myCV_sets))
{
  train = which(myCV_sets[[i]])
  test = which(!myCV_sets[[i]])
  
  myGD_train <- as.matrix(myGD_mat[train,])
  myGD_test <- as.matrix(myGD_mat[test,])
  myY_train <- myY[train,2]
  myY_test <- myY[test,2]
  rrBLUP_model <- mixed.solve(y = myY_train,
                                Z = myGD_train)
  
  pred_effects <- myGD_test %*% rrBLUP_model$u
  predictions <- c(pred_effects) + c(rrBLUP_model$beta)
  acc <- cor(predictions, myY_test, use = "pairwise.complete")
  results <- list(rrBLUP_model, predictions, acc)
  names(results) <- c("rrBLUP_model", "predictions", "acc")
  
  rrBLUP_results[[i]] <- results
}


###### Compare Accuracies ######
BLINK_MAS_acc <- c(NULL)
rrBLUP_acc <- c(NULL)
for (i in 1:length(myCV_sets))
{
  BLINK_MAS_acc[i] <- MAS_results[[i]]$acc
  rrBLUP_acc[i] <- rrBLUP_results[[i]]$acc
}

###### Boxplot of CV-accuracies ######
accuracy <- c(BLINK_MAS_acc, rrBLUP_acc)
method <- c(rep("BLINK_MAS", length(BLINK_MAS_acc)), rep("rrBLUP", length(rrBLUP_acc)))

plot_data <- data.frame(accuracy, method)
cv_acc_plot <- ggplot(plot_data, aes(x = method, y = accuracy)) +
  geom_boxplot() +
  labs(title = "5-fold Cross-Validation accuracy of GS methods: GAPIT Demo data",
       x = "Method",
       y = "Accuracy (r)")