library(BGLR) # A function that will filter a genotype matrix based on maf and missingness # Calculates the proportion of missing data for every marker in a genotype matrix (mising data is NA) calc_missrate <- function(gt_mat) { col_func <- function(gt_col) { missrate <- sum(is.na(gt_col)) / length(gt_col) return(missrate) } missrate_vect <- apply(gt_mat, 2, col_func) return(missrate_vect) } # Calculates the minor allele frequency for every marker in a genotype matrix (coded as c(-1,0,1)) calc_maf_apply <- function(gt_mat, encoding = c(-1, 0, 1)) { col_func1 <- function(gt_col) { allele1_ct <- (sum(gt_col == -1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) allele2_ct <- (sum(gt_col == 1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } col_func2 <- function(gt_col) { allele1_ct <- (sum(gt_col == 0, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) allele2_ct <- (sum(gt_col == 2, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } if (all(encoding == c(-1, 0, 1))) { maf_vect <- apply(gt_mat, 2, col_func1) } else if (all(encoding == c(0, 1, 2))) { maf_vect <- apply(gt_mat, 2, col_func2) } else { print('Encoding not recognized, returning NULL') maf_vect <- NULL } return(maf_vect) } # This is a function that will split data into a list of k-folds make_CV_sets <- function(list_length, k = 5) { rand_values <- rnorm(list_length) k_quantiles <- quantile(rand_values, 0:k/k) k_assign <- cut(rand_values, k_quantiles, include.lowest = T, labels = F) cv_list <- list() for (i in 1:k) { fold_assignment <- k_assign != i cv_list[[i]] <- fold_assignment } return(cv_list) } test_all_models_BGLR_cv <- function(genotypes, phenotype, nIter = 10000, burnIn = 2000, folds = 5) { # Make the CV list fold_list <- make_CV_sets(length(phenotype), k = 5) BGLR_acc_results <- list() for (i in 1:length(fold_list)) { fold_indices <- which(fold_list[[i]]) # Split into training and testing data pheno_train <- phenotype pheno_train[-fold_indices] <- NA # Calculate the GS model using BGLR bayesA_ETA<-list(list(X=as.matrix(genotypes),model="BayesA")) bayesA_results <- BGLR(y = pheno_train, ETA = bayesA_ETA, nIter=nIter, burnIn=burnIn) bayesA_predictions <- predict(bayesA_results) bayesA_acc <- cor(phenotype[-fold_indices], bayesA_predictions[-fold_indices]) bayesB_ETA<-list(list(X=as.matrix(genotypes),model="BayesB")) bayesB_results <- BGLR(y = pheno_train, ETA = bayesB_ETA, nIter=nIter, burnIn=burnIn) bayesB_predictions <- predict(bayesB_results) bayesB_acc <- cor(phenotype[-fold_indices], bayesB_predictions[-fold_indices]) bayesC_ETA<-list(list(X=as.matrix(genotypes),model="BayesC")) bayesC_results <- BGLR(y = pheno_train, ETA = bayesC_ETA, nIter=nIter, burnIn=burnIn) bayesC_predictions <- predict(bayesC_results) bayesC_acc <- cor(phenotype[-fold_indices], bayesC_predictions[-fold_indices]) BRR_ETA<-list(list(X=as.matrix(genotypes),model="BRR")) BRR_results <- BGLR(y = pheno_train, ETA = BRR_ETA, nIter=nIter, burnIn=burnIn) BRR_predictions <- predict(BRR_results) BRR_acc <- cor(phenotype[-fold_indices], BRR_predictions[-fold_indices]) BL_ETA<-list(list(X=as.matrix(genotypes),model="BL")) BL_results <- BGLR(y = pheno_train, ETA = BL_ETA, nIter=nIter, burnIn=burnIn) BL_predictions <- predict(BL_results) BL_acc <- cor(phenotype[-fold_indices], BL_predictions[-fold_indices]) BGLR_acc_results[[i]] <- list(bayesA_acc, bayesB_acc, bayesC_acc, BRR_acc, BL_acc) } model_vect <- c("BayesA", "BayesB", "BayesC", "BRR", "BL") BGLR_acc_table <- data.frame(matrix(nrow = 0, ncol = 3)) for (i in 1:length(BGLR_acc_results)) { results_long <- data.frame(rep(i, length(model_vect)), model_vect, unlist(BGLR_acc_results[[i]])) BGLR_acc_table <- rbind(BGLR_acc_table, results_long) } names(BGLR_acc_table) <- c("fold", "model", "r") result_plot <- ggplot(BGLR_acc_table, aes(x = model, y = r)) + geom_boxplot() results <- list(BGLR_acc_table, result_plot) names(results) <- c("BGLR_cv_acc_table", "BGLR_model_cv_boxplot") return(results) }