# A function that will filter a genotype matrix based on maf and missingness # Calculates the proportion of missing data for every marker in a genotype matrix (mising data is NA) calc_missrate <- function(gt_mat) { col_func <- function(gt_col) { missrate <- sum(is.na(gt_col)) / length(gt_col) return(missrate) } missrate_vect <- apply(gt_mat, 2, col_func) return(missrate_vect) } # Calculates the minor allele frequency for every marker in a genotype matrix (coded as c(-1,0,1)) calc_maf_apply <- function(gt_mat, encoding = c(-1, 0, 1)) { col_func1 <- function(gt_col) { allele1_ct <- (sum(gt_col == -1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) allele2_ct <- (sum(gt_col == 1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } col_func2 <- function(gt_col) { allele1_ct <- (sum(gt_col == 0, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) allele2_ct <- (sum(gt_col == 2, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } if (all(encoding == c(-1, 0, 1))) { maf_vect <- apply(gt_mat, 2, col_func1) } else if (all(encoding == c(0, 1, 2))) { maf_vect <- apply(gt_mat, 2, col_func2) } else { print('Encoding not recognized, returning NULL') maf_vect <- NULL } return(maf_vect) } # This is a function that will split data into a list of k-folds make_CV_sets <- function(response, predictors, k = 5) { rand_values <- rnorm(length(response)) k_quantiles <- quantile(rand_values, 0:k/k) k_assign <- cut(rand_values, k_quantiles, include.lowest = T, labels = F) cv_list <- list() for (i in 1:k) { fold_assignment <- k_assign != i cv_list[[i]] <- fold_assignment } return(cv_list) }