# A function that will filter a genotype matrix based on maf and missingness
# Calculates the proportion of missing data for every marker in a genotype matrix (mising data is NA)
calc_missrate <- function(gt_mat)
{
  col_func <- function(gt_col)
  {
    missrate <- sum(is.na(gt_col)) / length(gt_col)
    return(missrate)
  }
  
  missrate_vect <- apply(gt_mat, 2, col_func)
  
  return(missrate_vect)
}

# Calculates the minor allele frequency for every marker in a genotype matrix (coded as c(-1,0,1))
calc_maf_apply <- function(gt_mat, encoding = c(-1, 0, 1))
{
  col_func1 <- function(gt_col)
  {
    allele1_ct <- (sum(gt_col == -1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T)
    allele2_ct <- (sum(gt_col == 1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T)
    
    maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2)
  }
  
  col_func2 <- function(gt_col)
  {
    allele1_ct <- (sum(gt_col == 0, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T)
    allele2_ct <- (sum(gt_col == 2, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T)
    
    maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2)
  }
  
  if (all(encoding == c(-1, 0, 1)))
  {
    maf_vect <- apply(gt_mat, 2, col_func1)
  } else if (all(encoding == c(0, 1, 2)))
  {
    maf_vect <- apply(gt_mat, 2, colfunc2)
  } else
  {
    print('Encoding not recognized, returning NULL')
    maf_vect <- NULL
  }
  
  return(maf_vect)
}

# This is a function that will split data into a list of k-folds
make_CV_sets <- function(response, predictors, k = 5)
{
  rand_values <- rnorm(length(response))
  k_quantiles <- quantile(rand_values, 0:k/k)
  k_assign <- cut(rand_values, k_quantiles, include.lowest = T, labels = F)
  
  cv_list <- list()
  for (i in 1:k)
  {
    train_x <- data.frame(predictors[k_assign != i,])
    train_y <- response[k_assign != i]
    
    test_x <- data.frame(predictors[k_assign == i,])
    test_y <- response[k_assign == i]
    
    fold <- list(train_x, train_y, test_x, test_y)
    names(fold) <- c("train_x", "train_y", "test_x", "test_y")
    cv_list[[i]] <- fold
  }
  
  return(cv_list)
}

# This function takes in a set of folds from make_CV_sets(), builds a GLM model using the training set, and calculates model accuracy (r^2) using a testing set
lm_jank <- function(cv_sets)
{
  validation_results <- sapply(cv_sets, function(fold) {
    train_data <- cbind(fold$train_y, fold$train_x)
    names(train_data)[1] <- "y"
    
    # Calculate the model
    train_model <- lm(y ~ ., data = train_data)
    
    # Use the model to predict the test set
    test_predict <- predict(train_model, fold$test_x)
    
    # Correlate test predicted vs observed and square to get r-squared
    accuracy <- cor(fold$test_y, test_predict)^2
    
    return(accuracy)
  })
}