# CrpSci545: Statistical Genomics
# Homework 1
# Matthew McGowan

# Checking for installation and loading package dependencies
require("ggplot2")
require("reshape2")

# Problem 1
# Create multiple random variables that follow the standard normal distribution and combine them to create a new distribution.
# Name the random variable as your last name.
# Include n = the number of variables.

# My distribution function takes three arguments. In plainspeak, I am taking the absolute values of 'v' standard normal distributions, each with sample size 'n' and mean 'f', and summing them together.
mcgowan <- function(n = 500, v = 2, f = 0) {
  result <- abs(rnorm(n, mean = f, sd = 1))
  # seq_len() runs the loop body v - 1 times and safely skips it when v = 1
  # (1:(v - 1) would incorrectly iterate over c(1, 0) in that case).
  for (i in seq_len(v - 1)) {
    result <- result + abs(rnorm(n, mean = f, sd = 1))
  }
  return(result)
}

# Verifying that this function works correctly.
n <- 10000
check1 <- abs(rnorm(n, mean = 0, sd = 1))
check2 <- abs(rnorm(n, mean = 0, sd = 1))
check_sum <- check1 + check2
function_test <- mcgowan(n, 2, 0)
test_frame <- data.frame(check_sum, function_test)

# Combining these results into long format for plotting.
test_frame <- melt(test_frame)
verify_plot <- ggplot(test_frame, aes(x = value, fill = variable))
verify_plot + geom_density(alpha = 0.25) + labs(title = "Comparing manual check to my function")

# Problem 2
# Sample 10,000 observations from the distribution you defined. Graph their properties and describe the potential application of your distribution in nature.

# Note the argument order: n (sample size) comes first, then v (number of variables).
test <- mcgowan(10000, 5, 0)

# Plotting the distribution of 5 variables sampled 10,000 times.
qplot(test, bins = 50) + theme(panel.grid.minor = element_blank())

# This distribution could have potential application to situations where a polarized input pulse (either positive or negative) is always converted into a positive response (hence the use of the 'abs' function).
# By further including a loop that cumulatively sums multiple successive distributions, the final distribution is the result of a specified number of input pulses.

# Problem 3
# Develop a custom function that creates an F distribution with n, df1, df2 arguments using the normal distribution function built into R.

# This function has the same arguments as the built-in function.
# First I create two chi-squared distributions by summing the squares of df1 or df2 standard normal distributions, each with sample size n.
mcgowan_fdist <- function(n = 10, df1 = 1, df2 = 1) {
  chi1 <- rnorm(n, mean = 0, sd = 1)^2
  # As above, seq_len() keeps the loops correct when df1 or df2 equals 1.
  for (x in seq_len(df1 - 1)) {
    chi1 <- chi1 + rnorm(n, mean = 0, sd = 1)^2
  }
  chi2 <- rnorm(n, mean = 0, sd = 1)^2
  for (x in seq_len(df2 - 1)) {
    chi2 <- chi2 + rnorm(n, mean = 0, sd = 1)^2
  }
  # Then I calculate the final F distribution as the ratio of the two chi-square distributions, each scaled by its degrees of freedom.
  fdist_result <- (chi1 / df1) / (chi2 / df2)
  return(fdist_result)
}

# Verifying my function against the built-in F-distribution function (rf).
check_f <- rf(10000, 10, 15)
test_f <- mcgowan_fdist(10000, 10, 15)
test_frame <- melt(data.frame(check_f, test_f))
verify_plot <- ggplot(test_frame, aes(x = value, fill = variable))
verify_plot + geom_density(alpha = 0.25) + labs(title = "Comparing rf function to my f-distribution function")
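# As an additional check (beyond what the problem asks for), a Kolmogorov-Smirnov
# test can compare my simulated sample against the theoretical F cumulative
# distribution. This is just a sketch: ks.test() comes from the base 'stats'
# package, and a large p-value here is consistent with the two distributions
# matching.
ks.test(test_f, "pf", df1 = 10, df2 = 15)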
# Problem 4
# Sample 10, 100, 1000, and 100,000 F distributed variables using the custom function. Calculate the means and test them against the null hypothesis that the samples have an expected mean of df2/(df2 - 2).

# Creating a vector of the sample sizes I need to take (matching the 10, 100, 1000, and 100,000 stipulated by the problem).
samplist <- c(10, 100, 1000, 100000)

# The problem did not stipulate what the df1 and df2 parameters should be, so I chose df1 = 100 and df2 = 150.
df1 <- 100
df2 <- 150

# Creating a table to store the sample sizes, summary statistics, and test results in.
table_mean_var <- as.data.frame(matrix(ncol = 7, nrow = length(samplist)))
names(table_mean_var) <- c('nsamp', 'mean', 'variance', 't-value', 'p-value1', 'chi-sq', 'p-value2')
table_mean_var[, 1] <- samplist

# Generating my distribution for each sample size, calculating the mean and the variance, performing a one-sample t-test against the expected mean, and storing the values in my table.
for (x in 1:length(samplist)) {
  fdist <- mcgowan_fdist(n = samplist[x], df1, df2)
  varname <- paste('test_', samplist[x], sep = '')
  assign(varname, fdist)
  table_mean_var[x, 2] <- mean(fdist)
  table_mean_var[x, 3] <- var(fdist)
  ttest_results <- t.test(fdist, mu = df2 / (df2 - 2), conf.level = 0.95)
  table_mean_var[x, 4] <- ttest_results$statistic[['t']]
  table_mean_var[x, 5] <- ttest_results$p.value
}

# Problem 5
# Sample 10, 100, 1000, and 100,000 F distributed variables using the R function previously developed.
# This was done already for problem 4.

# Calculate the variance of these samples and test them against the null hypothesis that the samples have an expected variance of
# (2 * df2^2 * (df1 + df2 - 2)) / (df1 * (df2 - 2)^2 * (df2 - 4)).

# Calculating the expected variance.
var_expect <- (2 * df2^2 * (df1 + df2 - 2)) / (df1 * (df2 - 2)^2 * (df2 - 4))

# Creating a function to perform a one-sided test of variance using the equation provided in the lecture.
# Named var_test rather than var.test so it does not mask the built-in stats::var.test.
var_test <- function(dist, expect) {
  n <- length(dist)
  dist_var <- var(dist)
  v <- (n - 1) * dist_var / expect
  return(c(v, 1 - pchisq(v, n - 1)))
}

# Performing the chi-square test on each stored sample and adding the statistic and p-value to the table.
for (x in 1:length(samplist)) {
  z <- var_test(get(paste('test_', samplist[x], sep = '')), var_expect)
  table_mean_var[x, 6] <- z[1]
  table_mean_var[x, 7] <- z[2]
}
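# Optional sketch (not part of the assignment): a quick calibration check of the
# one-sided variance test. The (n - 1) * s^2 / sigma^2 ~ chi-square(n - 1) result
# is exact for normally distributed data, so simulating normal samples with a
# known variance should give roughly uniform p-values and a rejection rate near
# alpha = 0.05. The replicate count (1000) and sample size (100) are arbitrary
# choices made here for illustration.
pvals <- replicate(1000, var_test(rnorm(100, mean = 0, sd = 2), expect = 4)[2])
mean(pvals < 0.05)  # should land close to 0.05 if the test is well calibrated
qplot(pvals, bins = 20) + labs(title = "Null p-values from var_test on normal data")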