# Homework assignment 1, CROPS545 Statistical Genomics by Ryan Oliveira

# Working directory is machine-specific; guard it so the script does not
# abort with an error on a machine where this path does not exist.
if (dir.exists("~/R/RyanDirectory")) {
  setwd("~/R/RyanDirectory")
}

# Sample skewness and kurtosis (method-of-moments estimators, matching the
# conventions of the 'moments' package: kurtosis is raw, not excess).
# Defined here because the original script called skewness()/kurtosis()
# without loading any package that provides them, which errors in base R.
skewness <- function(x) {
  m <- mean(x)
  mean((x - m)^3) / mean((x - m)^2)^(3 / 2)
}
kurtosis <- function(x) {
  m <- mean(x)
  mean((x - m)^4) / mean((x - m)^2)^2
}

# Part 1: Generating a function derived from the normal distribution
#
# Draws 'df' observer distances from a standard normal and applies the
# inverse square law I = P / (4 * pi * r^2) to get sound intensity, where
# P is the source power. The intensities are averaged over the 'df'
# observers, and this is repeated for n instances (e.g. concerts).
# Returns a numeric vector of length n (one mean intensity per instance).
#
# NOTE(review): the original header says the distances have "a mean of 15
# with variance of 1", but the code draws from N(0, 1) — confirm which is
# intended; the code's behavior is preserved here.
rOliveira <- function(n = 10, df = 3, P = 100) {
  replicate(n, {
    r <- rnorm(df, 0, 1)
    mean(P / (4 * pi * r^2))
  })
}

# Part 2: Exploring properties of the distribution
# Per instructions, run the function at n = 10000, with df = 3 concertgoers
# per instance and P = 100 as in the written example.
x1 <- rOliveira(n = 10000, df = 3, P = 100)

# Mean, variance, standard deviation, skewness, and kurtosis of the sample
m1 <- mean(x1)
v1 <- var(x1)
sd1 <- sd(x1)
sk1 <- skewness(x1)
k1 <- kurtosis(x1)

# Scatter plot, histogram, and kernel density plot of the distribution
plot(x1)
hist(x1)
plot(density(x1))

# 5% one-tailed quantile (outliers below the bulk of the distribution)
quantile(x1, 0.05)

# Part 3: Deriving the F distribution from the manual
#
# Builds two independent chi-squared samples (sums of df1 and df2 squared
# standard normals), then forms F = (X1/df1) / (X2/df2) ~ F(df1, df2).
# Returns a numeric vector of n simulated F variates.
rOliveira2 <- function(n, df1, df2) {
  chi1 <- replicate(n, sum(rnorm(df1, 0, 1)^2))
  chi2 <- replicate(n, sum(rnorm(df2, 0, 1)^2))
  (chi1 / df1) / (chi2 / df2)
}

# Overlay the simulated F density (blue) on R's rf() reference (red)
# to check accuracy — the curves should coincide.
graph1 <- rOliveira2(10000, 5, 7)
graph2 <- rf(10000, 5, 7)
plot(density(graph1), col = "blue")
lines(density(graph2), col = "red")
# It works!
# Part 4 and 5: Testing sample means/variances against the null hypothesis

# --- Theoretical route --------------------------------------------------
# Degrees of freedom of the F distribution under test
df1 <- 8
df2 <- 10

# Theoretical mean and variance of F(df1, df2)
expected.mean <- df2 / (df2 - 2)
expected.variance <- (2 * df2^2 * (df1 + df2 - 2)) /
  (df1 * (df2 - 2)^2 * (df2 - 4))

# One-sample t-test of the sample mean against the theoretical mean, n = 10
n <- 10
s1 <- rOliveira2(n, df1, df2)
m1 <- mean(s1)
sd1 <- sd(s1)
t1 <- (m1 - expected.mean) / (sd1 / sqrt(n))
p1 <- 1 - pt(t1, n - 1)
c(t1, p1)

# Same t-test at n = 100
n <- 100
s2 <- rOliveira2(n, df1, df2)
m2 <- mean(s2)
sd2 <- sd(s2)
t2 <- (m2 - expected.mean) / (sd2 / sqrt(n))
p2 <- 1 - pt(t2, n - 1)
c(t2, p2)

# Same t-test at n = 1000
n <- 1000
s3 <- rOliveira2(n, df1, df2)
m3 <- mean(s3)
sd3 <- sd(s3)
t3 <- (m3 - expected.mean) / (sd3 / sqrt(n))
p3 <- 1 - pt(t3, n - 1)
c(t3, p3)

# Same t-test at n = 100,000
n <- 100000
s4 <- rOliveira2(n, df1, df2)
m4 <- mean(s4)
sd4 <- sd(s4)
t4 <- (m4 - expected.mean) / (sd4 / sqrt(n))
p4 <- 1 - pt(t4, n - 1)
c(t4, p4)

# Chi-squared test of each sample variance against the theoretical
# variance: (n - 1) * s^2 / sigma^2 ~ chi-squared(n - 1) under the null.
# n = 10
var1 <- var(s1)
v1 <- (10 - 1) * var1 / expected.variance
p5 <- 1 - pchisq(v1, 10 - 1)

# n = 100
var2 <- var(s2)
v2 <- (100 - 1) * var2 / expected.variance
p6 <- 1 - pchisq(v2, 100 - 1)

# n = 1000
var3 <- var(s3)
v3 <- (1000 - 1) * var3 / expected.variance
p7 <- 1 - pchisq(v3, 1000 - 1)

# n = 100,000
var4 <- var(s4)
v4 <- (100000 - 1) * var4 / expected.variance
p8 <- 1 - pchisq(v4, 100000 - 1)

# Density plots of the simulated F samples at each n, for reference
plot1 <- s1
plot2 <- s2
plot3 <- s3
plot4 <- s4
plot(density(plot1), col = "blue", ylim = c(0, 1.2))
lines(density(plot2), col = "red")
lines(density(plot3), col = "green")
lines(density(plot4), col = "yellow")

# --- Empirical (brute force) route --------------------------------------
# Distribution of the sample mean at n = 10, replicated 10,000 times
emp.mean10 <- replicate(10000, mean(rOliveira2(10, df1, df2)))
# Empirical p: proportion of sample means exceeding the theoretical mean
emp.mean.p10 <- mean(emp.mean10 > expected.mean)

# Same procedure for n = 100
emp.mean100 <- replicate(10000, mean(rOliveira2(100, df1, df2)))
emp.mean.p100 <- mean(emp.mean100 > expected.mean)

# Same procedure for n = 1000
emp.mean1000 <- replicate(10000, mean(rOliveira2(1000, df1, df2)))
emp.mean.p1000 <- mean(emp.mean1000 > expected.mean)

# Same procedure for n = 100,000
# Commented out: requires a prohibitively high number of operations.
# emp.mean100000 <- replicate(10000, mean(rOliveira2(100000, df1, df2)))
# emp.mean.p100000 <- mean(emp.mean100000 > expected.mean)

# Now the same for the variance
# Distribution of the sample variance at n = 10, replicated 10,000 times
emp.var10 <- replicate(10000, var(rOliveira2(10, df1, df2)))
# Empirical p: proportion of sample variances above the theoretical variance
emp.p10 <- mean(emp.var10 > expected.variance)

# Same procedure for n = 100
emp.var100 <- replicate(10000, var(rOliveira2(100, df1, df2)))
emp.p100 <- mean(emp.var100 > expected.variance)

# Same procedure for n = 1000
emp.var1000 <- replicate(10000, var(rOliveira2(1000, df1, df2)))
emp.p1000 <- mean(emp.var1000 > expected.variance)

# Same procedure for n = 100,000
# Commented out: requires a prohibitively high number of operations.
# emp.var100000 <- replicate(10000, var(rOliveira2(100000, df1, df2)))
# emp.p100000 <- mean(emp.var100000 > expected.variance)