# Homework assignment 1, CROPS545 Statistical Genomics by Ryan Oliveira

# Working directory is machine-specific; guard it so the script does not
# abort with an error on a machine where this path does not exist.
if (dir.exists("~/R/RyanDirectory")) {
  setwd("~/R/RyanDirectory")
}

# Sample skewness and kurtosis (method-of-moments estimators, matching the
# conventions of the 'moments' package: kurtosis is raw, not excess).
# Defined here because the original script called skewness()/kurtosis()
# without loading any package that provides them, which errors in base R.
skewness <- function(x) {
  m <- mean(x)
  mean((x - m)^3) / mean((x - m)^2)^(3 / 2)
}
kurtosis <- function(x) {
  m <- mean(x)
  mean((x - m)^4) / mean((x - m)^2)^2
}

# Part 1: Generating a function derived from the normal distribution
#
# Draws 'df' observer distances from a standard normal and applies the
# inverse square law I = P / (4 * pi * r^2) to get sound intensity, where
# P is the source power. The intensities are averaged over the 'df'
# observers, and this is repeated for n instances (e.g. concerts).
# Returns a numeric vector of length n (one mean intensity per instance).
#
# NOTE(review): the original header says the distances have "a mean of 15
# with variance of 1", but the code draws from N(0, 1) — confirm which is
# intended; the code's behavior is preserved here.
rOliveira <- function(n = 10, df = 3, P = 100) {
  replicate(n, {
    r <- rnorm(df, 0, 1)
    mean(P / (4 * pi * r^2))
  })
}

# Part 2: Exploring properties of the distribution
# Per instructions, run the function at n = 10000, with df = 3 concertgoers
# per instance and P = 100 as in the written example.
x1 <- rOliveira(n = 10000, df = 3, P = 100)

# Mean, variance, standard deviation, skewness, and kurtosis of the sample
m1 <- mean(x1)
v1 <- var(x1)
sd1 <- sd(x1)
sk1 <- skewness(x1)
k1 <- kurtosis(x1)

# Scatter plot, histogram, and kernel density plot of the distribution
plot(x1)
hist(x1)
plot(density(x1))

# 5% one-tailed quantile (outliers below the bulk of the distribution)
quantile(x1, 0.05)

# Part 3: Deriving the F distribution from the manual
#
# Builds two independent chi-squared samples (sums of df1 and df2 squared
# standard normals), then forms F = (X1/df1) / (X2/df2) ~ F(df1, df2).
# Returns a numeric vector of n simulated F variates.
rOliveira2 <- function(n, df1, df2) {
  chi1 <- replicate(n, sum(rnorm(df1, 0, 1)^2))
  chi2 <- replicate(n, sum(rnorm(df2, 0, 1)^2))
  (chi1 / df1) / (chi2 / df2)
}

# Overlay the simulated F density (blue) on R's rf() reference (red)
# to check accuracy — the curves should coincide.
graph1 <- rOliveira2(10000, 5, 7)
graph2 <- rf(10000, 5, 7)
plot(density(graph1), col = "blue")
lines(density(graph2), col = "red")
# It works!
# Part 4 and 5: Testing sample means/variances against the null hypothesis

# --- Theoretical route --------------------------------------------------
# Degrees of freedom of the F distribution under test
df1 <- 8
df2 <- 10

# Theoretical mean and variance of F(df1, df2)
expected.mean <- df2 / (df2 - 2)
expected.variance <- (2 * df2^2 * (df1 + df2 - 2)) /
  (df1 * (df2 - 2)^2 * (df2 - 4))

# One-sample t-test of the sample mean against the theoretical mean, n = 10
n <- 10
s1 <- rOliveira2(n, df1, df2)
m1 <- mean(s1)
sd1 <- sd(s1)
t1 <- (m1 - expected.mean) / (sd1 / sqrt(n))
p1 <- 1 - pt(t1, n - 1)
c(t1, p1)

# Same t-test at n = 100
n <- 100
s2 <- rOliveira2(n, df1, df2)
m2 <- mean(s2)
sd2 <- sd(s2)
t2 <- (m2 - expected.mean) / (sd2 / sqrt(n))
p2 <- 1 - pt(t2, n - 1)
c(t2, p2)

# Same t-test at n = 1000
n <- 1000
s3 <- rOliveira2(n, df1, df2)
m3 <- mean(s3)
sd3 <- sd(s3)
t3 <- (m3 - expected.mean) / (sd3 / sqrt(n))
p3 <- 1 - pt(t3, n - 1)
c(t3, p3)

# Same t-test at n = 100,000
n <- 100000
s4 <- rOliveira2(n, df1, df2)
m4 <- mean(s4)
sd4 <- sd(s4)
t4 <- (m4 - expected.mean) / (sd4 / sqrt(n))
p4 <- 1 - pt(t4, n - 1)
c(t4, p4)

# Chi-squared test of each sample variance against the theoretical
# variance: (n - 1) * s^2 / sigma^2 ~ chi-squared(n - 1) under the null.
# n = 10
var1 <- var(s1)
v1 <- (10 - 1) * var1 / expected.variance
p5 <- 1 - pchisq(v1, 10 - 1)

# n = 100
var2 <- var(s2)
v2 <- (100 - 1) * var2 / expected.variance
p6 <- 1 - pchisq(v2, 100 - 1)

# n = 1000
var3 <- var(s3)
v3 <- (1000 - 1) * var3 / expected.variance
p7 <- 1 - pchisq(v3, 1000 - 1)

# n = 100,000
var4 <- var(s4)
v4 <- (100000 - 1) * var4 / expected.variance
p8 <- 1 - pchisq(v4, 100000 - 1)

# Density plots of the simulated F samples at each n, for reference
plot1 <- s1
plot2 <- s2
plot3 <- s3
plot4 <- s4
plot(density(plot1), col = "blue", ylim = c(0, 1.2))
lines(density(plot2), col = "red")
lines(density(plot3), col = "green")
lines(density(plot4), col = "yellow")

# --- Empirical (brute force) route --------------------------------------
# Distribution of the sample mean at n = 10, replicated 10,000 times
emp.mean10 <- replicate(10000, mean(rOliveira2(10, df1, df2)))
# Empirical p: proportion of sample means exceeding the theoretical mean
emp.mean.p10 <- mean(emp.mean10 > expected.mean)

# Same procedure for n = 100
emp.mean100 <- replicate(10000, mean(rOliveira2(100, df1, df2)))
emp.mean.p100 <- mean(emp.mean100 > expected.mean)

# Same procedure for n = 1000
emp.mean1000 <- replicate(10000, mean(rOliveira2(1000, df1, df2)))
emp.mean.p1000 <- mean(emp.mean1000 > expected.mean)

# Same procedure for n = 100,000
# Commented out: requires a prohibitively high number of operations.
# emp.mean100000 <- replicate(10000, mean(rOliveira2(100000, df1, df2)))
# emp.mean.p100000 <- mean(emp.mean100000 > expected.mean)

# Now the same for the variance
# Distribution of the sample variance at n = 10, replicated 10,000 times
emp.var10 <- replicate(10000, var(rOliveira2(10, df1, df2)))
# Empirical p: proportion of sample variances above the theoretical variance
emp.p10 <- mean(emp.var10 > expected.variance)

# Same procedure for n = 100
emp.var100 <- replicate(10000, var(rOliveira2(100, df1, df2)))
emp.p100 <- mean(emp.var100 > expected.variance)

# Same procedure for n = 1000
emp.var1000 <- replicate(10000, var(rOliveira2(1000, df1, df2)))
emp.p1000 <- mean(emp.var1000 > expected.variance)

# Same procedure for n = 100,000
# Commented out: requires a prohibitively high number of operations.
# emp.var100000 <- replicate(10000, var(rOliveira2(100000, df1, df2)))
# emp.p100000 <- mean(emp.var100000 > expected.variance)