######################################################################### ######### HW3 ###################### # set working directory setwd("C:/Users/stephanie.sjoberg/Dropbox/2018 Spring Classes/Stat Gen/HW3") set.seed(99164) ##### Question 1 ###### # load genotype data myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) # load genetic map myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) #Sampling QTN NQTN=10 X=myGD[,-1] n=nrow(X); m=ncol(X) QTN.position=sample(m,NQTN,replace=F) SNPQ=as.matrix(X[,QTN.position]) QTN.position head(SNPQ) # plot QTN plot(myGM[,c(2,3)]) lines(myGM[QTN.position,c(2,3)],type="p",col="red") points(myGM[QTN.position,c(2,3)],type="p",col="blue",cex = 5) dev.copy(png,'QTNpositions.png') dev.off() # randomly simulate some additive effect. # recall the add effect follows a normal distribution addeffect=rnorm(NQTN,0,1) effect=SNPQ%*%addeffect # %*% stands for matrix multiplication head(effect) # descriptive plot of the effect par(mfrow=c(2,2)) plot(effect, main="Scatter plot of effect") hist(effect) boxplot(effect, main="Boxplot of effect") plot(ecdf(effect), main="Cumulative density of effect") dev.copy(png,'DistGenEffect.png') dev.off() # assume an fixed h-sqaure score, and figure residual variance var(e) h2=.75 effectvar=var(effect) effectvar residualvar=(effectvar-h2*effectvar)/h2 # see the lecture for this equation residualvar residual=rnorm(n,0,sqrt(residualvar)) head(residual) par(mfrow=c(2,2)) plot(residual,main="Scatter plot of residual") hist(residual) boxplot(residual, main="Boxplot of residual") plot(ecdf(residual), main="Cumulative density of residual") dev.copy(png,'DistResidual.png') dev.off() y=effect+residual par(mfrow=c(2,2)) plot(y,main="Scatter plot of simulated phenotype") hist(y,main="Histogram of simulated phenotype") boxplot(y,main="Boxplot of simulated phenotype") plot(ecdf(y),main="Cumulative density of simulated phenotype") dev.copy(png,'Phenotype.png') dev.off() # How much of variation in the phenotypic trait was due to genetic # variation va=var(effect) ve=var(residual) vp=var(y) v=matrix(c(va,ve,vp),1,3) colnames(v)=c("A", "E","P") # assign names to columns par(mfrow=c(1,1)) plot(density(y),ylim=c(0,.25), main="Density of Y, G, and e") lines(density(effect),col="blue") lines(density(residual),col="red") polygon(density(effect), col="forestgreen") polygon(density(residual), col="grey") polygon(density(y), col="deepskyblue") legend("topright", legend = c("Phenotype","Genetic effect","Residual"),fill=c("deepskyblue","forestgreen","grey"),bty="n" ) dev.copy(png,'Variances1.png') dev.off() barplot(v,col="gray") dev.copy(png,'Variances2.png') dev.off() # Correlations between y and effects # Plot par(mfrow=c(1,3)) plot(y,effect) plot(y,residual) plot(residual,effect) cor(y,effect) dev.copy(png,'CorXY.png') dev.off() ## Question 2 ## ### 10.2.1 first run, general idea ---- # use G2P function, randomly select 10 QTN and simulate phenotype G2P=function(X,h2,alpha,NQTN,distribution){ n=nrow(X) m=ncol(X) #Sampling QTN QTN.position=sample(m,NQTN,replace=F) SNPQ=as.matrix(X[,QTN.position]) QTN.position #QTN effects if(distribution=="norm") {addeffect=rnorm(NQTN,0,1) }else {addeffect=alpha^(1:NQTN)} #Simulate phenotype effect=SNPQ%*%addeffect effectvar=var(effect) residualvar=(effectvar-h2*effectvar)/h2 residual=rnorm(n,0,sqrt(residualvar)) y=effect+residual return(list(addeffect = addeffect, y=y, add = effect, residual = residual, QTN.position=QTN.position, SNPQ=SNPQ)) } X=myGD[,-1] mySim=G2P(X,h2=.75, alpha=1, NQTN=10, distribution="norm") # with 75% heritability, and assume normally distributed residual, we ramdomly selected 10 QTN's # and simulated their corresponding phenotype. str(mySim) ### r, t, p # the idea here is that the statistic t=t(r) follows a t-distribution. # use such distribution information, we can calculate the probability # calculate correlation coefficient between y and each col of X r=cor(mySim$y,X) n=nrow(X) t=r/sqrt((1-r^2)/(n-2)) # see lecture slide 10.5 for this equation # t is a t-distributed r.v. # use the t-distribution calculate the p=2*(1-pt(abs(t),n-2)) ### Manhattan plot color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) # defining a color array # note1: all colors have specific names or ID# in R # note2: also there'are packages deal with colors in R. Please google if interested m=ncol(X) par(mfrow=c(1,1)) plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) # plot the -log(p) # myGM[, 2] is the chromosome column # color.vector[myGM[,2]] markers on same chromosome get same color abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black") # add an vertical line to mark the QTN position dev.copy(png,'ManhattanPlot.png') dev.off() ## Question 3 ## # top 10 SNPs index=order(p) top10=index[1:10] detected=intersect(top10,mySim$QTN.position) falsePositive=setdiff(top10, mySim$QTN.position) top10 mySim$QTN.position detected length(detected) falsePositive plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black") abline(v= falsePositive, lty = 2, lwd=2, col = "red") dev.copy(png,'Top10SNP.png') dev.off() ## Question 4 ## #sort the p values with increasing order output_1to7= sort(p[mySim$QTN.position],decreasing = F)[1:7] #the seventh significant p value seventh_p=output_1to7[7] seventh_p # 0.06087149 # count the number of QTN whose p value is less than the seventh_p length(p[p