# Crop 545 - lab 7 # An annotated script for Lec#8 - Lec#12 PCA # In this script, section 8.15 meanse the code is provided in lecture 8, slide 15 ## 000000000000000000000000000000000000000000 ## ## Lec 9 Phynometric simulation & G2P ----- ## ## 000000000000000000000000000000000000000000 # 9.1: phenometric distribution ---- # m=20 # number of gene n=100 # number of individual x=runif(m) # x = a uniform r.v. with the length of m gene=matrix(x,n,m,byrow = T) # gene = a matrix with n row and m col # rows are repeats of the x we created above head(gene) # The above create the matrix in 8.6 "assign to individuals" # next we will assign True/False values to each elemant in # gene matrix by 50/50 chance. This is a review of how to # flip a coin with R. Please make sure you understand. galton=matrix(runif(n*m),n,m) # galton is a matrix full of uniform (0,1) r.v., with same # dimension as 'gene' head(galton) galton.binary=galton<.5 head(galton.binary) # the above generate the matrix in 8.6 "Uniform random variable" gene[galton.binary]=0 # at the 'TRUE' location, the origrinal 'gene' value # will be turned into zero head(gene) # till here, certain genes in some individuals are 'expressed', # in some are 'muted'. y=rowSums(gene) # sum up the total gene effect in each individual y[1:6] hist(y, freq=F); lines(density(y)) # wrap up into a function phe.sim = function(m, n){ x=runif(m) gene=matrix(x,n,m,byrow = T) galton=matrix(runif(n*m),n,m) gene[galton<.5]=0 y=rowSums(gene) hist(y, freq=F, breaks=round(n/10), col="skyblue",lty="blank", main="Simulated Phenometric Distribution") lines(density(y), col="red") } phe.sim(m=1000, n=10000) ## 9.2 QTN position plot---- # the hapmap example data is a subset of the dataset downloaded from the Dryland website. hapmapEX=read.table("hapmapEX.txt", header = T) hapmapEX[1:4, 1:6] # col3 = chrommosome # col4 = position # It should be a standard in all Hapmap format plot(hapmapEX[, c(3,4)], col="skyblue", xlab="Chromosome", ylab="Position", cex.lab=0.7) # the following steps randomly select some markers source("hapmap2df.txt") n=nrow(hapmapEX);m=ncol(hapmapEX) NQTN=10 # define a number QTN.position=sample(n,NQTN,replace=F) QTN.position # randomly generate 10(NQTN) numbers from 1 to the number of m SNPQ=as.matrix(hapmapEX[QTN.position,]) # extract the selected QTN head(SNPQ) # note: this example uses hapmap file, in which the rows stands for markers and cols for # individual. This is different from the example given in lecture. ## 9.3 simulate phenotype ---- #Sampling QTN myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) NQTN=10 X=myGD[,-1] n=nrow(X); m=ncol(X) QTN.position=sample(m,NQTN,replace=F) SNPQ=as.matrix(X[,QTN.position]) QTN.position head(SNPQ) # randomly simulate some additive effect. # recall the add effect follows a normal distribution addeffect=rnorm(NQTN,0,1) effect=SNPQ%*%addeffect # %*% stands for matrix multiplication head(effect) # descriptive plot of the effect par(mfrow=c(2,2)) plot(effect, main="Scatter plot of effect") hist(effect) boxplot(effect, main="Boxplot of effect") plot(ecdf(effect), main="Cumulative density of effect") # assume an fixed h-sqaure score, and figure residual variance var(e) h2=.7 effectvar=var(effect) effectvar residualvar=(effectvar-h2*effectvar)/h2 # see the lecture for this equation residualvar residual=rnorm(n,0,sqrt(residualvar)) head(residual) par(mfrow=c(2,2)) plot(residual,main="Scatter plot of residual") hist(residual) boxplot(residual, main="Boxplot of residual") plot(ecdf(residual), main="Cumulative density of residual") y=effect+residual par(mfrow=c(2,2)) plot(y,main="Scatter plot of simulated phenotype") hist(y,main="Histogram of simulated phenotype") boxplot(y,main="Boxplot of simulated phenotype") plot(ecdf(y),main="Cumulative density of simulated phenotype") ## 9.4 Heritability ---- # How much of variation in the phenotypic trait was due to genetic # variation va=var(effect) ve=var(residual) vp=var(y) v=matrix(c(va,ve,vp),1,3) colnames(v)=c("A", "E","P") # assign names to columns par(mfrow=c(1,2)) plot(density(y),ylim=c(0,.3), main="Density of Y, G, and e") lines(density(effect),col="blue") lines(density(residual),col="red") barplot(v,col="gray") # Correlations between y and effects # Plot par(mfrow=c(1,3)) plot(y,effect) plot(y,residual) plot(residual,effect) cor(y,effect) ## 9.5 G2P Function ---- # wrapping the above into the G2P function G2P=function(X,h2,alpha,NQTN,distribution){ n=nrow(X) m=ncol(X) #Sampling QTN QTN.position=sample(m,NQTN,replace=F) SNPQ=as.matrix(X[,QTN.position]) QTN.position #QTN effects if(distribution=="norm") {addeffect=rnorm(NQTN,0,1) }else {addeffect=alpha^(1:NQTN)} #Simulate phenotype effect=SNPQ%*%addeffect effectvar=var(effect) residualvar=(effectvar-h2*effectvar)/h2 residual=rnorm(n,0,sqrt(residualvar)) y=effect+residual return(list(addeffect = addeffect, y=y, add = effect, residual = residual, QTN.position=QTN.position, SNPQ=SNPQ)) } ## 000000000000000000000000000000000000000000 ## ## ---- Lec 10 GWASbyR ----- ## ## 000000000000000000000000000000000000000000 ## 10.1 corr. coef. and t distribution ---- cort=function(n=10000,df=100){ z=replicate(n,{ x=rnorm(df+2) y=rnorm(df+2) r=cor(x,y) t=r/sqrt((1-r^2)/(df)) }) return(z)} x=cort(10000,5) t=rt(100000,5) plot(density(x),col="blue") lines(density(t),col="red") ## 10.2 GWAS ---- myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) source("G2P.r") # In this section, we run the GWAS algorithom twice. The idea and the codes are almost the same. # The first time, we don't do any complicated indexing, focusing on how GWAS works in general. # Second time, we will use index to mark our QTN position, and run the same process. ### 10.2.1 first run, general idea ---- # use G2P function, randomly select 10 QTN and simulate phenotype X=myGD[,-1] mySim=G2P(X,h2=.75, alpha=1, NQTN=10, distribution="norm") # with 75% heritability, and assume normally distributed residual, we ramdomly selected 10 QTN's # and simulated their corresponding phenotype. str(mySim) plot(myGM[,c(2,3)]) lines(myGM[mySim$QTN.position,c(2,3)],type="p",col="red") points(myGM[mySim$QTN.position,c(2,3)],type="p",col="blue",cex = 5) # use a bubble plot to check the locations of the selected 10 QTN # note how we get the location: mySim$QTN.position ### r, t, p # the idea here is that the statistic t=t(r) follows a t-distribution. # use such distribution information, we can calculate the probability # calculate correlation coefficient between y and each col of X r=cor(mySim$y,X) n=nrow(X) t=r/sqrt((1-r^2)/(n-2)) # see lecture slide 10.5 for this equation # t is a t-distributed r.v. # use the t-distribution calculate the p=2*(1-pt(abs(t),n-2)) ### Manhattan plot color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) # defining a color array # note1: all colors have specific names or ID# in R # note2: also there'are packages deal with colors in R. Please google if interested m=ncol(X) plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) # plot the -log(p) # myGM[, 2] is the chromosome column # color.vector[myGM[,2]] markers on same chromosome get same color abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black") # add an vertical line to mark the QTN position ## 10.2.2 second run: GWAS with known marker location ---- # same thing as 10.2, but when simulate the phenotype, we only # use markers on the 1 - 5 chromosome index1to5=myGM[,2]<6 X1to5 = X[,index1to5] set.seed(99164) mySim=G2P(X=X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm") str(mySim) # we only sampled markers plot(myGM[,c(2,3)]) lines(myGM[mySim$QTN.position,c(2,3)],type="p",col="red") points(myGM[mySim$QTN.position,c(2,3)],type="p",col="blue",cex = 5) r=cor(mySim$y,X) n=nrow(X) t=r/sqrt((1-r^2)/(n-2)) p=2*(1-pt(abs(t),n-2)) color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) m=ncol(X) plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=1.5, col = "black") sort(p)[1:5] zeros=p==0 p[zeros]=1e-10 plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=1.5, col = "black") ## 10.4 GWASbyCor function ---- # Similar as G2P, we can wrap up things we did above into a function: GWASbyCor=function(X,y){ n=nrow(X) r=cor(y,X) n=nrow(X) t=r/sqrt((1-r^2)/(n-2)) p=2*(1-pt(abs(t),n-2)) zeros=p==0 p[zeros]=1e-10 return(p) } # X - numerical genotype data # y - phenotype (numerical) ## 10.5 False positive ---- index=order(p) top10=index[1:10] # the marker location of most 'significant' markers detected=intersect(top10,mySim$QTN.position) # an intersection of two marker locations set # top10 is the most significant 10 # mySim$QTN.position is the selected QTN positions for simulation in G2P function falsePositive=setdiff(top10, mySim$QTN.position) # falsePositive are those who are identified as 'significant' but are actually # not contributing to the phenotype top10 mySim$QTN.position detected length(detected) falsePositive plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black") abline(v= falsePositive, lty = 2, lwd=2, col = "red") # same plot as before: manhattan plot with black line on marker location # add red line to mark false positive locations p.obs=p[!index1to5] # the P values for markers on 6 -10 m2=length(p.obs) p.uni=runif(m2,0,1) # null distribution - markers on 6-10 have no extrodinary contribution to the phenotype order.obs=order(p.obs) order.uni=order(p.uni) plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]), xlim = c(0,5), ylim=c(0,5)) # QQ plot - if things are distributed in a similar way # If yes, the curve will roughly lie on a 1-1 line # in our case here, no abline(a = 0, b = 1, col = "red") plot(ecdf(-log10(p.obs))) type1=c(0.01, 0.05, 0.1, 0.2) cutoff=quantile(p.obs,type1,na.rm=T) cutoff plot(type1, cutoff,type="b")