--- title: "Lab4_impute" author: "Zhou" date: "2/8/2021" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ## 1.Stochastic imputation with allele frequency ### 1.1 Data prepare Missing value simulation ```{r} #Import data myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) X.raw=myGD[,-1] #keep as original for comparison X=X.raw # Create a new variable than may be changed later #Set missing values n=nrow(X) m=ncol(X) dp=m*n #total data points uv=runif(dp) hist(uv) mr=.2 #missing rate missing=uv1 X.knn.I[Index1]= 0 X.knn.I[Index2]= 2 #Correlation accuracy.r=cor(X.raw[index.m], (X.knn.I)[index.m]) #Match proportion index.match=X.raw== X.knn.I index.mm=index.match&index.m accuracy.m=length(X[index.mm])/length(X[index.m]) accuracy.r accuracy.m ``` ### 2.2 Individual as features ```{r} #Impute using individuals as features X.knn= impute.knn(as.matrix(t(X)), k=10) #change to integer X.knn.I= X.knn$data Index1= X.knn$data<1 Index2= X.knn$data>1 X.knn.I[Index1]= 0 X.knn.I[Index2]= 2 #Correlation accuracy.r=cor(X.raw[index.m], t(X.knn.I)[index.m]) #Match proportion index.match=X.raw== t(X.knn.I) index.mm=index.match&index.m accuracy.m=length(X[index.mm])/length(X[index.m]) accuracy.r accuracy.m ``` ## 3. BEAGLE ### 3.1 Install the BEAGLE Firstly, install the JRE in computer Secondly, Download the BEAGLE from zzlab website ### 3.2 Run the BEAGLE ```{r} #Convert to BEAGLE input format index0=X==0 index1=X==1 index2=X==2 indexna=is.na(X) X2=X X2[index0]="A\tA" X2[index1]="A\tB" X2[index2]="B\tB" X2[indexna]="?\t?" myGD2=cbind("M",myGD[,1],X2) setwd("/Users/ZhouTang/Documents/WSU_courses/2021_spring/545/2021/Lab/Lab4/lab4_imputation") write.table(myGD2,file="test.bgl",quote=F,sep="\t",col.name=F,row.name=F) #Impute with BEAGLE system("java -Xmx12g -jar /Users/ZhouTang/Documents/WSU_courses/2021_spring/545/2021/Lab/Lab4/lab4_imputation/beagle.jar unphased=test.bgl missing=? out=test1" ) #Convert output format genotype.full <- read.delim("test1.test.bgl.phased.gz",sep=" ",head=T) genotype.c=as.matrix(genotype.full[,-(1:2)]) index.A=genotype.c=="A" index.B=genotype.c=="B" nr=nrow(genotype.c) nc=ncol(genotype.c) genotype.n=matrix(0,nr,nc) genotype.n[index.A]=0 genotype.n[index.B]=1 n2=ncol(genotype.n) odd=seq(1,n2-1,2) even=seq(2,n2,2) g0=genotype.n[,odd] g1=genotype.n[,even] X.bgl=g0+g1 #Impute and calculate correlation accuracy.r=cor(X.raw[index.m], X.bgl[index.m]) index.match=X.raw==X.bgl index.mm=index.match&index.m accuracy.m=length(X[index.mm])/length(X[index.m]) accuracy.r accuracy.m # Result comparison Method=c("Stochastic", "KNN(Ind)", "KNN(MK)", "BEAGLE") Cor=c(.39,.63,.60,.61) Match=c(.67,.77,.75,.75) par(mfrow=c(2,1),mar = c(3,4,1,1)) barplot(Cor, names.arg=Method, ylab="Correlation") barplot(Match, names.arg=Method, ylab="Match") ```