## crop545 - lab 4 ## Imputation ## Y.Song 1/31/2018 # 000000000000000000000000000000000000000000000 # # ---- 0. Review ---- # # 000000000000000000000000000000000000000000000 # to set the working directory. silver = "/Users/song/Dropbox/COURSE/CROP_545/2018/lab4" black = 'C:/Users/yuanh/Documents/R' usb = 'D:/crop545-4' setwd(usb) # --- 0.1. importing data ----------------- indexEX = read.csv("./indexEX.csv", header = T) head(indexEX) # --- 0.2. Index/subsampling -------------- indexEX[3, ] # the third row indexEX[, 3] # the 3rd col indexEX$U # 3rd col by name indexEX[which(indexEX$set=="C"), ] # notice the "==" indexEX[which(indexEX$order>40 & indexEX$L=="-Inf"), ] # notice the "&" (indx = c(2,3,4)) # creating index numbers for subsampling indexEX[indx,] head(indexEX[, indx]) # head() to print only the 6 rows of the result # --- 0.3. For loop -------------- # this is a silly example, just try to get the idea of how the for-loop works indexEX$read = NA for (i in 5:nrow(indexEX)) { indexEX$read[i]= indexEX$U + 100*i } head(indexEX, 10) # note: # 1. the index can be called as anything j, k, l, ...z, or # don't even have to be a letter # 2. in most cases, i is for 1:N. But it can be any number # e.g. for(i in c(1,3,5,7)) # 3. you can have multiple indices (j and j and k at the same # time). We will discuss that when we got to such senarioes. # 000000000000000000000000000000000000000000000 # # ---- 1. stochastic imputation ---- # # 000000000000000000000000000000000000000000000 # --- 1.1 Import data & data cleaning --------- myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGD[1:10,1:10] X.raw=myGD[,-1] # dropping the individual's name X=X.raw # make an copy of the raw data and working on it X[1:5, 1:5] set.seed(545) # data don't have NA values yet, adding some missing # values for the data # ranRow = floor(runif(30, 1, 281)) # ranCol = floor(runif(30, 1, 3093)) # X[ranRow, ranCol] = NA # --- 1.2 Implete the algorithm -------------------- StochasticImpute = function(X){ n=nrow(X); m=ncol(X) # fn - sum of genotypes for all individuals # na.rm - remove NA values fn=colSums(X, na.rm=T) # count number of non missing individuals fc1=colSums(floor(X/3+1),na.rm=T) # or equivelently fc2 = colSums(!is.na(X)) fc = fc1 # Frequency of allele "2" fa=fn/(2*fc) # m is the number of col for(i in 1:m){ index.a=runif(n)