## crop545 - lab 1
## Introduction to R
## Y.Song 1/9/2018
##
## Sections 1-2: using R as a calculator, getting help, and the basic
## object types (values, vectors, matrices, data frames, logicals,
## strings, lists). Run line by line in the console or source the file.

# 000000000000000000000000000000000000000000000 #
# ---- 1. Primer ----                           #
# 000000000000000000000000000000000000000000000 #

# --- 1.1. Use R as a calculator --------------

# arithmetic
1 + 2
56 - 20
2 * 2
10 / 4

# algebra
2^3             # 2 to the 3rd power
sqrt(25)        # square root of 25
sin(pi / 2)
tan(pi / 4)
log(2.7182818)  # ln(2.7182818)
log(10)         # ln(10)
pi
# e             # R has no built-in constant named 'e'; typing it gives
                # "object 'e' not found"
exp(1)          # exponential; exp(1) is how to get Euler's number

# --- 1.2. Getting help --------------

# what does a function do?
?log
# try the example provided by the document
log(exp(3))
log10(1e7)  # = 7

# how to do a specific task?
??mean
# most of the time, Google is your best friend

# 00000000000000000000000000000000000000000000000 #
# --- 2. R Objects - Variables & Constants ----   #
# 00000000000000000000000000000000000000000000000 #

# NOTE: some names used below (c, data, names) are also names of built-in
# functions. R still finds the function when the name is *called*, so the
# examples work, but avoid such names in your own code.

## Values
a <- 2
b <- 3
c <- a + b
c
(d <- a * b)  # wrapping an assignment in parentheses also prints the result

## Vectors
(x <- c(1, 2, 3, 5, 7))  # c() combines single values into a vector
(y <- 6:10)              # a sequence of whole numbers with an interval of 1
(z <- seq(1, 5, 0.5))    # seq() lets you choose the interval

## Matrix
(M <- matrix(c(11, 21, 12, 22), 2, 2))  # filled column by column

## dataframe
# we will use x, y to generate a dataframe
(data <- cbind(x, y))  # this is a matrix
is(data)
(data <- as.data.frame(data))
is(data)

# columns in a dataframe can get names
names(data) <- c("XX", "YY")
data$XX

## logic values
tt <- TRUE
ff <- FALSE
# logic values are very helpful and important,
# but we don't normally use them in this way.
# one useful example of using logic values:
x
x > 3
x[x > 3]  # we will discuss more in the subsampling part

## string values (text)
names <- c("ava", "leah", "emma")

## list
(l.list <- list(data, M, names))

## to check what type of data it is
is(x)
is(data)
is(names)
is(l.list)

# 00000000000000000000000000000 #
# ---- 3. Files----             #
# 00000000000000000000000000000 #

# It is important to know where on the computer (a
# folder on the hard drive, for example) we are
# working with.
getwd()  # print the current working directory

# ----------------- 3.1. Reading (Importing) -------------------

# copy Zhiwu's original code:
# (the code is provided on our course website)
# The path below only exists on Zhiwu's computer, so on any other machine
# this call fails. try() shows the error without stopping the script.
try(
  myGD <- read.table(
    file = "~/Dropbox/Current/ZZLab/WSUCourse/CROPS545/Demo/mdp_numeric.txt",
    header = TRUE
  )
)

# how to make it correct: point to the folder where YOU saved the files.
# Edit data_dir to match your own computer.
data_dir <- "C:/Users/yuanh/Documents/Dropbox/COURSE/CROP_545/2018/R intro"
myGD.1 <- read.table(
  file = file.path(data_dir, "mdp_numeric.txt"),
  header = TRUE
)
myGM.1 <- read.table(
  file = file.path(data_dir, "mdp_SNP_information.txt"),
  header = TRUE
)

# It is tedious to type the long folder path each time.
# We can set a 'working directory' to tell R we are working
# in a specific folder and stick there
setwd(data_dir)
myGD.2 <- read.table(file = "mdp_numeric.txt", header = TRUE)
myGM.2 <- read.table(file = "mdp_SNP_information.txt", header = TRUE)

# we can also let R find the data online
myGD.3 <- read.table(
  file = "http://zzlab.net/GAPIT/data/mdp_numeric.txt",
  header = TRUE
)
myGM.3 <- read.table(
  file = "http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",
  header = TRUE
)

# make a copy of the data under the names used in the rest of the lab
myGD <- myGD.1
myGM <- myGM.1

# ------------------- 3.2. Writing (Exporting) ------------------

# let's save a new copy of the data on the hard drive
# (a relative file name is written into the current working directory)
write.table(myGD, file = "mygd.txt", row.names = TRUE)

# 0000000000000000000000000000000000000000000000000000000 #
# ---- 4. data manipulation ----                          #
# 0000000000000000000000000000000000000000000000000000000 #

# -------------- 4.1. examine the data ------------------

(mx <- matrix(1:27, 9, 3))  # 9 x 3 matrix, filled column by column
(df <- as.data.frame(mx))   # same values as a data frame (columns V1..V3)

# what is the data type?
# (this part uses mx, df, myGD, myGM, x, y and M created in sections 2-4.1)
is(mx)
is(df)
is(myGD)
is(myGM)

# how many rows and columns?
dim(mx)
dim(df)
dim(myGD)
dim(myGM)

# preview
head(mx)     # head() shows the first 6 rows of the data
str(df)      # str() gives a brief description of the structure
if (interactive()) {
  View(df)   # View() opens the data viewer (interactive sessions only)
}
summary(df)  # summary() gives basic descriptive statistics

# when a dataset gets too big, it is not a good idea to preview whole rows;
# let's just render a portion of the data
myGD[1:10, 1:5]  # print first 10 rows and first 5 columns of the data

# -------------- 4.2. subsample ------------------

# SUBSAMPLING WITH LOCATION
mx  # observe the matrix output, notice the [,1], [2,] labels
df  # a data frame is indexed the same way, it just prints names instead
# [1, ] means first row
# [ ,2] means second column
# [i, j] means element at i-th row, j-th column
df[1, ]     # show the first row of df
df[, 2]     # show the second col of df
df[1, 2]    # show the element at first row, second col
df[, -2]    # everything but the second column
df[-1, -3]  # everything but the first row and the third column

x
x[3]        # the third element in x

# SUBSAMPLING WITH VARIABLE
df  # observe the output, notice the "V1, V2, V3"
# we can use those names to select data
df$V1
df$V1 >= 4
df[df$V1 >= 4, ]  # be careful about the comma: rows go before it

# -------------- 4.3. combine and merge ------------------

# create a text string vector for the example
(type <- c("a", "b", "c"))
(type <- rep(type, 3))

# combine df (the 9X3 dataframe) with new elements
(big.df <- cbind(df, type))  # add type as a new col
# add a new row. The row is given as a one-row data frame: c(109, 118, 127,
# "c") would turn every value into text and silently convert the numeric
# columns V1..V3 to character.
(big.df <- rbind(
  big.df,
  data.frame(V1 = 109L, V2 = 118L, V3 = 127L, type = "c")
))

# -------------- 4.4. basic operations ------------------

# vector operation
# we will use the x and y vectors as example.
x
y
x + y    # add each element correspondingly
x * y    # multiply each element correspondingly (element-wise product)
2 * x    # scalar multiplication
mean(x)  # the mean of x
sum(x)   # the sum of all elements

# matrix operation
# we use the matrix M we created before
M
t(M)      # transpose
solve(M)  # inverse

# 0000000000000000000000000000000000000000000000000000000 #
# ---- 5.
#         Functions ----                                  #
# 0000000000000000000000000000000000000000000000000000000 #

# ---------- 5.1. define a function of your own ----------

# define a function called "poi.p" that takes input value
# of lambda and k, and operates the calculation as desired,
# and return the value.
# lambda: the rate, k: the count; the result is
# lambda^k * exp(-lambda) / k!  (the Poisson probability of k)
poi.p <- function(lambda, k) {
  p <- (lambda^k) * exp(-lambda) / factorial(k)
  return(p)
}

poi.p(lambda = 3, k = 2)
poi.p(lambda = 5, k = 2)
poi.p(lambda = 5, k = 10)

# some part of the function is optional. In the following
# example, we calculate the same thing, and the optional
# part has been taken out: a function returns the value of
# its last expression, so return() is not needed here.
poi.p2 <- function(lambda, k) {
  (lambda^k) * exp(-lambda) / factorial(k)
}

poi.p2(3, 2)

# ---------- 5.2. IF condition ----------

x <- c("what", "is", "truth")

# IF: if (criteria) {
#       the expression to be executed if the criteria is met
#     }
if ("Truth" %in% x) {
  print("Truth is found")
}
# nothing happened because no Truth is found
# notice Truth and truth are not the same in R (R is case sensitive)

# IF ELSE: if (criteria) {
#            expression
#          } else {
#            expression
#          }
if ("Truth" %in% x) {
  print("Truth is found")
} else {
  print("Truth is not found")
}

# IF / ELSE IF / ELSE: if (criteria) {
#                        expression
#                      } else if (another criteria) {
#                        expression
#                      } else {
#                        expression
#                      }
if ("Truth" %in% x) {
  print("Truth is found the first time")
} else if ("truth" %in% x) {
  print("truth is found the second time")
} else {
  print("No truth found")
}
# Example found at https://www.tutorialspoint.com/r/r_if_else_statement.htm

# ---------- 5.3. for loops ------------------

# create a vector of the right length first to save a space
# for the later calculation
z <- numeric(5)
for (i in 1:5) {
  z[i] <- i + 5
}
z

# combine if condition and for loop: we will find the
# number of even numbers in the vector x.
# x was overwritten with text in 5.2, so bring back the numeric vector
# from section 2 first (%% does not work on text).
x <- c(1, 2, 3, 5, 7)
x
count <- 0
for (v in x) {
  if (v %% 2 == 0) {  # what does %% do? it gives the remainder of v / 2
    count <- count + 1
  }
}
print(count)
# the same count without a loop: sum(x %% 2 == 0)

# ---------- 5.4. packages

# packages are collections of functions.
# Some packages (base, stats, ...) are attached every time R starts.
# chisq.test() belongs to 'stats', so it works without loading anything:
tab <- matrix(c(12, 5, 7, 9), nrow = 2)  # a small 2 x 2 table of counts
chisq.test(tab)

# Other functions live in packages that must be attached first.
# ginv() (generalized inverse of a matrix) belongs to the MASS package:
exists("ginv")  # FALSE in a fresh session: R cannot find the function yet

# install the package only if it is missing (needed once per computer),
# then attach it (needed once per R session)
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
library(MASS)

# now try the same thing again
exists("ginv")  # TRUE
ginv(tab)