# BIOS601 Tutorial - 2009-09-21
#
# Author: Ben Rich (benjamin.rich@mail.mcgill.ca)
#
# -----------------------------------------------
# R is "... a language and environment..."
# R website: www.r-project.org
# R is free!
# -----------------------------------------------
#
# Notes:
#
# R blurs the distinction between 'user' and 'programmer'.
#
# R contains features you would expect to find in:
#   * A procedural language (assignment, control flow, ...)
#   * A functional language (functions as first-class citizens, apply, ...)
#   * An object-oriented language (polymorphism, inheritance, ...)
#   * A linear algebra system (matrix operations)
#
# R could be used as a general-purpose programming language, but for that
# purpose it is (a) the opposite of elegant and (b) slow.  These are traded
# off for practicality: R trades speed (one type of efficiency) for
# convenience (another type of efficiency).
#
# R has a large number of features and behaves in a way that is not always
# intuitive.  Since R is used interactively, one can (and should) check
# what is happening at every step.
#
# The best way to learn will be to jump in and get our hands dirty.

# One of the most important things to know is how to get documentation.
help(rnorm)
?rnorm
help.search("normal distribution")
?Normal

# Some basics...
# --------------

# Assigning to variables (can also use '=')
x <- 5

# Unassigned values are 'print'ed
print(x)
x
5

# Primitive data types
3.4      # numeric
"hello"  # character
TRUE     # logical
1+1i     # complex

# Note that these are actually vectors (of length 1).
# Longer vectors can be created by...
c(3, 8, 12)
rep(5, 8)
seq(0, 1, 0.1)
1:10  # useful shorthand

# c() is for 'concatenation'
x <- 1:5
y <- c(3, x)
c(x, y, x)
c("hello", "world")  # works on all types

# Basis of more complex data structures is the 'list',
# which can be heterogeneous.
# A list can hold components of different types, including other lists.
example <- list(x=c(2,3), y="hello", z=list(a=TRUE, b=5))
example

# List elements are accessed with '$' or [[...]]
example$x
example$z$b
example[["x"]]

# Intra-class correlation (ICC)
# -----------------------------
# We will look at the heights data we collected in class.

# Read in heights data
# Notes:
#   1. .csv means "comma separated values"; can be created by e.g. Excel
#   2. read.csv is a function.  It returns a "data.frame"
#   3. the arrow (<-) means assignment.  Equals (=) can also be used
heights <- read.csv("heights.csv", row.names=1)
class(heights)

# Some checks I usually do
dim(heights)
names(heights)
sapply(heights, class)
sapply(heights, function(x) sum(is.na(x)))
heights

# Let's add the mean of each row as a new column.
# rowMeans() is the idiomatic (and faster) equivalent of
# apply(heights[,2:10], 1, mean, na.rm=TRUE); apply() would first coerce
# the data.frame to a matrix, which is unnecessary here.
heights$mean <- rowMeans(heights[,2:10], na.rm=TRUE)
heights

# Repeated measures data can be in "long" or "wide" format.
# In long format, all heights are in a single column.
# Which height is in a row is indicated by "factors".
# Sometimes long format can be useful, e.g. for the plotting I am about to do.
heights.long <- reshape(heights,
    varying=2:10,      # which columns contain repeated measures
    v.names="estim",   # what should the new column be called
    timevar="rater",   # what should the new column be called
    times=1:9,         # what are the values to put here
    idvar="name",      # what should the new column be called
    ids=factor(rownames(heights)),  # what are the values to put here
    direction="long")

# For fun, let's sort the data.frame by the column 'name'
heights.long <- heights.long[order(heights.long$name),]
dim(heights.long)
heights.long

# This loads the lattice package for "trellis" graphics,
# an alternative set of plotting functions with powerful
# features and a consistent interface.
# library() (rather than require()) is preferred for loading: it stops with
# an error if the package is missing instead of silently returning FALSE.
library(lattice)

dotplot(name ~ GS + mean + estim, data=heights.long,
    auto.key=list(text=c("Gold Standard", "Mean", "Estimate")),
    par.settings=list(superpose.symbol=list(
        cex=c(1.2,1.0,1.0),
        pch=c(23,22,21),
        col=c("tomato", "chartreuse4", "darkcyan"),
        fill=c("tomato", "chartreuse4", "transparent"))))

# There are two ICC functions in external packages,
# but neither can account for missing values.
library(psy)
icc(heights[,2:10])
detach(package:psy)

library(irr)
icc(heights[,2:10])
detach(package:irr)

# We will do our own version of ICC for unbalanced data,
# making use of the following result:
#
# Handbook of tables for probability and statistics, 2nd ed., page 28
# -------------------------------------------------------------------
# Analysis of variance for the one-way classification
#
#                     df     ms       E[ms]
#   between groups    k-1    s_1^2    *
#   within groups     n.-k   s_2^2    \sigma^2
#   total             n.-1
#
#   * = \sigma^2 + \frac{1}{k-1} \left(
#         n. - \frac{\sum n_i^2}{n.} \right) \sigma^2_\alpha
#
#   n. = \sum n_i

hmat <- as.matrix(heights[,2:10])   # coerce to a matrix
k <- nrow(hmat)                     # k = number of subjects
n.i <- rowSums(!is.na(hmat))        # num of (non-missing) values/subject
mu.i <- rowMeans(hmat, na.rm=TRUE)  # subject-specific means

# Grand mean of all observations (the n_i-weighted mean of the subject
# means).  NOTE(review): the original used the unweighted mean(mu.i); the
# E[ms] result quoted above assumes the between-group sum of squares is
# taken about the overall mean, so the weighted form is required when the
# data are unbalanced (the two coincide for balanced data).
grand.mean <- sum(n.i * mu.i) / sum(n.i)

ss.between <- sum(n.i * (mu.i - grand.mean)^2)
df.between <- k-1
ms.between <- ss.between / df.between

# sweep() subtracts each subject's mean from that subject's row;
# check ?sweep for details.
ss.within <- sum((sweep(hmat, 1, mu.i))^2, na.rm=TRUE)
df.within <- sum(n.i) - k
ms.within <- ss.within / df.within

# Method-of-moments estimates of the variance components, obtained by
# equating the observed mean squares to the E[ms] column above.
sigma2.within <- ms.within
sigma2.between <- (ms.between - sigma2.within) * (k-1) /
    (sum(n.i) - (sum(n.i^2) / sum(n.i)))

sigma2.between / (sigma2.between + sigma2.within)  # ICC

# We can write our own function.
# Advantages:
#   1. Reusable
#   2. Code is more structured
#   3.
# Less clutter in the top-level namespace (try ls() to see)

# Intra-class correlation (ICC) from a one-way random-effects ANOVA,
# using method-of-moments variance-component estimates.
#
#   xmat : numeric matrix, one row per subject, one column per repeated
#          measurement; NAs are allowed (unbalanced data).
# Returns sigma2.between / (sigma2.between + sigma2.within).
myicc <- function(xmat) {
    k <- nrow(xmat)                     # k = number of subjects
    n.i <- rowSums(!is.na(xmat))        # non-missing values per subject
    mu.i <- rowMeans(xmat, na.rm=TRUE)  # subject-specific means

    # Grand mean of all observations (n_i-weighted mean of the subject
    # means).  NOTE(review): the original used the unweighted mean(mu.i),
    # which is only correct for balanced data; the E[ms] identity this
    # estimator relies on requires the overall mean.
    grand.mean <- sum(n.i * mu.i) / sum(n.i)

    ss.between <- sum(n.i * (mu.i - grand.mean)^2)
    df.between <- k-1
    ms.between <- ss.between / df.between

    # sweep() subtracts each subject's mean from its row.
    ss.within <- sum((sweep(xmat, 1, mu.i))^2, na.rm=TRUE)
    df.within <- sum(n.i) - k
    ms.within <- ss.within / df.within

    # Method-of-moments variance components (see the E[ms] table above).
    sigma2.within <- ms.within
    sigma2.between <- (ms.between - sigma2.within) * (k-1) /
        (sum(n.i) - (sum(n.i^2) / sum(n.i)))

    sigma2.between / (sigma2.between + sigma2.within)
}

myicc(hmat)

# We want to make sure our function works.  How about simulating some data?

# Simulate a k-by-n matrix from the one-way random-effects model:
# row means mu.i ~ N(mu, sigma2.between), entries ~ N(mu.i, sigma2.within).
sim.icc.data <- function(k, n, sigma2.between=1, sigma2.within=1, mu=0) {
    mu.i <- rnorm(k, mu, sqrt(sigma2.between))
    # Caution: vectors that are too short get 'recycled'.  Because the
    # matrix is filled column-wise (byrow=FALSE), the recycled mu.i lines
    # up with the rows, so row i has mean mu.i[i] in every column.
    xmat <- matrix(rnorm(k*n, mu.i, sqrt(sigma2.within)), k, n, byrow=FALSE)
    xmat
}

# This data should have an ICC of 0.9
sim.mat <- sim.icc.data(6, 9, sigma2.between=9, sigma2.within=1)
myicc(sim.mat)

# Another way to compute ICC for heights data.
# ICC is based on a random-effects model.
# We can use a linear mixed-effects model (lme) to fit this.
library(nlme)
lme.fit <- lme(estim ~ 1, data=na.omit(heights.long), random=~1 | name)
lme.fit