# BIOS601 Tutorial - 2009-09-21
#
# Author: Ben Rich (benjamin.rich@mail.mcgill.ca)
#
# -----------------------------------------------
# R is "... a language and environment..."
# R website: www.r-project.org
# R is free!
# -----------------------------------------------
#
# Notes:
#
# R blurs the distinction between 'user' and 'programmer'.
#
# R contains features you would expect to find in:
#   * A procedural language (assignment, control flow, ...)
#   * A functional language (functions as first-class citizens, apply, ...)
#   * An object-oriented language (polymorphism, inheritance, ...)
#   * A linear algebra system (matrix operations)
#
# R could be used as a general-purpose programming language, but for that
# purpose it is (a) the opposite of elegant and (b) slow.  These are traded
# off for practicality: R trades speed (one type of efficiency) for
# convenience (another type of efficiency).
#
# R has a large number of features and behaves in a way that is not always
# intuitive.  Since R is used interactively, one can (and should) check
# what is happening at every step.
#
# The best way to learn will be to jump in and get our hands dirty.

# One of the most important things to know is how to get documentation.
help(rnorm)
?rnorm
help.search("normal distribution")
?Normal

# Some basics...
# --------------

# Assigning to variables (can also use '=')
x <- 5

# Unassigned values are 'print'ed
print(x)
x
5

# Primitive data types
3.4      # numeric
"hello"  # character
TRUE     # logical
1+1i     # complex

# Note that these are actually vectors (of length 1).
# Longer vectors can be created by...
c(3, 8, 12)
rep(5, 8)
seq(0, 1, 0.1)
1:10  # useful shorthand

# c() is for 'concatenation'
x <- 1:5
y <- c(3, x)
c(x, y, x)
c("hello", "world")  # works on all types

# Basis of more complex data structures is the 'list',
# which can be heterogeneous.
# A list can hold components of different types, including other lists.
example <- list(x=c(2,3), y="hello", z=list(a=TRUE, b=5))
example

# List elements are accessed with '$' or [[...]]
example$x
example$z$b
example[["x"]]

# Intra-class correlation (ICC)
# -----------------------------
# We will look at the heights data we collected in class.

# Read in heights data
# Notes:
#   1. .csv means "comma separated values"; can be created by e.g. Excel
#   2. read.csv is a function.  It returns a "data.frame"
#   3. the arrow (<-) means assignment.  Equals (=) can also be used
heights <- read.csv("heights.csv", row.names=1)
class(heights)

# Some checks I usually do
dim(heights)
names(heights)
sapply(heights, class)
sapply(heights, function(x) sum(is.na(x)))
heights

# Let's add the mean of each row as a new column.
# rowMeans() is the idiomatic (and faster) equivalent of
# apply(heights[,2:10], 1, mean, na.rm=TRUE); apply() would first coerce
# the data.frame to a matrix, which is unnecessary here.
heights$mean <- rowMeans(heights[,2:10], na.rm=TRUE)
heights

# Repeated measures data can be in "long" or "wide" format.
# In long format, all heights are in a single column.
# Which height is in a row is indicated by "factors".
# Sometimes long format can be useful, e.g. for the plotting I am about to do.
heights.long <- reshape(heights,
    varying=2:10,      # which columns contain repeated measures
    v.names="estim",   # what should the new column be called
    timevar="rater",   # what should the new column be called
    times=1:9,         # what are the values to put here
    idvar="name",      # what should the new column be called
    ids=factor(rownames(heights)),  # what are the values to put here
    direction="long")

# For fun, let's sort the data.frame by the column 'name'
heights.long <- heights.long[order(heights.long$name),]
dim(heights.long)
heights.long

# This loads the lattice package for "trellis" graphics,
# an alternative set of plotting functions with powerful
# features and a consistent interface.
# library() (rather than require()) is preferred for loading: it stops with
# an error if the package is missing instead of silently returning FALSE.
library(lattice)

dotplot(name ~ GS + mean + estim, data=heights.long,
    auto.key=list(text=c("Gold Standard", "Mean", "Estimate")),
    par.settings=list(superpose.symbol=list(
        cex=c(1.2,1.0,1.0),
        pch=c(23,22,21),
        col=c("tomato", "chartreuse4", "darkcyan"),
        fill=c("tomato", "chartreuse4", "transparent"))))

# There are two ICC functions in external packages,
# but neither can account for missing values.
library(psy)
icc(heights[,2:10])
detach(package:psy)

library(irr)
icc(heights[,2:10])
detach(package:irr)

# We will do our own version of ICC for unbalanced data,
# making use of the following result:
#
# Handbook of tables for probability and statistics, 2nd ed., page 28
# -------------------------------------------------------------------
# Analysis of variance for the one-way classification
#
#                     df     ms       E[ms]
#   between groups    k-1    s_1^2    *
#   within groups     n.-k   s_2^2    \sigma^2
#   total             n.-1
#
#   * = \sigma^2 + \frac{1}{k-1} \left(
#         n. - \frac{\sum n_i^2}{n.} \right) \sigma^2_\alpha
#
#   n. = \sum n_i

hmat <- as.matrix(heights[,2:10])   # coerce to a matrix
k <- nrow(hmat)                     # k = number of subjects
n.i <- rowSums(!is.na(hmat))        # num of (non-missing) values/subject
mu.i <- rowMeans(hmat, na.rm=TRUE)  # subject-specific means

# Grand mean of all observations (the n_i-weighted mean of the subject
# means).  NOTE(review): the original used the unweighted mean(mu.i); the
# E[ms] result quoted above assumes the between-group sum of squares is
# taken about the overall mean, so the weighted form is required when the
# data are unbalanced (the two coincide for balanced data).
grand.mean <- sum(n.i * mu.i) / sum(n.i)

ss.between <- sum(n.i * (mu.i - grand.mean)^2)
df.between <- k-1
ms.between <- ss.between / df.between

# sweep() subtracts each subject's mean from that subject's row;
# check ?sweep for details.
ss.within <- sum((sweep(hmat, 1, mu.i))^2, na.rm=TRUE)
df.within <- sum(n.i) - k
ms.within <- ss.within / df.within

# Method-of-moments estimates of the variance components, obtained by
# equating the observed mean squares to the E[ms] column above.
sigma2.within <- ms.within
sigma2.between <- (ms.between - sigma2.within) * (k-1) /
    (sum(n.i) - (sum(n.i^2) / sum(n.i)))

sigma2.between / (sigma2.between + sigma2.within)  # ICC

# We can write our own function.
# Advantages:
#   1. Reusable
#   2. Code is more structured
#   3.
# Less clutter in the top-level namespace (try ls() to see)

# Intra-class correlation (ICC) from a one-way random-effects ANOVA,
# using method-of-moments variance-component estimates.
#
#   xmat : numeric matrix, one row per subject, one column per repeated
#          measurement; NAs are allowed (unbalanced data).
# Returns sigma2.between / (sigma2.between + sigma2.within).
myicc <- function(xmat) {
    k <- nrow(xmat)                     # k = number of subjects
    n.i <- rowSums(!is.na(xmat))        # non-missing values per subject
    mu.i <- rowMeans(xmat, na.rm=TRUE)  # subject-specific means

    # Grand mean of all observations (n_i-weighted mean of the subject
    # means).  NOTE(review): the original used the unweighted mean(mu.i),
    # which is only correct for balanced data; the E[ms] identity this
    # estimator relies on requires the overall mean.
    grand.mean <- sum(n.i * mu.i) / sum(n.i)

    ss.between <- sum(n.i * (mu.i - grand.mean)^2)
    df.between <- k-1
    ms.between <- ss.between / df.between

    # sweep() subtracts each subject's mean from its row.
    ss.within <- sum((sweep(xmat, 1, mu.i))^2, na.rm=TRUE)
    df.within <- sum(n.i) - k
    ms.within <- ss.within / df.within

    # Method-of-moments variance components (see the E[ms] table above).
    sigma2.within <- ms.within
    sigma2.between <- (ms.between - sigma2.within) * (k-1) /
        (sum(n.i) - (sum(n.i^2) / sum(n.i)))

    sigma2.between / (sigma2.between + sigma2.within)
}

myicc(hmat)

# We want to make sure our function works.  How about simulating some data?

# Simulate a k-by-n matrix from the one-way random-effects model:
# row means mu.i ~ N(mu, sigma2.between), entries ~ N(mu.i, sigma2.within).
sim.icc.data <- function(k, n, sigma2.between=1, sigma2.within=1, mu=0) {
    mu.i <- rnorm(k, mu, sqrt(sigma2.between))
    # Caution: vectors that are too short get 'recycled'.  Because the
    # matrix is filled column-wise (byrow=FALSE), the recycled mu.i lines
    # up with the rows, so row i has mean mu.i[i] in every column.
    xmat <- matrix(rnorm(k*n, mu.i, sqrt(sigma2.within)), k, n, byrow=FALSE)
    xmat
}

# This data should have an ICC of 0.9
sim.mat <- sim.icc.data(6, 9, sigma2.between=9, sigma2.within=1)
myicc(sim.mat)

# Another way to compute ICC for heights data.
# ICC is based on a random-effects model.
# We can use a linear mixed-effects model (lme) to fit this.
library(nlme)
lme.fit <- lme(estim ~ 1, data=na.omit(heights.long), random=~1 | name)
lme.fit