12

rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

  • Upload
    others

  • View
    2

  • Download
    0

Embed Size (px)

Citation preview

Page 1: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames
Page 2: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# example 6.2 (2^4)

# Start from a clean workspace so objects left over from earlier examples
# cannot leak into this one.
# NOTE(review): this wipes the whole global environment; drop it if this
# script is ever source()d from a session that holds other work.
rm(list = ls(all.names = TRUE))
library(AlgDesign) # for gen.factorial()

# Full 2^4 factorial design in the four factors A, B, C, D.
# factors = "all" returns them as factors with levels 1/2, which is what the
# A == 1 / A == 2 subsetting used later in this script relies on.
design = gen.factorial(2, 4, varNames = c("A", "B", "C", "D"),
                       factors = "all")
# Alternative with numeric -1/+1 coding:
# design = gen.factorial(2, 4, varNames = c("A", "B", "C", "D")) # -1/+1

# NOTE(review): attach() is kept because later parts of this script refer to
# A, B, C, D as bare names (e.g. y[A == 1]).
attach(design) # Makes the names in design visible.
design         # Table 6.10.

# Responses, one per row of `design`.
y = c(45, 71, 48, 65, 68, 60, 80, 65, 43, 100, 45, 104, 75, 86, 70, 96)

# Helmert contrasts for every factor; the coefficients reported by
# summary() depend on this choice.
contr = "contr.helmert"

lm.1 = lm(y ~ A * B * C * D, # R expands A*B*C*D to the full factorial model.
          contrasts = list(A = contr, B = contr, C = contr, D = contr))

summary.aov(lm.1) # SS Table 6.12

#             Df Sum Sq Mean Sq
# A            1 1870.6  1870.6
# B            1   39.1    39.1
# C            1  390.1   390.1
# D            1  855.6   855.6
# A:B          1    0.1     0.1
# A:C          1 1314.1  1314.1
# B:C          1   22.6    22.6
# A:D          1 1105.6  1105.6
# B:D          1    0.6     0.6
# C:D          1    5.1     5.1
# A:B:C        1   14.1    14.1
# A:B:D        1   68.1    68.1
# A:C:D        1   10.6    10.6
# B:C:D        1   27.6    27.6
# A:B:C:D      1    7.6     7.6
# Note: No SSE.

residuals(lm.1) # zero residuals.

# Scatter plot of observed y against the lm.1 predictions, with the 45-degree
# line for reference, written to example6_2_scatter.png.
# I.e., perfect predictions!
png("example6_2_scatter.png")
plot(predict(lm.1), y)
abline(0, 1)
dev.off() # Close the device so the file is actually written.

################################################################

Page 3: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

summary(lm.1) # In R the coeffs depend on contrasts.

# Coefficients:
#              Estimate
# (Intercept)   70.0625  = mean(y)
# A1            10.8125  = ( mean(y[A==2]) - mean(y[A==1]) )/2 prop to effect.
# B1             1.5625  = ( mean(y[B==2]) - mean(y[B==1]) )/2
# C1             4.9375
# D1             7.3125
# A1:B1          0.0625
# A1:C1         -9.0625
# B1:C1          1.1875
# A1:D1          8.3125
# B1:D1         -0.1875
# C1:D1         -0.5625
# A1:B1:C1       0.9375
# A1:B1:D1       2.0625
# A1:C1:D1      -0.8125
# B1:C1:D1      -1.3125
# A1:B1:C1:D1    0.6875

Page 4: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# Effect estimates: twice the fitted coefficients of lm.1.
eff = 2 * coef(lm.1)
eff = eff[-1] # Exclude the grand mean.
as.matrix(eff) # Print as a one-column matrix.

# A1           21.625   # 2*coeffs = main effects (effects in Table 6.12)
# B1            3.125
# C1            9.875
# D1           14.625
# A1:B1         0.125
# A1:C1       -18.125
# B1:C1         2.375
# A1:D1        16.625
# B1:D1        -0.375
# C1:D1        -1.125
# A1:B1:C1      1.875
# A1:B1:D1      4.125
# A1:C1:D1     -1.625
# B1:C1:D1     -2.625
# A1:B1:C1:D1   1.375

# lm.1$effects/2 # gives same answers as above, but with wrong signs!

# Index plot of the effects, labelled by effect name, with a zero line.
png("example6_2_effects.png")
plot(eff, axes = FALSE)
axis(1, labels = names(eff), at = seq_along(eff), cex.axis = .5)
box()
abline(h = 0)
dev.off()

# Normal qq-plot of the effects with a reference line (red): intercept is the
# median effect, slope 4 was chosen by eye (see the note below).
png("example6_2_qq.png")
qqnorm(eff)
abline(median(eff), 4, col = 2)
dev.off()

# Because SSE cannot be estimated, we can't do tests. But
# Daniel (1959) hypothesized that nonsignificant effects are those
# whose effect size follows a normal distribution about zero.
# Then, nonsignificant effects should fall on a straight line
# in a qq-plot. It's unlikely that the intercept of the qq plot
# will be zero. So, to provide a better visual assessment of
# the straight line, I plot a straight line with an intercept
# given by the median of the effects. As for the slope, I find
# it by trial and error!
# The smallest effect, and the top 4 effects are declared as
# the significant ones. According to the following:
# they are AC, C, D, AD, and A (from lowest to highest effect).

Page 5: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames
Page 6: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# Box plots for the effects singled out by the qq-plot, on a 2 x 3 grid.
# Relies on y and on A, C, D being visible as bare names (via attach(design)).
png("example6_2_plots.png")
par(mfrow = c(2, 3))
ylim = c(40, 110) # Common y-range so panels are directly comparable.

# Main effects: low vs. high level of A, C and D.
boxplot(y[A == 1], y[A == 2], ylim = ylim, xlab = "A")
boxplot(y[C == 1], y[C == 2], ylim = ylim, xlab = "C")
boxplot(y[D == 1], y[D == 2], ylim = ylim, xlab = "D")

# AC interaction: A at low C, with A at high C overlaid as narrower boxes in
# colour 2.
# NOTE(review): the title used to read "D-/D+", apparently copied from the
# AD panel below; this panel conditions on C.
boxplot(y[A == 1 & C == 1], y[A == 2 & C == 1],
        ylim = ylim, xlab = "A", main = "C-/C+")
boxplot(y[A == 1 & C == 2], y[A == 2 & C == 2],
        add = TRUE, col = 2, boxwex = 0.5)

# AD interaction: A at low D, with A at high D overlaid the same way.
boxplot(y[A == 1 & D == 1], y[A == 2 & D == 1],
        ylim = ylim, xlab = "A", main = "D-/D+")
boxplot(y[A == 1 & D == 2], y[A == 2 & D == 2],
        add = TRUE, col = 2, boxwex = 0.5)
dev.off()

Page 7: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# Discard B

# Refit on A, C and D only, with the contrast named by `contr` applied to
# each of the three factors.
lm.2 = lm(y ~ A*C*D, contrasts = list(A=contr,C=contr,D=contr))

lm.2$model # Discarding B causes replication n=2

#      y A C D
# 1   45 1 1 1
# 2   71 2 1 1
# 3   48 1 1 1
# 4   65 2 1 1
# 5   68 1 2 1
# 6   60 2 2 1
# 7   80 1 2 1
# 8   65 2 2 1
# 9   43 1 1 2
# 10 100 2 1 2
# 11  45 1 1 2
# 12 104 2 1 2
# 13  75 1 2 2
# 14  86 2 2 2
# 15  70 1 2 2
# 16  96 2 2 2

# Default plots for the fitted model lm.2.
plot(lm.2) # all look OK.

# ANOVA table for lm.2; a Residuals line is now available (see table).
summary(aov(lm.2))

#             Df Sum Sq Mean Sq F value   Pr(>F)
# A            1 1870.6  1870.6  83.368 1.67e-05 ***
# C            1  390.1   390.1  17.384 0.003124 **
# D            1  855.6   855.6  38.131 0.000267 ***
# A:C          1 1314.1  1314.1  58.565 6.00e-05 ***
# A:D          1 1105.6  1105.6  49.273 0.000110 ***
# C:D          1    5.1     5.1   0.226 0.647483
# A:C:D        1   10.6    10.6   0.471 0.512032
# Residuals    8  179.5    22.4

# Twice the lm.2 coefficients, printed as a one-column matrix.
# coef() replaces the partially matched `$coef`; the former `col=1` argument
# was not an as.matrix() parameter and had no effect.
as.matrix(2 * coef(lm.2))

# (Intercept)  140.125   # These are the coeffs given on page 260.
# A1            21.625   # BUT with C*D and A*C*D still included in model.
# C1             9.875
# D1            14.625
# A1:C1        -18.125
# A1:D1         16.625
# C1:D1         -1.125
# A1:C1:D1      -1.625

Page 8: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# Reduced model: main effects of A, C, D plus the AC and AD interactions
# (C:D and A:C:D dropped), with the same contrast coding as before.
lm.3 = lm(y ~ A + C + D + A*C + A*D,
          contrasts = list(A = contr, C = contr, D = contr))

# Twice the coefficients, printed as a one-column matrix.
as.matrix(2 * coef(lm.3))

# (Intercept)  140.125   # In this case, the coeffs don't change.
# A1            21.625
# C1             9.875
# D1            14.625
# A1:C1        -18.125
# A1:D1         16.625

# Fitted values of the reduced model, one per run.
as.matrix(predict(lm.3))

# 1    46.250   # pages 261
# 2    69.375
# 3    46.250
# 4    69.375
# 5    74.250
# 6    61.125
# 7    74.250
# 8    61.125
# 9    44.250
# 10  100.625
# 11   44.250
# 12  100.625
# 13   72.250
# 14   92.375
# 15   72.250
# 16   92.375

# Scatter plot of the lm.3 predictions against observed y, with the 45-degree
# line for reference, written to example6_2_scatter2.png.
png("example6_2_scatter2.png")
plot(y, predict(lm.3))
abline(0, 1)
dev.off()

Page 9: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

Today's questions about "How can one use Daniel's conjecture if all the effects are significant?" made me wonder! Recall that we can always arrange for nonsignificant effects to exist - just include factors in the model that are "random," i.e., unrelated to the response. Alternatively, one can use the idea behind the permutation/randomization test we studied early in the quarter. Let's see if we can, using the data in example 6_2 discussed in class.

a) Compute the 15 (2^4 - 1) effects (excluding the intercept), and call them eff_obs.

b) Randomly shuffle the y values ntrial = 100 times, each time computing the 15 effects. Store these effects in a 100 x 15 matrix called eff

Each row of eff has some qqplot, and so we have a 100 qqplots which we can draw on a single figure, as a cloud of points, which display the region in the qqplot that is consistent with the null-hypothesis of no-effect. Each of these qqplots has some intercept and slope estimating the mean and the standard deviation of the population of effects.

c) Compute the typical mean and typical standard deviation of the 100 qqplots, and call them typ_mean and typ_sd, respectively. You may use mean or median as a measure of "typical."

We want to compare the qqplot of the observed data with the cloud of null qqplots, and so we need to use a unique set of quantiles on the x-axis of the qqplot. Let's agree to use the quantiles of N( typ_mean, typ_sd/sqrt(15) ). (I think you will see the reason for 1/sqrt(15) later, if not already.)

d) Using the "by hand" method of making qqplots from lab2, make a single figure that shows the cloud of all 100 qqplots. Don't forget to use the quantiles we just agreed upon.

e) Superimpose the qqplot of eff_obs (in red color), and draw a diagonal line (in red). (The reason we're drawing a diagonal line, i.e., intercept = 0, slope = 1, is that we've already decided what quantiles should be on the x-axis.)

f) Comment on the figure. Remember why we did all this! Specifically, comment on whether the figure we have made is consistent with the 5 significant effects we selected in class.

I'll make a confession: Some facets of the figure we've made don't make sense to me. Either I've made a mistake and you guys will catch it; or we (including me) will learn something new. Let's see.

Page 10: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

# Consider the data in example 7.1, all by R.# a) Before any blocking, the data set looks like that shown at the beginning of section 6.2 . Note that there are n=3 replications. # Develop a 2^2 CRD full model, and produce the anova table shown in Table 6.1.

# b) Report the A effect, the B effect, and the AB effect. Recall that the value of the effects depends on what contrast we use, but you # can use the default R contrasts.

# c) Now run an RCBD model with replication being blocked.

# Note that the ss table for RCBD (for 2 treatment factors and 1 block factor) is exactly the same as that of a 3-factor model, except we# just don't include in the 3-factor model any interactions between treatment and blocks. However, we could include interactions, because# in this case there is enough df in the data to estimate everything and SSE. Try it on your own, something like lm(y~A+B+A:B + C+A:C)

# d) Report the A, B and AB effects, again using the default contrasts.

# You will find that the effects A, B, AB are the same in CRD and RCBD. But the SS values and the corresponding p-values are different.# Also, note that whereas A and B (and even AB) have a single number associated with them (because they all have df=1) there are TWO # effects shown for the block factor (because it has df=3-1=2). This is all consistent with what we said in the lecture.

# You will also find that, based on comparing the anova tables for CRD and RCBD models, it seems like including the block factor in the # model does reduce the SSE (from 31.33 to 24.83), as it is supposed to. But according to the p-values, the reduction in SSE is not # sufficient to make any of the nonsignificant factors (e.g. AB) significant. Furthermore, blocking does not seem to make any of the # significant factors more significant either. All of this is consistent with the finding that the p-value of the block factor itself is nonsignificant,# i.e. blocking appears to have been unnecessary in this case.

# In words, the nonsignificance of the block factor suggests that the data within each block may not be more similar to each other than data # between/across the blocks. Let's test that hypothesis using the randomization idea.

# First, record the value of sse in the RCBD model above, and call it sse_obs. Then, ask yourself this question: If the blocks are truly
# unnecessary (i.e., if the data within each block are just as similar as data between blocks), then what is the distribution of sse? We can
# actually build the appropriate empirical distribution of sse by permuting the data between the blocks (but not within the blocks).

# e) Write code to build the empirical distribution of sse (i.e. a histogram of sse) under the null hypothesis that the populations are the same
# across blocks. Hint: If we read the data into a matrix:
#   y.m = matrix(c(28,36,18,31, 25,32,19,30, 27,32,23,29 ), ncol=3, byrow=F)
# the randomization we are looking for permutes the numbers *between* the columns, i.e. *on each row.* It does NOT permute numbers on
# different rows. Set the number of trials to 5000. Hint2: Check out our previous labs.

# f) From the histogram in the previous part, and the observed value of sse, compute the appropriate area (i.e., p-value) for testing the # hypothesis that the blocks have no effect. Hint: You have to think a bit to decide if the right or left area is appropriate.

Page 11: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames
Page 12: rm(list=ls(all=TRUE)) design = gen.factorial(2,4,varNames

�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������