rm(list=ls())
library(boot)
library(mvtnorm)

###
#
# 8.4 Parametric bootstrap
#
###

###
#
# Example. Parametric bootstrap: Testing exponentiality of the distribution
#
###

# Back to the datasets on failures of air-conditioning equipment
(aircond <- c(aircondit$hours))
(aircond7 <- c(aircondit7$hours))
summary(aircond7)

# Test the hypothesis that aircond (aircond7) can be considered to be a sample
# from an exponential distribution.

x <- sort(aircond);      # sorted, as the KS formula below assumes
# x <- sort(aircond7);
lambdahat <- 1/mean(x);
n <- length(x)

# Kolmogorov-Smirnov test without taking into consideration that lambdahat
# is not fixed but estimated from the data
ks.test(x, y="pexp", lambdahat);

Fn <- ecdf(x)(x);
Ftheta <- pexp(x, rate=lambdahat)
x0 <- seq(0, 600, length=1000)
plot(x0, ecdf(x)(x0), type="l")
lines(x0, pexp(x0, rate=lambdahat), type="l", col="blue", lwd=3)

# Surely, to use ks.test like this is not allowed! We cannot simply plug an
# estimate of theta into the test statistic, because the asymptotic
# distribution is derived under the condition that theta under H_0 is known
# and fixed.

# Use the parametric bootstrap to approximate the distribution of the test
# statistic under H_0.

# Observed test statistics
(KS <- max(abs(Fn - Ftheta), abs(Fn[-n]-Ftheta[-1])))   # Kolmogorov-Smirnov
(CM <- mean((Fn-Ftheta)^2));                            # Cramer-von Mises
(AD <- mean((Fn-Ftheta)^2/(Ftheta*(1-Ftheta))));        # Anderson-Darling

B <- 999;
KSb <- numeric(B);
CMb <- numeric(B);
ADb <- numeric(B);

set.seed(1234567)
for(i in 1:B){
  # Bootstrap sample from the fitted exponential distribution; rounded
  # because the original data are recorded in whole hours
  xb <- sort(round(rexp(n, lambdahat)));
  lambdahatb <- 1/mean(xb);
  Fnb <- ecdf(xb)(xb);
  Fthetab <- pexp(xb, rate=lambdahatb);
  KSb[i] <- max(abs(Fnb - Fthetab), abs(Fnb[-n]-Fthetab[-1]));
  CMb[i] <- mean((Fnb-Fthetab)^2);
  ADb[i] <- mean((Fnb-Fthetab)^2/(Fthetab*(1-Fthetab)));
}

# p-values (each bootstrap statistic is compared with its own observed value)
print(paste0("KS test: ", (sum(KSb >= KS)+1)/(B+1)))
print(paste0("CM test: ", (sum(CMb >= CM)+1)/(B+1)))
print(paste0("AD test: ", (sum(ADb >= AD)+1)/(B+1)))
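# Aside (a sketch, not part of the original script): the same parametric
# bootstrap can also be organized with boot::boot and sim="parametric".
# The helper names 'ks.stat' and 'exp.gen' are illustrative choices; this
# version omits the rounding of the simulated values used above.
ks.stat <- function(data){
  data <- sort(data); n <- length(data)
  Fn <- (1:n)/n
  Fth <- pexp(data, rate = 1/mean(data))
  max(abs(Fn - Fth), abs(Fn[-n] - Fth[-1]))   # same statistic as above
}
exp.gen <- function(data, mle) rexp(length(data), rate = mle)
ksb <- boot(x, statistic = ks.stat, R = 999,
            sim = "parametric", ran.gen = exp.gen, mle = lambdahat)
# Bootstrap p-value, computed as above
(sum(ksb$t >= ksb$t0) + 1)/(ksb$R + 1)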
###
#
# Example. Testing exponentiality of the distribution - simulation study
#          Consequences of ignoring that the parameter under H_0 is unknown
#
###

# set.seed(1234567)
n <- 12;          # sample size
# nopak <- 10000;
nopak <- 200;     # number of simulated datasets
B <- 999;
pval.KS <- numeric(nopak);
pval.KS.parboot <- numeric(nopak);

# winProgressBar is Windows-only; on other platforms use txtProgressBar
pb <- winProgressBar(title = "progress bar", min = 0, max = nopak, width = 300)
for(i in 1:nopak){
  setWinProgressBar(pb, i, title=paste(round(i/nopak*100, 0), "% done"))
  x <- sort(100*rexp(n));
  # x <- sort(rgamma(n, 2, scale=50));
  lambdahat <- 1/mean(x);
  pval.KS[i] <- ks.test(x, y="pexp", lambdahat)$p.value;
  Fn <- ecdf(x)(x);
  Ftheta <- pexp(x, rate=lambdahat)

  # Observed test statistic
  KS <- max(abs(Fn - Ftheta), abs(Fn[-n]-Ftheta[-1]))

  KSb <- numeric(B);
  # Bootstrap - programmed with the help of matrix calculations in order
  # to avoid another for-cycle
  Xb <- matrix(rexp(n*B, lambdahat), n, B);
  Xb <- apply(Xb, 2, sort);
  Fn <- matrix((1:n)/n, n, B);
  lambda.star <- 1/colMeans(Xb);
  Lambda.star <- matrix(lambda.star, n, B, byrow=TRUE)
  Ftheta.star <- 1-exp(-Lambda.star*Xb)
  Diff1 <- abs(Fn - Ftheta.star)
  Diff2 <- abs(Fn[-n,] - Ftheta.star[-1,])
  Diff <- rbind(Diff1, Diff2)
  # KSb <- pmax(apply(Diff1, 2, max), apply(Diff2, 2, max));
  KSb <- apply(Diff, 2, max);

  # The previous procedure does the same as the following:
  # compute the KS test statistic for each bootstrap column
  #
  # test.KS <- function(x){
  #   x <- sort(x);
  #   lambdahat.star <- 1/mean(x);
  #   Fn <- ecdf(x)(x);
  #   Ftheta <- pexp(x, rate=lambdahat.star)
  #   max(abs(Fn - Ftheta), abs(Fn[-n]-Ftheta[-1]));
  # }
  # KSb <- apply(Xb, 2, test.KS);

  pval.KS.parboot[i] <- (sum(KSb >= KS)+1)/(B+1);
}
close(pb)

# Level of the KS-test ignoring that the parameters are estimated
mean(pval.KS <= 0.05)
# When ignoring that theta is estimated, the usual tests are extremely
# conservative (they reject H_0 much less often than they should, i.e. they
# have little power to reject when H_0 is not true)

# Level of the KS-test with the help of the parametric bootstrap
mean(pval.KS.parboot <= 0.05)
# The parametric bootstrap works well; the level is close to the nominal
# level 0.05

hist(pval.KS, breaks=50, prob=TRUE,
     main="Simulated p-values: Ignoring estimation")
hist(pval.KS.parboot, breaks=50, prob=TRUE,
     main="Simulated p-values: Parametric bootstrap")
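# Aside (a sketch, not part of the original script): under H_0, valid
# p-values are approximately Uniform(0,1), so the empirical rejection rates
# can be compared with several nominal levels at once; 'alphas' is an
# illustrative choice.
alphas <- c(0.01, 0.05, 0.10)
rbind(nominal  = alphas,
      naive.KS = sapply(alphas, function(a) mean(pval.KS <= a)),
      parboot  = sapply(alphas, function(a) mean(pval.KS.parboot <= a)))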
###
#
# Example. Testing equality of expected values of two independent samples
#
###

# A. Case study
load("Iq-en.RData")

# Test the hypothesis that the expected average marks at the seventh grade
# ('mark7') are the same for boys and girls
boxplot(Iq$mark7 ~ Iq$Gender, ylab="average mark");

# Two-sample t-test under the equal-variance assumption
t.test(Iq$mark7 ~ Iq$Gender, var.equal = TRUE);
# Two-sample t-test without the equal-variance assumption (Welch)
t.test(Iq$mark7 ~ Iq$Gender);
# Wilcoxon test
wilcox.test(Iq$mark7 ~ Iq$Gender);

# Bootstrap tests
B <- 1999;
x <- Iq$mark7[Iq$Gender=="boy"];
y <- Iq$mark7[Iq$Gender=="girl"];
n1 <- length(x);
n2 <- length(y);

# Nonparametric bootstrap test
# !!! resampling from distributions under H0 !!!
# This is really needed, because we have to generate bootstrap replicates
# under the null hypothesis
xc <- x - mean(x);
yc <- y - mean(y);

# Test statistic
(t1 <- t.test(x, y)$statistic);

t1b <- numeric(B);
for(i in 1:B){
  xb <- sample(xc, size=n1, replace=TRUE);
  yb <- sample(yc, size=n2, replace=TRUE);
  t1b[i] <- t.test(xb, yb)$statistic;
}
# p-value
(sum(abs(t1b) >= abs(t1)) + 1)/(B+1)

# Permutation test (see Section 8.6)
z <- c(x,y);
# z <- c(xc,yc);
t1b <- numeric(B);
for(i in 1:B){
  zb <- sample(z);      # random permutation
  xb <- zb[1:n1];
  yb <- zb[(n1+1):(n1+n2)];
  t1b[i] <- t.test(xb, yb)$statistic;
}
# p-value
(sum(abs(t1b) >= abs(t1)) + 1)/(B+1)

# B. Simulation study
# nopak <- 20000;
nopak <- 1000;
B <- 999;
sigma1 <- sd(x);
sigma2 <- sd(y);
n1 <- 10;
n2 <- 5;
n <- n1+n2;
ttest.pvalue <- numeric(nopak);
twelch.pvalue <- numeric(nopak);
tboot.pvalue <- numeric(nopak);
tperm.pvalue <- numeric(nopak);

# winProgressBar is Windows-only; on other platforms use txtProgressBar
pb <- winProgressBar(title = "progress bar", min = 0, max = nopak, width = 300)
for(i in 1:nopak){
  setWinProgressBar(pb, i, title=paste(round(i/nopak*100, 0), "% done"))

  # A. Case of the same distributions
  # z1 <- sigma1 * rnorm(n1);
  # z2 <- sigma2 * rnorm(n2);
  # B. Case of different distributions (equal means, both equal to 1)
  z1 <- rexp(n1)
  z2 <- rgamma(n2, shape=4, scale=1/4)

  ttest <- t.test(z1, z2, var.equal=TRUE);
  ttest.pvalue[i] <- ttest$p.value;
  twelch <- t.test(z1, z2);
  twelch.pvalue[i] <- twelch$p.value;

  z1c <- z1 - mean(z1);
  z2c <- z2 - mean(z2);

  # Bootstrap test
  Z1b <- matrix(sample(z1c, size=n1*B, replace=TRUE), nrow=n1, ncol=B)
  Z2b <- matrix(sample(z2c, size=n2*B, replace=TRUE), nrow=n2, ncol=B)
  Z1b.mean <- colMeans(Z1b);
  Z2b.mean <- colMeans(Z2b);
  var1.b <- apply(Z1b, 2, var)
  var2.b <- apply(Z2b, 2, var)
  tboot.pvalue[i] <- (1 + sum(abs(Z1b.mean - Z2b.mean)/
      sqrt(var1.b/n1 + var2.b/n2) >= abs(twelch$statistic)))/(B+1);

  # Permutation test
  zc <- c(z1c, z2c);
  Zc <- apply(matrix(zc, n, B), 2, sample)
  Z1b <- Zc[1:n1,];
  Z2b <- Zc[(n1+1):n,];
  Z1b.mean <- colMeans(Z1b);
  Z2b.mean <- colMeans(Z2b);
  var1.b <- (colMeans(Z1b^2) - Z1b.mean^2)*n1/(n1-1);
  var2.b <- (colMeans(Z2b^2) - Z2b.mean^2)*n2/(n2-1);
  tperm.pvalue[i] <- (1 + sum(abs(Z1b.mean - Z2b.mean)/
      sqrt(var1.b/n1 + var2.b/n2) >= abs(twelch$statistic)))/(B+1);
}
close(pb)

# Empirical levels at the nominal level 0.05
colMeans(cbind(ttest.pvalue, twelch.pvalue, tboot.pvalue, tperm.pvalue) <= 0.05)

hist(ttest.pvalue, main="Simulated p-values: t-test")
hist(twelch.pvalue, main="Simulated p-values: Welch t-test")
hist(tboot.pvalue, main="Simulated p-values: Nonparametric bootstrap")
hist(tperm.pvalue, main="Simulated p-values: Permutation test")

# Recorded results:
# A.  n1 = 28, n2 = 14, X ~ N(0, 0.53), Y ~ N(0, 0.14), B = 999, nopak = 10000
#       ttest.pvalue twelch.pvalue tboot.pvalue tperm.pvalue
#             0.0175        0.0474       0.0464       0.0460
# B1. n1 = 28, n2 = 14, X ~ Exp(1), Y ~ Gamma(4, 1/4), B = 999, nopak = 10000
#       ttest.pvalue twelch.pvalue tboot.pvalue tperm.pvalue
#             0.0315        0.0508       0.0465       0.0488
# B2. n1 = 14, n2 = 7, X ~ Exp(1), Y ~ Gamma(4, 1/4), B = 999, nopak = 10000
#       ttest.pvalue twelch.pvalue tboot.pvalue tperm.pvalue
#             0.0315        0.0508       0.0465       0.0488
# B3. n1 = 10, n2 = 5, X ~ Exp(1), Y ~ Gamma(4, 1/4), B = 999, nopak = 20000
#       ttest.pvalue twelch.pvalue tboot.pvalue tperm.pvalue
#            0.04450       0.04570      0.03545      0.05015
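# Aside (a sketch, not part of the original script): the centered
# nonparametric bootstrap test above can be packaged as a reusable function;
# the name 'np.boot.test' is an illustrative choice.
np.boot.test <- function(x, y, B = 1999){
  t.obs <- t.test(x, y)$statistic
  xc <- x - mean(x); yc <- y - mean(y)   # centering enforces H0: equal means
  t.b <- replicate(B, t.test(sample(xc, replace = TRUE),
                             sample(yc, replace = TRUE))$statistic)
  (sum(abs(t.b) >= abs(t.obs)) + 1)/(B + 1)
}
# Example call on the case-study data:
# np.boot.test(x, y)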
###
#
# Bootstrap for multivariate estimators
#
###

n = 100
nu = 5
# Multivariate t-distribution with nu degrees of freedom
X = rmvt(n, df=nu, sigma=diag(2))
# X[,1] = rnorm(n)
# X[,2] = X[,1]^2 + rnorm(n) - 1
plot(X, xlab="", ylab="", main="Observations", cex=.5)

m = colMeans(X)
B = 100

# Simple nonparametric bootstrap for the vector of means
m. = matrix(nrow=B, ncol=2)
for(b in 1:B){
  ii = sample(n, size=n, replace=TRUE);
  X. = X[ii,]
  m.[b,] = colMeans(X.)
}

points(m., pch=16, cex=.5, col=2)
points(m[1], m[2], cex=1, col=3, pch=16)
legend("bottomright", c("Bootstrap estimates", "Original estimate"),
       col=c(2,3), pch=16)

# True variance of m under the multivariate t with nu degrees of freedom
diag(rep(nu/(nu-2)/n, 2))
# Estimated variance from the CLT
var(X)/n
# Bootstrap-estimated variance of m
var(m.)

plot(m., xlab="", ylab="", main="Bootstrap Estimates", col=2, cex=.5)
points(0, 0, pch=3, lwd=5)
points(m[1], m[2], col=2, pch=3, lwd=5)
points(mean(m.[,1]), mean(m.[,2]), col=3, pch=3, lwd=5)
legend("bottomright",
       c("Bootstrap estimates", "True theta", "Estimated theta",
         "Bootstrap estimate of the expectation"),
       col=c(2,1,2,3), pch=c(1,3,3,3))
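# Aside (a sketch, not part of the original script): coordinate-wise 95%
# percentile bootstrap confidence intervals for the mean vector, read off
# directly from the bootstrap replicates stored in m.
apply(m., 2, quantile, probs = c(0.025, 0.975))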