rm(list=ls()); ### # # # 10. Nonparametric regression # # ### library(MASS); library(locpol); ### # # 1. Motorcycle data # ### data(mcycle); # A data frame giving a series of measurements of head acceleration # in a simulated motorcycle accident, used to test crash helmets. # times: in milliseconds after impact. # accel: in g. plot(mcycle); X <- mcycle$times; Y <- mcycle$accel; x0 <- seq(min(X), max(X), by=0.1); # A. Fitting by global polynomials # A naive function to fit polynomials of order p # for numerical reasons, it would be better to use the function 'poly' fit.pol <- function(p, X, Y, COLOR="blue", draw=TRUE){ n <- length(X) XX <- matrix(NA, n, p) for(i in 1:p){ XX[,i] <- X^(i); } fit <- lm(Y~XX); print(summary(fit)); XX0 <- matrix(NA, length(x0), p) for(i in 1:p){ XX0[,i] <- x0^(i); } fitted <- cbind(1,XX0)%*%fit$coef; if(draw){ lines(x0, fitted, col=COLOR, lwd=2); } invisible(fitted) } plot(mcycle); fit.pol(2, X, Y, COLOR="red"); fit.pol(4, X, Y, COLOR="orange"); fit.pol(6, X, Y, COLOR="yellow"); fit.pol(8, X, Y, COLOR="green"); fit.pol(10, X, Y, COLOR="blue"); fit.pol(12, X, Y, COLOR="darkblue"); legend("bottomright", lty=rep(1, 6), col=c("red", "orange", "yellow", "green", "blue", "darkblue"), legend=paste("p=", seq(2,12, by=2))) # B. Local constant and local linear fiting x0 <- seq(min(X), max(X), by=0.1); # Nadaraya - Watson's estimator # Local constant estimator, in the window we consider we take the mean of all # observations within the window as the resulting regression estimator h = 5 d = 150 xe2 = seq(5,55,length = d) ye = rep(NA,d) for(i in 1:d){ x = xe2[i] plot(mcycle, main = "Nadaraya - Watson (loc. pol. with p=0)") abline(v=x, lty=2) abline(v=x+c(-h,h), lty = 3, lwd=2) II = (x-h<=mcycle$times)&(x+h>=mcycle$times) points(mcycle[II,], col = 2, pch = 16) segments(x-h,ye[i]<-mean(mcycle[II,2]), x+h, ye[i], col = 2, lwd = 2) points(x,ye[i], pch = 15, cex=1.5, col = 2) lines(xe2[1:i],ye[1:i],col=2) Sys.sleep(10/d) } # Local Linear estimator (p=1) # For this estimator, we fit a linear function to all data within the # observation window. The point estimator of the regression function # (locally an intercept when properly shifted) gives an estimator of the # regression function. The slope, at the same time, gives a reasonable # estimator of the first derivative of the regression function h = 15 # try also h = .5, or h = 50 for(i in 1:d){ x = xe2[i] plot(mcycle, main = "Local Linear (loc. pol. with p=1)") abline(v=x, lty=2) abline(v=x+c(-h,h), lty = 3, lwd=2) II = (x-h<=mcycle$times)&(x+h>=mcycle$times) points(mcycle[II,], col = 2, pch = 16) # fit.lm = lm(Y[II]~I(X[II]-x)) ye[i] = fit.lm$coef[1] # segments(x-h,ye[i]-h*fit.lm$coef[2], x+h, ye[i]+h*fit.lm$coef[2], col = 2, lwd = 2) points(x,ye[i], pch = 15, cex=1.5, col = 2) lines(xe2[1:i],ye[1:i],col=2) Sys.sleep(10/d) } # The previous illustrations were shown for the rectangular kernel # (only points within h from x are considered, all with the same weights) # Of course, as in kernel density estimation, other kernels and weighting # schemes could be used analogously fit = locpol(Y~X,deg=1, data=data.frame(X,Y), xeval=x0, kernel = EpaK, bw=h) lines(fit$lpFit[,1], fit$lpFit[,2], lwd=2, col=3) legend("bottomright",c("p=1 (rect. kernel)", "p=1 (Epan. kernel)"), col=c(2,3), lwd=2) ### # # Work with package locpol # ### # Local linear estimator p = 1 fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK); # does not work. # The bandwidth is found by cross-validation. By default the bandwidth h is # searched for in the interval '.lokestOptInt' which ranges from 0.0005 to # 1.5000, which causes often troubles. # Banwdith choice through cross-validation (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(1,15))); # Rule of thumb (ROT) (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # ROT - manual calculations n <- nrow(mcycle); fit <- lm(Y~X+I(X^2)+I(X^3)+I(X^4)) # sigmatilde2 <- sum(residuals(fit)^2)/(n-5); sigmatilde2 <- sum(residuals(fit)^2)/n; ddm <- cbind(0,0,1, X,X^2)%*%(coef(fit)*(0:4)*(-1:3)) (RK(EpaK)*sigmatilde2/(n*mu2K(EpaK)^2 * mean(ddm^2)))^(1/5) # Be careful with the scaling thumbBw(X, Y, deg=1, kernel=EpaK) thumbBw(2*X, Y, deg=1, kernel=EpaK) thumbBw(3*X, Y, deg=1, kernel=EpaK) # We use bandwidth chosen through cross-validation BW <- bw.CV; # Local linear estimator p = 1 - the second attempt fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW); # Plots associatioted with 'locpol' object # Beware! plot(locpol) alters the global plotting parameters op = par(mfrow=c(1,1)) plot(1,1) plot(fit.LL) plot(1,1) # to fix this issue it must be wrapped in # op = par() # save the current plotting parameters # and par(op) # change the plotting parameters back to op # plot(1,1) # Now it's ok again # Local constant estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y); lines(x0, fit.LL$lpFit$Y, col="green", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="darkblue"); legend("bottomright", lty=rep(1, 3) , col=c("green", "red", "darkblue"), legend=c("local linear", "local constant (NW)", "polynomial of order 12"), lwd=rep(2,3)) # Comparison of the weights for the local linear smoother, and the # Nadaraya-Watson smoother weights.NW <- locCteWeightsC(x=X, xeval=X, kernel=EpaK, bw=bw.CV); weights.LL <- locLinWeightsC(x=X, xeval=X, kernel=EpaK, bw=bw.CV); # par(mfrow=c(3,3)); indexy <- round(seq(1,133,length=9)); for(i in 1:9){ qqq <- as.logical(weights.NW$locWeig[indexy[i],] > 1e-10) | as.logical( abs(weights.LL$locWeig[indexy[i],]) > 1e-10) plot(jitter(weights.NW$locWeig[indexy[i],qqq]), jitter( weights.LL$locWeig[indexy[i],qqq]), cex=0.55, pch=16, main=paste("Weights at the point x =", X[indexy[i]]), ylab="Local linear weights", xlab="Nadaraya-Watson weights"); } par(mfrow=c(1,1)) ### # # Comparison of local polynomial estimation for p=0,1,2,3 # ### compare.lp <- function(BW){ # # Local polynomial fits fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW); fit.LL2 <- locpol(Y~X, deg=2, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW); fit.LL3 <- locpol(Y~X, deg=3, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW); # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); plot(X, Y, main=paste("Banwdith:", round(BW,2))); lines(x0, fit.NW$beta0, col="red", lwd=2) lines(x0, fit.LL$lpFit$Y, col="green", lwd=2); lines(x0, fit.LL2$lpFit$Y, col="blue", lwd=2); lines(x0, fit.LL3$lpFit$Y, col="darkblue", lwd=2); legend("bottomright", lty=rep(1, 4), col=c("red", "green", "blue", "darkblue"), legend=c("local constant (NW)", "local linear", "local quadratic", "local cubic")) } # # CV, p=1 (BW <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(1,15))); compare.lp(BW) # # CV, p=3 (BW <- regCVBwSelC(X, Y, deg=3, kernel=EpaK, interval=c(1,15))); compare.lp(BW) # # Comparison of local polynomial estimation for p=0,1,2,3 # # (for each p the bandwidth is chosen by CV) # # Local polynomial fits (BW1 <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(1,15))); fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW1); (BW2 <- regCVBwSelC(X, Y, deg=2, kernel=EpaK, interval=c(1,15))); fit.LL2 <- locpol(Y~X, deg=2, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW2); (BW3 <- regCVBwSelC(X, Y, deg=3, kernel=EpaK, interval=c(1,15))); fit.LL3 <- locpol(Y~X, deg=3, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=BW3); # Local linear estimator p = 0 (Nadaraya-Watson) BW0 <- regCVBwSelC(X, Y, deg=0, kernel=EpaK, interval=c(1,15)); fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=BW0); plot(X, Y, main=paste("Banwidths:", round(BW0,2), ",", round(BW1,2), ",", round(BW2,2), ",", round(BW3,2))); lines(x0, fit.NW$beta0, col="red", lwd=2) lines(x0, fit.LL$lpFit$Y, col="green", lwd=2); lines(x0, fit.LL2$lpFit$Y, col="blue", lwd=2); lines(x0, fit.LL3$lpFit$Y, col="darkblue", lwd=2); legend("bottomright", lty=rep(1, 4), col=c("red", "green", "blue", "darkblue"), legend=c("local constant (NW)", "local linear", "local quadratic", "local cubic"), lwd=rep(2,4)) ### # # Function Lowess, see Section 10.6 # ### ?lowess # # Lowess - smoothing fit.lowess <- lowess(X, Y); # The role of the bandwidth is played by parameter f = smoother span # By default f=2/3 plot(mcycle, main="f=2/3"); lines(fit.lowess, lwd=2); # The default values of 'the smoother span' is too large. plot(mcycle); lines(lowess(X, Y, f=1/2), col="red", lwd=2); lines(lowess(X, Y, f=1/3), col="orange", lwd=2); lines(lowess(X, Y, f=1/4), col="yellow", lwd=2); lines(lowess(X, Y, f=1/5), col="green", lwd=2); lines(lowess(X, Y, f=1/6), col="blue", lwd=2); legend("bottomright", lty=rep(1, 6), col=c("red", "orange", "yellow", "green", "blue"), legend=paste("f=", round(1/(2:6),3)), lwd=rep(2,6)); # iter=0 vs iter=3 plot(mcycle, main="Lowess iter=0 vs. iter=3, f=1/4"); lines(lowess(X, Y, f=1/4, iter=0), col="red", lwd=2); # local linear fit with nearest neighbour bandwidth choice lines(lowess(X, Y, f=1/4), col="blue", lwd=2); # robustified local linear fit with nearest neighbour bandwidth choice legend("bottomright", lty=rep(1, 6), col=c("red", "blue"), legend=c("Lowess - iter = 0", "Lowess - iter = 3"), lwd=rep(2,2)); # # Adding several outliers set.seed(1234567) # outliers <- cbind(sample(X, size=5), sample(Y, size=5)); X1 <- c(X, 2+runif(5)*55); Y1 <- c(Y, -134+24*runif(5)); n <- length(Y); # original sample size plot(mcycle, main="Lowess iter=0 vs. iter=3, f=1/4"); points(X1[(n+1):(n+5)], Y1[(n+1):(n+5)], col="darkgreen", pch=4, cex=2, lwd=4); lines(lowess(X1, Y1, f=1/4, iter=0), col="red", lwd=2); # local linear fit with nearest neighbour bandwidth choice lines(lowess(X1, Y1, f=1/4, iter=3), col="blue", lwd=2); # robustified local linear fit with nearest neighbour bandwidth choice legend("topright", lty=rep(1, 6), col=c("red", "blue"), legend=c("Lowess - iter = 0", "Lowess - iter = 3"), lwd=rep(2,2)); #### # # Section 10.5.4: An illustration how the nearest-neighbour type # bandwidth changes with x # #### n.x0 <- length(x0); bw <- numeric(n.x0); k0 <- floor(1/4*n) for(i in 1:n.x0){ bw[i] <- sort(abs(x0[i]-X))[k0]; } par(mfrow=c(1,2)); plot(x0, bw, type="l", lwd=2, main=paste("Nearest neighbour bandwidth, k=n/4")) rug(X); # plot(density(X), xlim=range(x0), type="l", lwd=2, main="Density of the covariates") rug(X); par(mfrow=c(1,1)); ### # # Section 10.7: Conditional variance estimation # ### bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(1,15)); fit.NW <- locCteSmootherC(x=X, y=Y, xeval=X, kernel=EpaK, bw=bw.CV); fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=X, kernel=EpaK, bw=bw.CV); weights.NW <- locCteWeightsC(x=X, xeval=x0, kernel=EpaK, bw=2*bw.CV)$locWeig; weights.LL <- locLinWeightsC(x=X, xeval=x0, kernel=EpaK, bw=2*bw.CV)$locWeig; sigma0.NW <- weights.NW%*%(Y^2) - (weights.NW%*%Y)^2; sigma1.NW <- weights.NW%*%(Y-fit.NW$beta0)^2; sigma0.LL <- weights.LL%*%(Y^2) - (weights.LL%*%Y)^2; sigma1.LL <- weights.LL%*%(Y-fitted(fit.LL))^2; table(sigma0.NW < 0); table(sigma1.NW < 0); table(sigma0.LL < 0); table(sigma1.LL < 0); plot(x0, sqrt(sigma0.NW), ylim=range(mcycle), type="l", ylab="", main="Conditional standard deviation estimation", lwd=2); lines(x0, sqrt(sigma1.NW), lwd=2, lty="dashed"); lines(x0, sqrt(pmax(sigma0.LL,0)), col="blue", lwd=2); lines(x0, sqrt(pmax(sigma1.LL,0)), col="blue", lwd=2, lty="dashed"); abline(h=0, lty="dotted") points(mcycle); # legend(40,-70, lty=c("solid", "dashed", "solid", "dashed"), lwd=c(2,2,2,2), col=c("black", "black", "blue", "blue"), legend=c("NW - direct", "NW - modified", "LL - direct", "LL - modified")); ### # # Example: GAGurine # ### data(GAGurine); # Data were collected on the concentration of a chemical GAG in the # urine of 314 children aged from zero to seventeen years. The aim # of the study was to produce a chart to help a paediatrican to # assess if a child's GAG concentration is ''normal''. # Age: age of a child in years. # GAG: concentration of GAG (the units have been lost). plot(GAGurine); X <- GAGurine$Age; Y <- GAGurine$GAG; x0 <- seq(min(X), max(X), by=0.01); # Local linear estimator p = 1 fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK); summary(fit.LL); (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(1,15))); (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(8, X, Y, COLOR="green"); legend("topright", lty=rep(1, 6), col=c("blue", "red", "green"), lwd=2, legend=c("Local linear", "local constant (NW)", "polynomial of order 8")); # op = par(mfrow=c(1,1)) plot(fit.LL); par(op) # plot(X, Y); lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2); lines(lowess(X, Y, f=1/4, iter=0), col="green", lwd=2); lines(lowess(X, Y, f=1/4), col="darkgreen", lwd=2); lines(lowess(X, Y, f=2*bw.CV/(max(X)-min(X)), iter=0), col="red", lwd=2); 2*bw.CV/(max(X)-min(X)) # equivalent span, # if the desing points were uniformly distributed legend("topright", lty=rep(1, 6), col=c("blue", "green", "darkgreen", "red"), legend=c("Local linear global bw", "Lowess - iter = 0, f=1/4", "Lowess - iter = 3, f=1/3", paste("Lowess - iter = 0, f=", round(2*bw.CV/(max(X)-min(X)), 3)))); ### # # Simulated example 1 # ### n <- 200; set.seed(1234567); X <- -2+4*sort(runif(n)); # random design Uniform(-2,2) # X <- seq(-2, 2, length=n); # regular fixed design cond.mean <- function(x) x + 2*exp(-16*x^2); mX <- cond.mean(X); sigma <- 0.4; eps <- rnorm(n); Y <- mX + sigma*eps; plot(X, Y) x0 <- seq(-2, 2, length=1000); # Local linear estimator p = 1 (fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK)); (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(0.1,4))); (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y) lines(x0, cond.mean(x0), lwd=3) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="green"); lines(lowess(X, Y, f=2*fit.LL$bw/(max(X)-min(X)), iter=0), col="darkgrey", lwd=2) legend("bottomright", lty=rep(1, 4), lwd=rep(2, 4), col=c("black", "blue", "red", "green", "darkgrey"), legend=c("True mean function", "Local linear", "Local constant (NW)", "polynomial of order 12", "lowess iter=0")); ### # # Simulated example 2 # ### n <- 200; set.seed(1234567); X <- sqrt(1/2)*sort(rnorm(n)); # random design N(0,1/2) # X <- seq(-2, 2, length=n); # regular fixed design cond.mean <- function(x) sin(2*x) + 2*exp(-16*x^2); mX <- cond.mean(X); sigma <- 0.3; eps <- rnorm(n); Y <- mX + sigma*eps; plot(X, Y) x0 <- seq(min(X), max(X), length=1000); # Local linear estimator p = 1 (fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK)); (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(0.1,4))); (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit temp <- fit.pol(12, X, Y, draw=FALSE); plot(X, Y, ylim=range(Y, fit.LL$lpFit$Y, fit.NW$beta0)) lines(x0, cond.mean(x0), lwd=2) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="green"); lines(lowess(X, Y, f=2*fit.LL$bw/(max(X)-min(X)), iter=0), col="darkgrey", lwd=2) legend(0, -0.5, lty=rep(1, 4), lwd=rep(2, 4), col=c("black", "blue", "red", "green", "darkgrey"), legend=c("True mean function", "Local linear", "Local constant (NW)", "polynomial of order 12", "lowess iter=0")); ### # # Simulated example 3 # ### n <- 200; set.seed(1234567); X <- sqrt(1/2)*sort(rnorm(n)); # random design N(0,1/2) # X <- -2+4*sort(runif(n)); # random design Uniform(-2,2) # X <- seq(-2, 2, length=n); # regular fixed design cond.mean <- function(x) 0.3*exp(-4*(x+1)^2) + 0.7*exp(-16*(x-1)^2); mX <- cond.mean(X); sigma <- 0.1; eps <- rnorm(n); Y <- mX + sigma*eps; plot(X, Y) # x0 <- seq(-2, 2, length=1000); x0 <- seq(min(X), max(X), length=1000); # Local linear estimator p = 1 (fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK)); (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(0.1,4))); (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y) lines(x0, cond.mean(x0), lwd=2) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="green"); lines(lowess(X, Y, f=2*fit.LL$bw/(max(X)-min(X)), iter=0), col="pink", lwd=2) legend(-1.5, 0.75, lty=rep(1, 4), lwd=rep(2, 4), col=c("black", "blue", "red", "green", "pink"), legend=c("True mean function", "Local linear", "Local constant (NW)", "polynomial of order 12", "lowess iter=0")); ### # # Simulated example 4 # ### n <- 200; set.seed(1234567); X <- sqrt(1/5)*sort(abs(rnorm(n))); # random design N(0,1/5) # X <- sort(runif(n)); # random design Uniform(0,1) # X <- seq(-2, 2, length=n); # regular fixed design cond.mean <- function(x) 200 * (exp(-9.75*x) - 4*exp(-19.5*x) + 3*exp(-29.25*x)); mX <- cond.mean(X); sigma <- 5; eps <- rnorm(n); Y <- mX + sigma*eps; plot(X, Y) # x0 <- seq(0, 1, length=1000); x0 <- seq(min(X), max(X), length=1000); # Local linear estimator p = 1 (fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK)); (bw.CV <- regCVBwSelC(X, Y, deg=1, kernel=EpaK, interval=c(0.01,4))); (bw.ROT <- thumbBw(X, Y, deg=1, kernel=EpaK)) # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y) lines(x0, cond.mean(x0), lwd=3) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="green"); lines(lowess(X, Y, f=fit.LL$bw/(max(X)-min(X)), iter=0), col="darkgrey", lwd=2) legend(0.6, -30, lty=rep(1, 4), lwd=rep(2, 4), col=c("black", "blue", "red", "green", "darkgrey"), legend=c("True mean function", "Local linear", "Local constant (NW)", "polynomial of order 12", "lowess iter=0")); # Once more with hand chosen bandwidths (fit.LL <- locpol(Y~X, deg=1, data=data.frame(X,Y), xeval=x0, kernel=EpaK, bw=0.1)); # Local linear estimator p = 0 (Nadaraya-Watson) fit.NW <- locCteSmootherC(x=X, y=Y, xeval=x0, kernel=EpaK, bw=fit.LL$bw); # Nonparametric fit plot(X, Y) lines(x0, cond.mean(x0), lwd=3) lines(x0, fit.LL$lpFit$Y, col="blue", lwd=2) lines(x0, fit.NW$beta0, col="red", lwd=2) fit.pol(12, X, Y, COLOR="green"); lines(lowess(X, Y, f=2*fit.LL$bw/(max(X)-min(X)), iter=0), col="darkgrey", lwd=2) legend(0.6, -30, lty=rep(1, 4), lwd=rep(2, 4), col=c("black", "blue", "red", "green", "darkgrey"), legend=c("True mean function", "Local linear", "Local constant (NW)", "polynomial of order 12", "lowess iter=0")); ### # # Calculation of LOWESS # ### set.seed(1234567) X <- sort(rnorm(n)); Y <- sin(X) + rnorm(n)/10; # lp0 <- numeric(n); lp1 <- numeric(n); bw <- numeric(n); k0 <- floor(2/3*n) K <- function(x) (1 - (abs(x))^3)^3 * (abs(x) < 1); # kernel for the local weights B <- function(x) (1 - x^2)^2 * (abs(x) < 1); # kernel for weighting residuals for(i in 1:n){ bw[i] <- sort(abs(X[i]-X))[k0]; # local linear fit p=1 lp0[i] <- lm(Y~1 + I(X-X[i]), weights=K((X-X[i])/bw[i]))$coef[1]; } n.iter <- 3; res <- Y - lp0; for (j in 1:n.iter){ delta <- B(res/(6*median(abs(res)))); for(i in 1:n){ lp1[i] <- lm(Y~1 + I(X-X[i]), weights=K((X-X[i])/bw[i])*delta)$coef[1]; } res <- Y - lp1; } par(mfrow=c(1,2)) plot(X, Y) # Lowess fit0 <- lowess(X, Y, iter=0, delta=0); fit1 <- lowess(X, Y, delta=0); # # Lowess R - "our implementation of lowess" -- initial. fits summary(fit0$y-lp0) # # Lowess R - "our implementation of lowess" -- after 3 iterations summary(fit1$y-lp1) lines(X, sin(X)) lines(fit0, col="red", lwd=2); lines(fit1, col="blue", lwd=2, lty="dashed"); legend("bottomright", cex=.8, lty=c("solid", "dashed"), lwd=rep(2, 2), col=c("red", "blue"), legend=c("Lowess iter=0", "Lowess iter=3")); # plot(fit0$x, fit0$y-lp0, type="l", lwd=2, ylim=range(fit0$y-lp0, fit1$y-lp1), cex.main = .8, main="Difference of lowess function and hand calculations", col="red", xlab="x", ylab="Difference"); lines(fit0$x, fit1$y-lp1, lty="dashed", lwd=2, col="blue") legend("bottomright", cex=.8, lty=c("solid", "dashed"), lwd=rep(2, 2), col=c("red", "blue"), legend=c("Lowess iter=0 - Lowess iter=0", "Lowess iter=3 - Lowess iter=3")); par(mfrow=c(1,1))