########################################################################################### # code for analyzing doubly censored data with known left censoring time # ########################################################################################### ############### functions to be used in the estimation and inference procedure ############ H <- function(x){ y <- -log(1-x) return(y) } weightf0 <- function(v,l,u){ weight <- rep(1,length(v)) return(weight) } weightf1 <- function(v,l,u){ weight <- v>=(l+u)/2 return(weight) } test.stat <- function(bb,taugrid,weightf,l,u){ a <- tail(which(taugrid<=l),1) b <- which(taugrid>=u)[1] v1 <- c(taugrid[(a+1):(b-1)],u) v2 <- c(l,taugrid[(a+1):(b-1)]) vv <- v1-v2 weight <- weightf(v2,l,u) stat <- t(bb[a:(b-1)]*weight)%*%vv return(stat) } test.m <- function(m,taugrid,weightf,l,u){ ln <- dim(m)[1] p <- dim(m)[2] tt <- rep(NA,p) eff <- rep(NA,p) for (i in 1:p) { mtemp <- m[,i] tt[i] <- test.stat(mtemp,taugrid,weightf,l,u) } return(tt) } getp <- function(stat,statstar){ stat.se <- sqrt(apply(statstar,2,var,na.rm=T)) ptemp1 <- pnorm(stat/stat.se) ptemp2 <- 1-ptemp1 AA <- array(NA,c(1,p,2)) AA[,,1] <- ptemp1 AA[,,2] <- ptemp2 pval <- 2*apply(AA,c(1,2),min) return(list(pval,stat.se)) } ####################### function that returns the point estimation ################## # dat = data set to analyze # t0 = truncation time for the conditional version of the quantile regression model; # to fit the unconditional version, simply set t0 = 0. # w = the perturbation random variables, used for the resampling-based inference # procedure; for the point estimation, it is set to be a vector of 1. # g = link function ##################################################################################### sol.LK <- function(dat,t0,w,g,g.inv){ II <- dat$X>t0 X <- dat$X-t0 L <- dat$L-t0 Delta <- dat$delta Z <- (as.matrix(cbind(rep(1,n),dat[,!names(dat) %in% c("X","L","delta")])))*II g.inv.X <- rep(0,n) g.inv.X[X>0] <- g.inv(X[X>0]) bhat <- array(NA,c(Ln,p)) Y <- (Delta==1)*g.inv.X*w W0 <- (Delta==1)*Z*w Y[n+1] <- Y[n+2] <- R W1 <- -apply(W0,2,sum) j <- 1 while (j<=Ln) { temp <- rep(0,n) quant <- rep(0,n) for (k in 1:j) { if (k>1) quant <- g(Z%*%bhat[k-1,]) temp <- temp+(L<=quant)*(X>quant)*(H(taugrid[k])-ifelse(k==1,0,H(taugrid[k-1]))) } W2 <- 2*apply(w*Z*as.vector(temp),2,sum) W <- rbind(W0,W1,W2) bhat[j,]<- rq(Y~W-1)$coefficients # check if the solution did converge by examining the difference of estimation at # two consecutive quantiles if (j>=2) { if (sum(abs(bhat[j,]-bhat[j-1,])>M)>0) { bhat[j,] <- rep(NA,p) break } } j <- j+1 } return(bhat) } #################### function that returns the analysis results ################### # dat = data matrix that contains the dataset to analyze; should only include: # X: observed times (failure events or censored events) # L: left censoring times # delta: censoring indicator (1: failure; 2: left censored; 3: right censored) # Z: covariate matrix # p0 = quantile of t0 among the observed time X. For the uncondtional model p0=0 # l = lower bound of tau for the 2nd-stage inference (e.g., 0.1) # u = upper bound of tau for the 2nd-stage inference (e.g., 0.7) # B = number of resamples for the inference procedure (e.g., 500) # g = link function: Q(tau|Z)=g{Z*beta(tau)} # g.inv = inverse function of g # graph = flag for plotting the regression coefficients (1=Yes) ################################################################################## data.est <- function(dat,p0,l,u,B,g,g.inv,graph){ ################## get the point estimates of beta(tau) ##################### t0 <- ifelse(p0==0,0,quantile(dat$X,p0)) bhat <- sol.LK(dat,t0,rep(1,n),g,g.inv) ################### start the resampling-based inference procedure ########## bstar <- ddstar <- array(NA,c(Ln,p,B)) gamma1.star <- gamma2.star <- effavg.star <- array(NA,c(B,p)) for (r in 1:B) { set.seed(r) zeta <- rexp(n,1) bstar[,,r] <- sol.LK(dat,t0,zeta,g,g.inv) gamma1.star[r,] <- test.m(bstar[,,r],taugrid,weightf0,l,u) effavg.star[r,] <- gamma1.star[r,]/(u-l) ddstar[,,r] <- bstar[,,r]-rep(1,Ln)%*%t(effavg.star[r,]) gamma2.star[r,] <- test.m(ddstar[,,r],taugrid,weightf1,l,u) if (r %in% c(B*(1:10)/10) ) { print(paste(100*r/B,"% of the resampling is completed")) } } ############## variance estimates for beta(tau) from resampling ############ bhat.var <- apply(bstar,c(1,2),var,na.rm=T) bhat.se <- sqrt(bhat.var) ############# start testing for overall significance and constancy ######### gamma1 <- test.m(bhat,taugrid,weightf0,l,u) # test stat for overall significance gamma1.se <- getp(gamma1,gamma1.star)[[2]] # se of gamma1 gamma1.p <- getp(gamma1,gamma1.star)[[1]] # p-value for the overall significance eff.avg <- gamma1/(u-l) # average covariate effect eff.avg.se <- gamma1.se/sqrt(u-l) # se of average covariate effect dd <- bhat-rep(1,Ln)%*%t(eff.avg) gamma2 <- test.m(dd,taugrid,weightf1,l,u) # test stat for constancy gamma2.se <- getp(gamma2,gamma2.star)[[2]] # se of gamma2 gamma2.p <- getp(gamma2,gamma2.star)[[1]] # p-value for the contancy ######################## plot the estimated regression quantiles ########### ### all coefficients are plotted on the same panel, with 3 covariate per row if (graph==1) { l.indx <- l*100 u.indx <- u*100 xx <- taugrid[l.indx:u.indx] yy <- bhat[l.indx:u.indx,] yylow <- yy-1.96*bhat.se[l.indx:u.indx,] yyup <- yy+1.96*bhat.se[l.indx:u.indx,] xxname <- c("Intercept",paste("Z",1:(p-1))) layout(t(array(c(1:(ceiling(p/3)*3)),c(3,ceiling(p/3))))) for (i in 1:p) { ylim.low <- min(yylow[,i])-0.1 ylim.up <- max(yyup[,i])+0.1 plot(xx,yy[,i],xlab="tau",ylab="regression quantile",ylim=c(ylim.low,ylim.up),type="s") points(xx,yylow[,i],pch=20) points(xx,yyup[,i],pch=20) abline(h=0,col="grey") title(xxname[i]) } } return(list(coeff=bhat,coeff.se=bhat.se,avg.effect=list(avg=eff.avg,avg.se=eff.avg.se)# test.sig=list(test.stat=gamma1,test.se=gamma1.se,p.val=gamma1.p), # test.const=list(test.stat=gamma2,test.se=gamma2.se,p.val=gamma2.p))) } ################################################################################### # a data example # ################################################################################### ############################ structure of the data ############################### # columns of covariates ( "z1" and "z2" in the example), and # 3 columns of the time variables, which must be named as follows: # "X": observed time; # "L": left censoring time; # "delta": censoring indicator (1=failure, 2=left censored, 3=right censored) ################################################################################## dat <- read.csv("F:/Double Censoring/code_for_analysis/data_example.csv",sep=",") R <<- 1e+8 M <<- 10 taugrid <<- seq(0.01,0.8,0.01) # grid of tau Ln <<- length(taugrid) # size of the grid n <<- dim(dat)[1] Z <- cbind(rep(1,n),dat[,!names(dat) %in% c("X","L","delta")]) p <<- dim(Z)[2] p0 <- 0 # fit the unconditional model l <- 0.1 # lower bound of tau of interest u <- 0.7 # upper bound of tau of interest B <- 500 # number if resamples for inference g <- exp # link function g.inv <- log # inverse function of g graph <- 1 # a graph will be displayed res <- data.est(dat,p0,l,u,B,g,g.inv,graph) # data.est is the function for analysis res$coeff ### point estimate of beta(tau) res$ceoff.se ### se of beta(tau) res$avg.effect ### average covariate effects over the tau range [l,u] res$test.sig ### results for the overall significance test over the tau range [l,u] res$test.const ### results for the constancy test over the tau range [l,u]