Inverse Propensity Score Weighting (IPW)

Recall that the stabilized IPW estimator for the ATE is \[ \hat{\rho}_{IPW} = \frac{\sum_{i=1}^n \frac{D_i}{\hat{e}(X_i)}Y_i}{\sum_{i=1}^n \frac{D_i}{\hat{e}(X_i)}} - \frac{\sum_{i=1}^n \frac{1-D_i}{1-\hat{e}(X_i)}Y_i}{\sum_{i=1}^n \frac{1-D_i}{1-\hat{e}(X_i)}}, \] and that this can be implementing with weighted least squares regression in which you regress \(Y\) on \(D\), using the inverse of the estimate propensity score \(\hat{e}(X_i)\) as the weights for the treated and the inverse of \(1-\hat{e}(X_i)\) as the weights for the control.

Inference for \(\hat{\rho}_{IPW}\) should account for the sampling variation in the estimated propensity scores, \(\hat{e}(\cdot)\). In the Robust Inference I lecture notes, we showed how to derived the variance when \(\hat{e}(\cdot)\) is estimated using an \(M\)-estimator. Here we will use the bootstrap instead.

Make a simulated population with confounding

We will simulate a population with confounding in \(X\). We do it in a way that makes it so that simple control specifications for \(X\) in a regression would be inadequate, and so the specification challenge is a little harder than usual:

# Make population data
set.seed(11)
N.pop <- 10000
index <- 1:N.pop
X <- .5*exp(rnorm(N.pop))
Y0 <- 50-.05*X^2+(1+.5*(max(X)-X))*rnorm(N.pop)
Y1 <- -1.5 + Y0 + 10*X + .2*X^2+ .25*X*rnorm(N.pop)
e.X <- (1+exp(1-.2*X))^(-1)
D <- rbinom(N.pop, 1, e.X)
Y <- D*Y1 + (1-D)*Y0
rho <- mean(Y1-Y0)

plot(X, Y0, col="blue", pch=19, 
     cex=.25, ylim=range(c(Y1,Y0)),
     main="Potential outcomes")
points(X, Y1, col="red", pch=19, cex=.25)

plot(X, e.X, pch=19, main="Propensity scores")

plot(X, Y, type="n", ,
     main="Realized outcomes")
points(X[D==0], Y[D==0], col="blue", pch=19, cex=.25)
points(X[D==1], Y[D==1], col="red", pch=19, cex=.25)

rho
## [1] 7.124273
summary(lm(Y~D))[[4]][,1]
## (Intercept)           D 
##   49.800748    9.237881
summary(lm(Y~D+X))[[4]][,1]
## (Intercept)           D           X 
##   45.379997    7.790128    5.881640
summary(lm(Y~D*I(X-mean(X))))[[4]][,1]
##      (Intercept)                D   I(X - mean(X)) D:I(X - mean(X)) 
##       49.7601797        7.3606239       -0.5394921       11.7581545
pop.data <- data.frame(index, Y, D, X)

Sampling process

Next we draw a sample and estimate the effect using simple regression strategies and then IPW:

# Draw a sample:
n.samp <- 1000

samp.i <- sample(index, n.samp)
samp.data <- pop.data[samp.i,]
plot(samp.data$X, samp.data$Y, type="n")
points(samp.data$X[D==0], samp.data$Y[D==0], col="blue")
points(samp.data$X[D==1], samp.data$Y[D==1], col="red")

summary(lm(Y~D, data=samp.data))[[4]][,1]
## (Intercept)           D 
##    49.18994    10.72004
summary(lm(Y~D+X, data=samp.data))[[4]][,1]
## (Intercept)           D           X 
##   44.331840    8.891598    6.449878
summary(lm(Y~D*I(X-mean(X)), data=samp.data))[[4]][,1]
##      (Intercept)                D   I(X - mean(X)) D:I(X - mean(X)) 
##       49.1173279        8.5252179       -0.8426236       12.3346502
e.hat.X <- predict(glm(D~X, data=samp.data, 
                        family="binomial"), 
                        type="response")

samp.data$w <- samp.data$D*(1/e.hat.X) + (1-samp.data$D)*(1/(1-e.hat.X))

plot(samp.data$X, samp.data$Y, type="n")
points(samp.data$X[D==0], samp.data$Y[D==0], 
       cex=.5*samp.data$w[D==0],col="blue")
points(samp.data$X[D==1], samp.data$Y[D==1], 
       cex=.5*samp.data$w[D==1],col="red")

Fitting IPW model

fit.ipsw.s <- lm(Y~D, weights=samp.data$w, data=samp.data)
summary(fit.ipsw.s)[[5]][,1]
## (Intercept)           D 
##   49.114690    8.478966

Estimating the standard error

# Get bootstrap estimate
n.boot <- 500
ate.hat <- rep(NA, n.boot)
t.out <- rep(NA, n.boot)

for (i in 1:n.boot){
    boot.index <- sample(samp.data$index, n.samp, replace=T)
    boot.data <- samp.data[match(boot.index, samp.data$index),]     
    e.hat.boot <- predict(glm(D~X, data=boot.data, 
                            family="binomial"), 
                            type="response")
boot.data$w <- boot.data$D*(1/e.hat.boot)/sum(boot.data$D*(1/e.hat.boot)) + (1-boot.data$D)*(1/(1-e.hat.boot))/sum((1-boot.data$D)*(1/(1-e.hat.boot)))
    fit.ipsw.b <- lm(Y~D, weights=boot.data$w, data=boot.data) 
    ate.hat[i] <- coef(fit.ipsw.b)[2]
    t.out[i] <- coeftest(fit.ipsw.b , vcov.= vcovHC(fit.ipsw.b, "HC2"))[2,3]
    cat(paste(i, " ")); flush.console() # progress indicator
}
## 1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247  248  249  250  251  252  253  254  255  256  257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372  373  374  375  376  377  378  379  380  381  382  383  384  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400  401  402  403  404  405  406  407  408  409  410  411  412  413  414  415  416  417  418  419  420  421  422  423  424  425  426  427  428  429  430  431  432  433  434  435  436  437  438  439  440  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455  456  457  458  459  460  461  462  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480  481  482  483  484  485  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500
hist(ate.hat, breaks=50)
abline(v=coef(fit.ipsw.s)[2], col="blue")
abline(v=rho, col="red")

aCI.out <- coeftest(fit.ipsw.s , vcov.= vcovHC(fit.ipsw.s, "HC2"))

# Naive analytical asymptotic CI ignoring pscore estimation
aaCI <- c(aCI.out[2,1]-qt(.975, fit.ipsw.s[[9]])*aCI.out[2,2], aCI.out[2,1]+qt(.975, fit.ipsw.s[[9]])*aCI.out[2,2])
# Bootstrap-b CI
bbCI <- quantile(ate.hat, c(0.025, .975))
# Bootstrap-t CI
btCI <- aCI.out[2,2]*quantile(t.out, c(0.025, .975))

coef(fit.ipsw.s)[2]
##        D 
## 8.478966
aaCI
## [1]  6.520593 10.437340
bbCI
##      2.5%     97.5% 
##  6.614625 10.355653
btCI
##      2.5%     97.5% 
##  6.580765 10.348629

Comparing to the true sampling distribution

n.iter <- 500
ate.hat.s <- rep(NA, n.iter)
t.out.s <- rep(NA, n.iter)

for(j in 1:n.iter){
    samp.i <- sample(index, n.samp)
    samp.data <- pop.data[samp.i,]
    e.hat.X <- predict(glm(D~X, data=samp.data, 
                        family="binomial"), 
                        type="response")
    samp.data$w <- samp.data$D*(1/e.hat.X) + (1-samp.data$D)*(1/(1-e.hat.X))
    fit.ipsw <- lm(Y~D, weights=samp.data$w, data=samp.data)
    ate.hat.s[j] <- coef(fit.ipsw)[2]
    t.out.s[j] <- coeftest(fit.ipsw , vcov.= vcovHC(fit.ipsw, "HC2"))[2,3]
    cat(paste(j, " ")); flush.console() # progress indicator
}
## 1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230  231  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247  248  249  250  251  252  253  254  255  256  257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272  273  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288  289  290  291  292  293  294  295  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315  316  317  318  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372  373  374  375  376  377  378  379  380  381  382  383  384  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399  400  401  402  403  404  405  406  407  408  409  410  411  412  413  414  415  416  417  418  419  420  421  422  423  424  425  426  427  428  429  430  431  432  433  434  435  436  437  438  439  440  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455  456  457  458  459  460  461  462  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480  481  482  483  484  485  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500
# True coef mean
mean(ate.hat.s)
## [1] 7.405926
# Estimate
coef(fit.ipsw.s)[2]
##        D 
## 8.478966
# True sampling sd
sd(ate.hat.s)
## [1] 0.7989762
# Estimates
#  Naive analytical
aCI.out[2,2]
## [1] 0.9979767
#  bootstrap-b
sd(ate.hat)
## [1] 0.9788662
# True t stat dist
plot(density(t.out.s-mean(t.out.s)), 
     main="Sampling distribution",
     xlim=c(-4,6))
# Estimates:
#   Analytical
points( seq(-4,4,.01), 
        dt(seq(-4,4,.01), 
        df=fit.ipsw.s[[9]]), 
        type="l", 
        col="red")
#  Bootstrap-t
points(density(t.out-mean(t.out)), type="l",col="blue")
legend(2,.3, legend=c("True","Analytical","Boot-t"),
       col=c("black","red","blue"),
       lty=c("solid","solid","solid"),
       cex=.75)

Bootstrap failure

We will show the failure of the bootstrap for estimates of the maximum, using \(\max\{X_i\}\) for units \(i\) in the sample. This is a non-smooth statistic:

First, we can view our target, the population maximum, relative to what we get in our sample:

par(mfrow=c(1,1))
pophist <- hist(pop.data$X, breaks=100, freq=TRUE, main="Hist. of X")
hist(samp.data$X, add=TRUE, breaks=pophist$breaks, col="black")

max(pop.data$X)
## [1] 20.21729
max(samp.data$X)
## [1] 12.70117

Now look at the sampling distribution of \(\max\{X_i\}\):

max.s <- rep(NA, n.iter)

for(j in 1:n.iter){
    samp.i <- sample(index, n.samp)
    samp.data <- pop.data[samp.i,]
    max.s[j] <-  max(samp.data$X)
}

hist(max.s, freq=F, breaks=pophist$breaks)

Now look at how the bootstrap attempts to approximate this:

max.b <- rep(NA, n.iter)

for (i in 1:n.boot){
    boot.index <- sample(samp.data$index, n.samp, replace=T)
    boot.data <- samp.data[match(boot.index, samp.data$index),]     
    max.b[i] <-  max(boot.data$X)
}   

hist(max.b, freq=F, breaks=pophist$breaks)

sd(max.s)
## [1] 3.907458
sd(max.b)
## [1] 0.6836264