# Load required libraries
library(tidyverse) # For data wrangling and ggplot2
library(ggplot2) # For visualization (included in tidyverse)
# Load the dataset
data <- read.csv("data.csv")
# Boxplot of outcome (Y) by treatment (Z)
# This helps compare the median, spread, and potential outliers between treated and control groups
data %>%
ggplot(aes(x = as.factor(z), y = y)) +
geom_boxplot() +
labs(title = "Boxplot of Outcome (Y) by Treatment Group",
x = "Treatment (Z)", y = "Outcome (Y)")
# Density plot of outcome (Y) by treatment (Z)
# This provides a smooth view of the distribution of Y for treated vs control groups
ggplot(data, aes(x = y, fill = as.factor(z))) +
geom_density(alpha = 0.4) + # alpha controls transparency
labs(title = "Distribution of Outcome (Y) by Treatment (Z)",
x = "Outcome (Y)", fill = "Treatment") +
theme_minimal() # cleaner theme for plots
Rightward shift in the treated group: the outcome distribution for z = 1 (treated) is slightly shifted to the right, indicating higher values of Y on average. Visually, the treated group also appears to have a higher median outcome and slightly greater spread than the control group. This points to a positive association between receiving the intervention and achievement, a good initial sign that the treatment might be effective, though this is only an unadjusted comparison.
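To back up the visual impression with numbers, here is a quick group summary (a sketch using the same data frame) comparing the mean, median, and spread of the outcome by treatment status:
# Numeric summary of the outcome by treatment group
data %>%
  group_by(z) %>%
  summarise(
    mean_y   = mean(y),
    median_y = median(y),
    sd_y     = sd(y)
  )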
Now let’s encode the categorical covariates as factors:
# Convert selected categorical variables to factor
data <- data %>%
  mutate(
    selfrpt = as.factor(selfrpt),
    race = as.factor(race),
    gender = as.factor(gender),
    fgen = as.factor(fgen),
    urban = as.factor(urban)
  )
Now let’s see how the outcome Y varies across these categorical covariates:
# Load patchwork to arrange ggplot figures
library(patchwork)
# Define categorical covariates of interest (all are student-level variables)
covariates <- c("selfrpt", "race", "gender", "fgen", "urban")
# Create an empty list to store plots
plots <- list()
# Loop over each categorical covariate to generate boxplots for treatment vs control
for (i in seq_along(covariates)) {
  # Current variable name
  var <- covariates[i]
  # Boxplot of Y by covariate for the treated group (z = 1)
  plot_treated <- data %>%
    filter(z == 1) %>%
    ggplot(aes(x = .data[[var]], y = y)) +
    geom_boxplot() +
    labs(title = paste(var, "- Treated (z = 1)")) +
    theme_minimal()
  # Boxplot of Y by covariate for the control group (z = 0)
  plot_control <- data %>%
    filter(z == 0) %>%
    ggplot(aes(x = .data[[var]], y = y)) +
    geom_boxplot() +
    labs(title = paste(var, "- Control (z = 0)")) +
    theme_minimal()
  # Combine treated and control plots vertically using patchwork
  plots[[i]] <- plot_treated / plot_control
}
# You can now view all plots with: wrap_plots(plots)
Higher self-reported expectations (selfrpt) appear to be associated with slightly higher achievement: students who report higher expectations tend to end up with higher outcomes. The patterns across the categorical covariates look similar in the treated and control groups, and urbanicity (urban) also shows some association with the outcome.
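To check the selfrpt pattern numerically rather than only from the boxplots, here is a short sketch computing the mean outcome at each self-reported expectation level within each treatment group:
# Mean outcome by self-reported expectations, separately for treated and control
data %>%
  group_by(z, selfrpt) %>%
  summarise(mean_y = mean(y), n = n(), .groups = "drop")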
# Load required libraries
library(ggplot2) # For general plotting
library(GGally) # For ggcorr correlation plot
# Select only numeric covariates for correlation analysis
numeric_covs <- data %>%
select(where(is.numeric), -z, -y) # Remove treatment (z) and outcome (y) from the correlation matrix
# Generate a correlation matrix plot
ggcorr(numeric_covs,
label = TRUE, # Display correlation coefficients on the plot
label_round = 2, # Round labels to 2 decimal places
label_size = 3, # Font size for labels
name = "Correlation" # Legend title
) +
ggtitle("Correlation Matrix of Numeric Covariates") # Plot title
| Pair | Correlation | Interpretation |
|------|-------------|----------------|
| mindset & test | -0.48 | Schools with lower fixed mindsets tend to have higher test scores, in line with the theory that growth mindsets are linked to better academic performance. |
| mindset & size | -0.49 | Larger schools tend to have more fixed mindsets, which may indicate school-culture effects. |
| test & sch_race | -0.48 | Schools with higher minority composition tend to have lower test scores. |
| test & pov | -0.42 | Higher poverty concentration is associated with lower test scores. |
| test & size | +0.48 | Larger schools tend to have higher test scores, possibly due to resource concentration or district effects. |
| mindset & sch_race | +0.42 | Schools with higher minority composition have higher fixed-mindset scores (i.e., lower growth mindset), suggesting cultural and structural differences in mindset environments. |
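The pairwise values in this table are read off the plot labels; the exact correlations can also be computed directly from the numeric_covs object defined above:
# Exact pairwise correlations among the numeric covariates
round(cor(numeric_covs), 2)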
# Group the data by treatment assignment (z) and calculate the mean of the outcome (y) within each group
treatement <- data %>%
group_by(z) %>%
summarise(y_mean = mean(y))
# Calculate the crude (unadjusted) difference in means: treated group mean - control group mean
treatement_hat <- treatement$y_mean[2] - treatement$y_mean[1]
# Print the unadjusted estimate of the average treatment effect (ATE)
print(treatement_hat)
## [1] 0.4569075
# Calculate the standard deviation of outcome (y) separately for treated (z=1) and control (z=0) groups
treatement_sd <- data %>%
group_by(z) %>%
summarise(y_sd = sd(y)) %>%
ungroup()
# Create separate datasets for treated and control groups
z_0 <- data %>% filter(z == 0) # Control group
z_1 <- data %>% filter(z == 1) # Treated group
# Compute the standard error for the difference in means (Neyman-style, unpooled variance)
# Formula: sqrt( sd_treated^2 / n_treated + sd_control^2 / n_control )
sd <- sqrt((treatement_sd$y_sd[2])^2 / nrow(z_1) +
             (treatement_sd$y_sd[1])^2 / nrow(z_0))
# Compute the z-statistic: (difference in means) / (standard error)
treatement_sd_hat <- treatement_hat / sd
# Print the z-statistic
print(treatement_sd_hat)
## [1] 28.68164
# Compute the 95% Confidence Interval (CI) for the unadjusted difference in means (treatment effect)
# Using the standard normal critical value of 1.96 for 95% confidence
ci_lower <- treatement_hat - 1.96 * sd # Lower bound of the CI
ci_upper <- treatement_hat + 1.96 * sd # Upper bound of the CI
# Display the confidence interval as a numeric vector
c(ci_lower, ci_upper)
## [1] 0.4256841 0.4881309
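As a cross-check of the hand-computed standard error and interval, Welch's two-sample t-test should give a very similar result (note that t.test with a formula reports the control-minus-treated difference, so the sign of the estimate is flipped):
# Cross-check: Welch two-sample t-test of the outcome by treatment group
t.test(y ~ z, data = data)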
# Load the cobalt package for covariate balance diagnostics
library(cobalt)
# Define the covariates to assess balance on
covariates <- c("selfrpt", "race", "gender", "fgen", "urban",
"mindset", "test", "sch_race", "pov", "size")
# Construct a formula for the treatment assignment model
# This is used to check balance of covariates between treatment (z = 1) and control (z = 0)
fml <- as.formula(paste("z ~", paste(covariates, collapse = " + ")))
# Create a balance table WITHOUT any weighting or matching yet
# This reflects the raw, unadjusted differences in covariates
bal <- bal.tab(fml, data = data, estimand = "ATE")
# Create a Love plot to visualize standardized mean differences
# Red points show the degree of imbalance for each covariate
love.plot(bal,
  threshold = 0.05, # Draw a vertical reference line at |0.05| as the imbalance threshold
  abs = TRUE, # Show absolute standardized differences
  var.order = "unadjusted", # Order covariates by their unadjusted standardized differences
  stars = "raw", # Star covariates whose mean differences are displayed on the raw (unstandardized) scale
  colors = c("red"), # Use red color for visual emphasis
  title = "Covariate Balance Before Adjustment")
# Create a Love plot without weights using raw covariate values
# This visualizes the standardized mean differences between treatment groups
love.plot(
x = subset(data, select = c(selfrpt, race, gender, fgen, urban,
mindset, test, sch_race, pov, size)), # Covariates of interest
treat = data$z, # Treatment indicator variable
abs = TRUE # Plot absolute standardized differences
) +
theme(legend.position = "none") # Remove legend for cleaner visualization
# Generate a balance table (summary of covariate differences between treatment groups)
# using the raw covariate values and the treatment indicator
bal.tab(
x = subset(data, select = c(selfrpt, race, gender, fgen, urban,
mindset, test, sch_race, pov, size)), # Covariates of interest
treat = data$z # Binary treatment indicator (0 = control, 1 = treated)
)
## Balance Measures
## Type Diff.Un
## selfrpt_1 Binary -0.0022
## selfrpt_2 Binary -0.0049
## selfrpt_3 Binary -0.0066
## selfrpt_4 Binary -0.0252
## selfrpt_5 Binary -0.0243
## selfrpt_6 Binary 0.0477
## selfrpt_7 Binary 0.0155
## race_1 Binary -0.0115
## race_2 Binary -0.0004
## race_3 Binary -0.0017
## race_4 Binary 0.0184
## race_5 Binary 0.0027
## race_6 Binary 0.0022
## race_7 Binary 0.0013
## race_8 Binary 0.0033
## race_9 Binary -0.0035
## race_10 Binary 0.0016
## race_11 Binary -0.0015
## race_12 Binary -0.0021
## race_13 Binary -0.0063
## race_14 Binary -0.0041
## race_15 Binary 0.0015
## gender_2 Binary -0.0272
## fgen Binary -0.0465
## urban_0 Binary 0.0021
## urban_1 Binary -0.0122
## urban_2 Binary 0.0138
## urban_3 Binary -0.0165
## urban_4 Binary 0.0128
## mindset Contin. -0.0984
## test Contin. 0.0597
## sch_race Contin. -0.0048
## pov Contin. -0.0249
## size Contin. 0.0729
##
## Sample sizes
## Control Treated
## All 7007 3384
# Define the categorical variables to be converted to numeric for histogram plotting
vars_to_convert <- c("selfrpt", "race", "gender", "fgen", "urban")
# Convert factor variables to numeric using mutate + across
# This is needed because ggplot2 histograms require numeric inputs
data <- data %>%
mutate(across(all_of(vars_to_convert), ~ as.numeric(as.character(.))))
# List of variables to include in histogram plots (all now numeric)
hist_covariates <- c("selfrpt", "race", "gender", "fgen", "urban")
# Create an empty list to store histogram plots
hist_plots <- list()
# Loop through each covariate and create a histogram split by treatment group (z)
for (i in seq_along(hist_covariates)) {
  var <- hist_covariates[i]
  # Create histogram for the current covariate, colored by treatment status
  hist_plots[[i]] <- ggplot(data, aes(x = .data[[var]], fill = as.factor(z))) +
    geom_histogram(position = "dodge", bins = 7, color = "black") + # separate bars by group
    scale_fill_manual(
      values = c("gray70", "skyblue"), # custom colors
      name = "Treatment Group",
      labels = c("Control", "Treatment")
    ) +
    labs(
      title = paste("Distribution of", var, "by Treatment Group"),
      x = var,
      y = "Count"
    ) +
    theme_minimal()
}
# Display all histograms together using patchwork
wrap_plots(hist_plots)
# Load patchwork to combine multiple ggplot2 plots
library(patchwork)
# List of continuous school-level covariates for comparison by treatment group
covariates <- c("mindset", "test", "sch_race", "pov", "size")
# Initialize empty list to store plots
plots <- list()
# Loop over each covariate to create boxplots grouped by treatment
for (i in seq_along(covariates)) {
  var <- covariates[i] # current variable
  # Create a boxplot of the variable by treatment group
  plots[[i]] <- ggplot(data, aes(x = as.factor(z), y = .data[[var]], fill = as.factor(z))) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("gray70", "skyblue"), # Colors for control and treatment
      name = "Treatment Group",
      labels = c("Control", "Treatment")
    ) +
    scale_x_discrete(labels = c("Control", "Treatment")) + # Custom x-axis labels
    labs(
      title = paste("Boxplot of", var, "by Treatment Group"),
      x = "Group",
      y = var
    ) +
    theme_minimal()
}
# Display all boxplots in a grid layout
wrap_plots(plots)
# Fit a simple linear regression model to estimate the unadjusted effect of treatment (z) on outcome (y)
model <- lm(y ~ z, data)
# Display the summary of the model including coefficient estimates and standard errors
summary(model)
##
## Call:
## lm(formula = y ~ z, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2381 -0.4535 -0.0144 0.4254 4.8308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.169049 0.008199 -20.62 <2e-16 ***
## z 0.456907 0.014367 31.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6863 on 10389 degrees of freedom
## Multiple R-squared: 0.08872, Adjusted R-squared: 0.08863
## F-statistic: 1011 on 1 and 10389 DF, p-value: < 2.2e-16
# Load the 'car' package if not already loaded (for robust variance estimation)
# install.packages("car") # Uncomment if needed
library(car)
# Compute the robust (heteroskedasticity-consistent) variance estimator for the treatment coefficient
# Using HC2 estimator which adjusts for leverage (small-sample bias correction)
# This returns the robust variance of the treatment coefficient (z)
car::hccm(model, type = "hc2")[2, 2]
## [1] 0.0002537748
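A minimal follow-up sketch turns this robust variance into a standard error and a 95% confidence interval for the treatment coefficient, mirroring the normal-approximation interval computed earlier (se_robust is just a local name used here):
# Robust (HC2) standard error for the treatment coefficient
se_robust <- sqrt(car::hccm(model, type = "hc2")[2, 2])
# 95% CI for the coefficient on z using the robust SE
coef(model)["z"] + c(-1.96, 1.96) * se_robust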
# Convert selected variables back to factor type (if previously changed to numeric)
# This ensures proper encoding in the logistic regression model for propensity scores
data <- data %>%
  mutate(
    selfrpt = as.factor(selfrpt),
    race = as.factor(race),
    gender = as.factor(gender),
    fgen = as.factor(fgen),
    urban = as.factor(urban)
  )
# Estimate propensity scores using logistic regression
# Predict the probability of treatment assignment (z) given all covariates
# The outcome y is excluded because the propensity score should depend only on pre-treatment covariates
ps_model <- glm(z ~ . - y, data = data, family = binomial)
# Extract the predicted probabilities (i.e., propensity scores)
p_scores <- ps_model$fitted.values
# Store the outcome and treatment variables
Y <- data$y
W <- data$z
# Compute the Inverse Probability Weighting (IPW) estimate of the Average Treatment Effect (ATE)
# Horvitz-Thompson-style estimator
ipw <- mean(W * Y / p_scores - (1 - W) * Y / (1 - p_scores))
# Output the IPW estimate
ipw
## [1] 0.4115405
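Before relying on these weights, it is worth checking overlap: a quick summary of the estimated propensity scores in each group (using the objects defined above) shows whether any scores sit near 0 or 1, which would make the inverse weights unstable.
# Check overlap of estimated propensity scores by treatment group
summary(p_scores[W == 1]) # treated
summary(p_scores[W == 0]) # control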
# Number of bootstrap resamples
B <- 1000
# Sample size
n <- nrow(data)
# Set seed for reproducibility
set.seed(100)
# Create a vector to store the IPW estimate from each bootstrap sample
boot_est <- vector(length = B)
# Loop through B bootstrap samples
for (b in 1:B) {
  # Generate a bootstrap sample (resample rows with replacement)
  sample_boot <- sample(1:n, n, replace = TRUE)
  dat_boot <- data[sample_boot, ]
  # Extract outcome and treatment for bootstrap sample
  Y_boot <- dat_boot$y
  W_boot <- dat_boot$z
  # Fit a propensity score model on the bootstrap sample
  ps_boot <- glm(z ~ . - y, data = dat_boot, family = binomial)$fitted.values
  # Compute the IPW estimator for this bootstrap sample
  boot_est[b] <- mean(W_boot * Y_boot / ps_boot -
                        (1 - W_boot) * Y_boot / (1 - ps_boot))
}
# Estimate the variance of the IPW estimator from the bootstrap distribution
var_est <- var(boot_est)
# Compute the standard error from the bootstrap variance estimate
se_boot <- sqrt(var_est)
# Calculate the lower bound of the 95% confidence interval for the IPW estimate
lower <- ipw - 1.96 * se_boot
# Calculate the upper bound of the 95% confidence interval
upper <- ipw + 1.96 * se_boot
# Display the confidence interval
c(lower, upper)
## [1] 0.3838889 0.4391920
# Create a Love plot to assess covariate balance before and after propensity score weighting
love.plot(
x = data[ , -c(1, 2, 13)], # Remove columns not used as covariates (e.g., ID, treatment Z, outcome Y)
treat = W, # Treatment assignment vector
weights = (W / p_scores) + ((1 - W) / (1 - p_scores)), # IPW weights (Horvitz-Thompson-style)
sample.names = c("Unweighted", "PSWeighted") # Labels for before and after weighting
)
# Compute the Hajek estimator for the Average Treatment Effect (ATE)
# This is a stabilized version of the IPW estimator, where weights are normalized
haj <- mean(W * Y / p_scores) / mean(W / p_scores) - # Weighted mean outcome for treated group
mean((1 - W) * Y / (1 - p_scores)) / mean((1 - W) / (1 - p_scores)) # Weighted mean for control group
# Display the Hajek ATE estimate
haj
## [1] 0.4116452
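Equivalently, the Hajek estimate can be written as a difference of two normalized weighted means; since this is algebraically the same quantity, the sketch below should return the same number:
# Hajek estimator as a difference of normalized weighted means
weighted.mean(Y[W == 1], w = 1 / p_scores[W == 1]) -
  weighted.mean(Y[W == 0], w = 1 / (1 - p_scores[W == 0]))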
# Trim the dataset to include only units with propensity scores between 0.05 and 0.95
# This removes extreme values near 0 or 1, which can cause unstable IPW estimates
trim_dat1 <- data[as.logical((p_scores < 0.95) * (p_scores > 0.05)), ]
# Extract the outcome and treatment vectors for the trimmed sample
Y <- trim_dat1$y
W <- trim_dat1$z
# Extract the corresponding trimmed propensity scores
pscores2 <- p_scores[as.logical((p_scores < 0.95) * (p_scores > 0.05))]
# Compute IPW estimate on the trimmed dataset
ipw2 <- mean(W * Y / pscores2 - (1 - W) * Y / (1 - pscores2))
# Display the trimmed IPW estimate
ipw2
## [1] 0.4115405
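The trimmed estimate is identical to the untrimmed one, which suggests no propensity scores actually fall outside [0.05, 0.95] in this sample; a quick count confirms how many units the rule removes:
# How many units have extreme propensity scores outside [0.05, 0.95]?
sum(p_scores <= 0.05 | p_scores >= 0.95)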
# Fit an outcome regression model using only the treated units (z == 1)
om1 <- lm(y ~ . - z, data = data[data$z == 1, ])
# Fit an outcome regression model using only the control units (z == 0)
om0 <- lm(y ~ . - z, data = data[data$z == 0, ])
# Predict potential outcomes for *all* units using each model
mu_1_est <- predict(om1, data) # Predicted Y(1) for everyone
mu_0_est <- predict(om0, data) # Predicted Y(0) for everyone
# AIPW estimator for treated potential outcome: combine regression + residuals weighted by inverse probability
mu_1_dr <- mean(mu_1_est) + mean((data$z == 1) * (data$y - mu_1_est) / p_scores)
# AIPW estimator for control potential outcome
mu_0_dr <- mean(mu_0_est) + mean((data$z == 0) * (data$y - mu_0_est) / (1 - p_scores))
# AIPW estimate of the average treatment effect (ATE)
tau_est_aipw2 <- mu_1_dr - mu_0_dr
# Output the AIPW estimate
tau_est_aipw2
## [1] 0.4114252
# Set seed for reproducibility
set.seed(100)
# Initialize vector to store AIPW estimates from each bootstrap sample
boot_est <- vector(length = B)
# Loop over B bootstrap replications
for (b in 1:B) {
  # Generate a bootstrap sample by resampling with replacement
  sample_boot <- sample(1:n, n, replace = TRUE)
  dat_boot <- data[sample_boot, ]
  # Estimate propensity scores on the bootstrap sample
  ps_boot <- glm(z ~ . - y, data = dat_boot, family = binomial)$fitted.values
  # Fit separate outcome models on treated and control units
  om1 <- lm(y ~ . - z, data = dat_boot[dat_boot$z == 1, ])
  om0 <- lm(y ~ . - z, data = dat_boot[dat_boot$z == 0, ])
  # Predict Y(1) and Y(0) for all units in the bootstrap sample
  mu_1_estb <- predict(om1, dat_boot)
  mu_0_estb <- predict(om0, dat_boot)
  # Compute doubly robust (AIPW) estimates for the treated and control outcomes
  mu_1_drb <- mean(mu_1_estb) + mean((dat_boot$z == 1) * (dat_boot$y - mu_1_estb) / ps_boot)
  mu_0_drb <- mean(mu_0_estb) + mean((dat_boot$z == 0) * (dat_boot$y - mu_0_estb) / (1 - ps_boot))
  # Store the ATE estimate for this bootstrap sample
  boot_est[b] <- mu_1_drb - mu_0_drb
}
# Calculate the bootstrap variance of the AIPW estimator
var_est <- var(boot_est)
# Output the variance estimate
var_est
## [1] 0.0001982752
# Compute the standard error of the AIPW estimator using the bootstrap variance
se_aipw <- sqrt(var_est)
# Compute the z-statistic: estimate divided by its standard error
# This can be used to assess statistical significance (e.g., p-value)
z_aipw <- tau_est_aipw2 / se_aipw
# Compute the lower bound of the 95% confidence interval
lower <- tau_est_aipw2 - 1.96 * se_aipw
# Compute the upper bound of the 95% confidence interval
upper <- tau_est_aipw2 + 1.96 * se_aipw
# Display the confidence interval as a vector
c(lower, upper)
## [1] 0.3838264 0.4390240
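The z-statistic computed above (z_aipw) was not printed; under the normal approximation it can be turned into a two-sided p-value as a brief sketch:
# z-statistic for the AIPW estimate and its two-sided p-value (normal approximation)
z_aipw
2 * pnorm(-abs(z_aipw))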
# Divide the propensity scores into 5 equal-sized strata (quintiles)
# This is done using quantiles at 20%, 40%, 60%, 80%
q.pscore <- quantile(p_scores, (1:(5 - 1)) / 5)
# Cut the propensity scores into 5 bins based on these quantiles
# Label the strata 1 through 5
ps.strata <- cut(p_scores, breaks = c(0, q.pscore, 1), labels = 1:5)
# Create a copy of the data and add propensity scores and strata labels
dat <- data
dat$pscores <- p_scores
dat$strata <- ps.strata
# Summarize each stratum: count of units, and min/max propensity score in the stratum
dat %>%
group_by(strata) %>%
summarise(
nb = n(), # Number of observations in the stratum
ps_min = min(pscores), # Minimum propensity score in the stratum
ps_max = max(pscores) # Maximum propensity score in the stratum
)
## # A tibble: 5 × 4
## strata nb ps_min ps_max
## <fct> <int> <dbl> <dbl>
## 1 1 2079 0.160 0.285
## 2 2 2086 0.285 0.313
## 3 3 2070 0.313 0.338
## 4 4 2078 0.338 0.365
## 5 5 2078 0.365 0.548
# For each propensity score stratum, compute:
# - Number of observations (nb)
# - Mean outcome for treated (y1_bar) and control (y0_bar)
# - Variance of outcome for treated (s12) and control (s02)
dat_sum <- dat %>%
group_by(strata) %>%
summarise(
nb = n(), # Number of units in the stratum
y1_bar = mean(y[z == 1]), # Average outcome for treated
y0_bar = mean(y[z == 0]), # Average outcome for control
s12 = var(y[z == 1]), # Variance for treated
s02 = var(y[z == 0]) # Variance for control
)
# Estimate the overall ATE using a weighted average across strata
# Each stratum's difference in means is weighted by its sample size
tau_ps <- sum(dat_sum$nb * (dat_sum$y1_bar - dat_sum$y0_bar)) / sum(dat_sum$nb)
# Output the stratified estimate of the treatment effect
tau_ps
## [1] 0.4148841
# Generate a Love plot to assess covariate balance before and after subclassification (stratification)
# This checks whether covariates are more balanced within strata of the propensity score
love.plot(
x = dat[ , -c(1, 2, 13, 14, 15, 16, 17)], # Exclude irrelevant columns (e.g., ID, z, y, pscores, strata, etc.)
treat = dat$z, # Treatment assignment
subclass = dat$strata, # Subclassification strata (from propensity scores)
sample.names = c("Unweighted", "PSWeighted") # Labels for pre- and post-stratification
)
# For each stratum, calculate:
# - Total number of units (nb)
# - Number of treated (n1) and control (n0) units
# - Variance of outcome for treated (s12) and control (s02)
dat_sum <- dat %>%
group_by(strata) %>%
summarise(
nb = n(), # Total number of units in the stratum
n1 = sum(z == 1), # Count of treated units
n0 = sum(z == 0), # Count of control units
s12 = var(y[z == 1]), # Variance in treated group
s02 = var(y[z == 0]) # Variance in control group
)
# Compute the total number of units across all strata
N <- sum(dat_sum$nb)
# Estimate the variance of the stratified ATE using Neyman's formula for weighted means
# This weights each stratum's variance contribution by (nb^2 / N^2)
var_ps <- sum((dat_sum$nb^2) * ((dat_sum$s12 / dat_sum$n1) + (dat_sum$s02 / dat_sum$n0))) / N^2
# Compute the standard error of the stratified ATE estimate
se_ps <- sqrt(var_ps)
# Output the standard error
se_ps
## [1] 0.01555497
# Compute the 95% confidence interval for the stratified ATE using the normal approximation
lower <- tau_ps - 1.96 * se_ps # Lower bound
upper <- tau_ps + 1.96 * se_ps # Upper bound
# Display the confidence interval
c(lower, upper)
## [1] 0.3843964 0.4453718
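To wrap up, the ATE point estimates from the different approaches can be collected side by side; this small summary is built only from the objects computed above:
# Collect the ATE point estimates from each approach for comparison
tibble(
  estimator = c("Unadjusted difference in means", "IPW (Horvitz-Thompson)",
                "Hajek (stabilized IPW)", "AIPW (doubly robust)",
                "Propensity score stratification"),
  estimate  = c(treatement_hat, ipw, haj, tau_est_aipw2, tau_ps)
)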