library(tidyverse)
library(grf)
library(DoubleML)
library(mlr3verse)
# Load the example data set.
Data_R <- read_csv("ExampleData/Example.csv")

# Randomly assign every row to group 0 or group 1 (sample splitting).
# NOTE(review): no seed is set in the visible code, so the split -- and all
# estimates derived from it -- differ between runs.
Group <- sample(0:1, nrow(Data_R), replace = TRUE)

# Outcome (used as Y / y below).
Y <- Data_R$Price

# Treatment (used as W / d below).
D <- Data_R$Reform

# Covariates: every column except the outcome and the treatment.
X <- Data_R |>
  select(
    -Price,
    -Reform
  )
7 異質性の探索
条件付き平均差 \(\tau(X)=E[Y|D=1,X]-E[Y|D=0,X]\)の(ノンパラメトリックな)推定値を異質性のシグナルとして活用
一般に、条件付きの平均差のノンパラメトリックな推定値について、推定誤差を評価するのが難しいため
効果の異質性の「シグナル」として使う (Chernozhukov et al. 2018; Kallus 2022)
\(E[\tau(X)|\tau(X)\ge Median(\tau(X))]\) を推定(以下のコードでは、\(\tau(X)\) の推定値が中央値以上のサブグループを用いる)
\(\tau(X)\) はCausal Forest (Wager and Athey 2018; Athey, Tibshirani, and Wager 2019) で推定
7.1 設定
# Fit the causal forest on the Group == 0 half only, so that the heterogeneity
# signal is learned on data that is not reused for the estimation step.
EstCF <- causal_forest(
  X = X[Group == 0, ],
  W = D[Group == 0],
  Y = Y[Group == 0]
)

# Predict tau(X) for every row of X (both groups); the result has one
# prediction per row of the full data set.
PredTau <- EstCF |>
  predict(X)

# Distribution of the predicted conditional effects.
hist(PredTau$predictions)
# Build the DoubleML data object from the Group == 1 half (the half that was
# not used to fit the causal forest).
Task <- double_ml_data_from_matrix(
  X = X[Group == 1, ],
  y = Y[Group == 1],
  d = D[Group == 1]
)

# Partially linear regression with linear-model learners for both nuisance
# functions (passed positionally, as in the original).
EstDML <- DoubleMLPLR$new(
  Task,
  lrn("regr.lm"),
  lrn("regr.lm")
)

# Fit in place; the R6 object stores the estimates.
EstDML$fit()
INFO [21:02:33.207] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 1/5)
INFO [21:02:33.229] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 2/5)
INFO [21:02:33.239] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 3/5)
INFO [21:02:33.246] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 4/5)
INFO [21:02:33.253] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 5/5)
INFO [21:02:33.334] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 1/5)
INFO [21:02:33.515] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 2/5)
INFO [21:02:33.522] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 3/5)
INFO [21:02:33.528] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 4/5)
INFO [21:02:33.534] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 5/5)
EstDML
================= DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): d
Covariates: X1, X2, X3, X4
Instrument(s):
No. Observations: 7375
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
ml_l: regr.lm
ml_m: regr.lm
------------------ Resampling ------------------
No. folds: 5
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
d 3.2798 0.5999 5.468 4.56e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Median of the predicted effects, taken over all rows of PredTau.
Q <- quantile(PredTau$predictions, 0.5)

# Rows in the Group == 1 half whose predicted effect is at or above the median.
# Computed once so the three subsets below are guaranteed to use the same rows.
IsHighEffect <- Group == 1 & PredTau$predictions >= Q

Task <- double_ml_data_from_matrix(
  X = X[IsHighEffect, ],
  y = Y[IsHighEffect],
  d = D[IsHighEffect]
)

# Same partially linear specification as for the full Group == 1 half,
# now restricted to the high-predicted-effect subgroup.
EstDML <- DoubleMLPLR$new(
  Task,
  lrn("regr.lm"),
  lrn("regr.lm")
)

# Fit in place; the R6 object stores the estimates.
EstDML$fit()
INFO [21:02:33.617] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 1/5)
INFO [21:02:33.624] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 2/5)
INFO [21:02:33.630] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 3/5)
INFO [21:02:33.637] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 4/5)
INFO [21:02:33.643] [mlr3] Applying learner 'regr.lm' on task 'nuis_l' (iter 5/5)
INFO [21:02:33.687] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 1/5)
INFO [21:02:33.693] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 2/5)
INFO [21:02:33.698] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 3/5)
INFO [21:02:33.704] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 4/5)
INFO [21:02:33.710] [mlr3] Applying learner 'regr.lm' on task 'nuis_m' (iter 5/5)
EstDML
================= DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): d
Covariates: X1, X2, X3, X4
Instrument(s):
No. Observations: 3678
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
ml_l: regr.lm
ml_m: regr.lm
------------------ Resampling ------------------
No. folds: 5
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
d 5.3312 0.8785 6.069 1.29e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
import pandas as pd
import sklearn.model_selection as model_selection
# Load the example data set.
Data_Python = pd.read_csv('ExampleData/Example.csv')

# Outcome.
y = Data_Python['Price']

# Treatment.
d = Data_Python['Reform']

# Covariates: every column except the outcome and the treatment, as an array.
x = Data_Python.drop(['Price','Reform'],axis=1).values

# Split all three objects with the same random 50/50 partition.
# NOTE(review): no random_state is passed, so the split differs between runs.
y_train, y_test, d_train, d_test, x_train, x_test = model_selection.train_test_split(
    y,
    d,
    x,
    train_size = 0.5
)
import econml.dml as dml
import sklearn.linear_model as Linear
import matplotlib.pyplot as plt
import seaborn as sns
# Causal forest with linear regressions as the outcome and treatment models.
est_cf = dml.CausalForestDML(
    model_y = Linear.LinearRegression(),
    model_t = Linear.LinearRegression(),
    n_estimators = 2000
)

# Fit on the training half only.
est_cf = est_cf.fit(y_train, d_train, X = x_train, W=None)

# Predicted conditional effects for the held-out (test) half.
pred_tau = est_cf.effect(x_test)

# Distribution of the predicted conditional effects.
fig = plt.figure()
sns.histplot(pred_tau)
import doubleml as DML
# Build the DoubleML data object from the test half (the half that was not
# used to fit the causal forest).
Task = DML.DoubleMLData.from_arrays(x_test, y_test, d_test)

# Partially linear regression with linear regressions for both nuisance
# functions and 2-fold cross-fitting.
FitPLR = DML.DoubleMLPLR(
    Task,
    Linear.LinearRegression(),
    Linear.LinearRegression(),
    n_folds = 2
)

FitPLR.fit(store_predictions = True)
================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4']
Instrument variable(s): None
No. Observations: 7397
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
Learner ml_l: LinearRegression()
Learner ml_m: LinearRegression()
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: True
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
d 3.580131 0.562229 6.36774 1.918334e-10 2.478182 4.682081
import numpy as np
# Median of the predicted effects on the test half.
Q = np.quantile(a=pred_tau, q=0.5)

# Test-half rows whose predicted effect is at or above the median. Computed
# once so x, y and d are guaranteed to be subset with the same mask.
is_high_effect = pred_tau >= Q

Task = DML.DoubleMLData.from_arrays(
    x_test[is_high_effect],
    y_test[is_high_effect],
    d_test[is_high_effect]
)

# Same partially linear specification as for the full test half, now
# restricted to the high-predicted-effect subgroup.
FitPLR = DML.DoubleMLPLR(
    Task,
    Linear.LinearRegression(),
    Linear.LinearRegression(),
    n_folds = 2
)

FitPLR.fit(store_predictions = True)
================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4']
Instrument variable(s): None
No. Observations: 3699
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
Learner ml_l: LinearRegression()
Learner ml_m: LinearRegression()
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: True
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
d 4.153337 0.774872 5.360028 8.320895e-08 2.634615 5.672059