library(tidyverse)
library(recipes)
library(mlr3verse)
library(mlr3pipelines)
library(data.table)
library(DoubleML)
# Load the example data set with data.table::fread().
Data_R <- fread("ExampleData/Example.csv")
# Wrap the data in a DoubleML data object:
#   y_col  = outcome (Price), d_cols = treatment (Reform),
#   x_cols = covariates used by the nuisance learners.
Task_R <- double_ml_data_from_data_frame(
  Data_R,
  x_cols = c("TradeQ", "Size", "BuildYear", "Distance"),
  y_col = c("Price"),
  d_cols = c("Reform")
)
6 セミパラメトリック推定によるパラメタ推定
- Chernozhukov et al. (2018) を実装する
6.1 設定
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from doubleml import DoubleMLIRM
# Load the example data set into a pandas DataFrame.
Data_Python = pd.read_csv('ExampleData/Example.csv')
# Wrap the DataFrame in a DoubleMLData object:
#   y_col  = outcome (Price), d_cols = treatment (Reform),
#   x_cols = covariates used by the nuisance learners.
Task_Python = DoubleMLData(Data_Python,
                           y_col = 'Price',
                           d_cols = 'Reform',
                           x_cols = ['TradeQ',"Size","Distance","BuildYear"])
6.2 平均効果の推定: Partial Linear Model
部分線形モデル (Robinson 1988)
RandomForestとOLSのStackingを用いる
- Pythonについては、現状RandomForestのみを用いる
# Base learners for the regression nuisance functions: OLS and random forest.
RegOLS <- lrn("regr.lm",
  id = "RegressionOLS"
)

RegRF <- lrn("regr.ranger",
  id = "RegressionRandomForest"
)

RegLearners <- list(
  RegOLS,
  RegRF
)

# Linear model used as the super learner that combines the base learners.
RegSuperLearner <- lrn("regr.lm",
  id = "RegressionSuperLearner"
)

# Stack the base learners under the super learner and convert the resulting
# pipeline into a single learner object that DoubleML can consume.
RegNuisanceLearner <-
  pipeline_stacking(RegLearners, RegSuperLearner) |>
  as_learner()
# Only show mlr3 log messages at level "warn" or above while fitting.
lgr::get_logger("mlr3")$set_threshold("warn")

# Partially linear regression model: the stacked learner is cloned so that
# ml_l and ml_m each receive their own learner object; 2 folds are used.
FitPLR_R <- DoubleMLPLR$new(Task_R,
  ml_l = RegNuisanceLearner$clone(),
  ml_m = RegNuisanceLearner$clone(),
  n_folds = 2
)

# Estimate the model, then print the data/learner/resampling/fit summary.
FitPLR_R$fit()

print(FitPLR_R)
================= DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): Reform
Covariates: TradeQ, Size, BuildYear, Distance
Instrument(s):
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
ml_l: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
ml_m: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
Reform 4.8926 0.4061 12.05 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Partially linear regression model. The two positional learners are the
# nuisance learners ml_l and ml_m (both random forests with 500 trees);
# 2 folds are used.
FitPLR_Python = DoubleMLPLR(Task_Python,
                            RandomForestRegressor(n_estimators = 500),
                            RandomForestRegressor(n_estimators = 500),
                            n_folds = 2)

# Estimate the model; fit() returns the fitted object itself.
FitPLR_Python.fit()
<doubleml.double_ml_plr.DoubleMLPLR object at 0x298446340>
# Print the fitted PLR object (data, score, learner, resampling and fit summary).
print(FitPLR_Python)
================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): ['Reform']
Covariates: ['TradeQ', 'Size', 'Distance', 'BuildYear']
Instrument variable(s): None
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
Learner ml_l: RandomForestRegressor(n_estimators=500)
Learner ml_m: RandomForestRegressor(n_estimators=500)
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: True
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
Reform 5.224552 0.361128 14.467301 1.949731e-47 4.516753 5.93235
6.3 平均効果の推定: AIPW
- AIPW (Robins and Rotnitzky 1995)
# Base learners for the treatment (classification) nuisance function:
# logistic regression and random forest, both predicting probabilities.
ProbOLS <- lrn("classif.log_reg",
  id = "ProbLM",
  predict_type = "prob"
)

ProbRF <- lrn("classif.ranger",
  id = "ProbRanger",
  predict_type = "prob"
)

ProbLearners <- list(ProbOLS, ProbRF)

# Logistic regression used as the super learner that combines the base learners.
ProbSuperLearner <- lrn("classif.log_reg",
  id = "ProbSuperLearner"
)

# Stack the base learners under the super learner and convert the resulting
# pipeline into a single learner object that DoubleML can consume.
ProbNuisanceLearner <- pipeline_stacking(ProbLearners, ProbSuperLearner) |>
  as_learner()
# Only show mlr3 log messages at level "warn" or above while fitting.
lgr::get_logger("mlr3")$set_threshold("warn")

# Interactive regression model (AIPW): stacked regression learner for ml_g,
# stacked classification learner for ml_m, 2 folds, and a trimming threshold
# of 0.1 (see the DoubleMLIRM documentation for how trimming is applied).
FitAIPW_R <- DoubleMLIRM$new(Task_R,
  ml_g = RegNuisanceLearner,
  ml_m = ProbNuisanceLearner,
  n_folds = 2,
  trimming_threshold = 0.1
)

# Estimate the model, then print the data/learner/resampling/fit summary.
FitAIPW_R$fit()

print(FitAIPW_R)
================= DoubleMLIRM Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): Reform
Covariates: TradeQ, Size, BuildYear, Distance
Instrument(s):
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: ATE
DML algorithm: dml2
------------------ Machine learner ------------------
ml_g: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
ml_m: ProbLM.ProbRanger.nop.featureunion.ProbSuperLearner
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
Reform 3.8588 0.4247 9.086 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interactive regression model (AIPW). The two positional learners are the
# nuisance learners ml_g (regressor) and ml_m (classifier), each a random
# forest with 500 trees; 2 folds and a trimming threshold of 0.1 are used.
FitAIPW_Python = DoubleMLIRM(Task_Python,
                             RandomForestRegressor(n_estimators = 500),
                             RandomForestClassifier(n_estimators = 500),
                             n_folds = 2,
                             trimming_threshold = 0.1)

# Estimate the model; fit() returns the fitted object itself.
FitAIPW_Python.fit()
<doubleml.double_ml_irm.DoubleMLIRM object at 0x16ba7f430>
# Show the coefficient table (estimate, std err, t, p-value, 95% interval).
FitAIPW_Python.summary
coef std err t P>|t| 2.5 % 97.5 %
Reform 4.348929 0.545113 7.978028 1.486896e-15 3.280526 5.417331