library(tidyverse)
library(recipes)
library(mlr3verse)
library(mlr3pipelines)
library(data.table)
library(DoubleML)
Data_R <- fread("ExampleData/Example.csv")
Task_R <- double_ml_data_from_data_frame(Data_R,
x_cols = c("TradeQ", "Size", "BuildYear", "Distance"),
y_col = c("Price"),
d_cols = c("Reform"))6 セミパラメトリック推定によるパラメタ推定
- Chernozhukov et al. (2018) を実装する
6.1 設定
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from doubleml import DoubleMLData
from doubleml import DoubleMLPLR
from doubleml import DoubleMLIRM
Data_Python = pd.read_csv('ExampleData/Example.csv')
Task_Python = DoubleMLData(Data_Python,
y_col = 'Price',
d_cols = 'Reform',
x_cols = ['TradeQ',"Size","Distance","BuildYear"])6.2 平均効果の推定: Partial Linear Model
部分線形モデル (Robinson 1988)
RandomForestとOLSのStackingを用いる
- Pythonについて、現状、RandomForestのみ
RegOLS <- lrn("regr.lm",
id = "RegressionOLS"
)
RegRF <- lrn("regr.ranger",
id = "RegressionRandomForest"
)
RegLearners <- list(
RegOLS,
RegRF
)
RegSuperLearner <- lrn("regr.lm",
id = "RegressionSuperLearner")
RegNuisanceLearner <-
pipeline_stacking(RegLearners, RegSuperLearner) |>
as_learner()
lgr::get_logger("mlr3")$set_threshold("warn")
FitPLR_R <- DoubleMLPLR$new(Task_R,
ml_l=RegNuisanceLearner$clone(),
ml_m=RegNuisanceLearner$clone(),
n_folds = 2)
FitPLR_R$fit()
print(FitPLR_R)================= DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): Reform
Covariates: TradeQ, Size, BuildYear, Distance
Instrument(s):
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
ml_l: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
ml_m: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
Reform 4.8926 0.4061 12.05 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FitPLR_Python = DoubleMLPLR(Task_Python,
RandomForestRegressor(n_estimators = 500),
RandomForestRegressor(n_estimators = 500),
n_folds = 2)
FitPLR_Python.fit()<doubleml.double_ml_plr.DoubleMLPLR object at 0x298446340>
print(FitPLR_Python)================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): ['Reform']
Covariates: ['TradeQ', 'Size', 'Distance', 'BuildYear']
Instrument variable(s): None
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: partialling out
DML algorithm: dml2
------------------ Machine learner ------------------
Learner ml_l: RandomForestRegressor(n_estimators=500)
Learner ml_m: RandomForestRegressor(n_estimators=500)
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: True
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
Reform 5.224552 0.361128 14.467301 1.949731e-47 4.516753 5.93235
6.3 平均効果の推定: AIPW
- AIPW (Robins and Rotnitzky 1995)
ProbOLS <- lrn("classif.log_reg",
id = "ProbLM",
predict_type = "prob"
)
ProbRF <- lrn("classif.ranger",
id = "ProbRanger",
predict_type = "prob"
)
ProbLearners <- list(ProbOLS,ProbRF)
ProbSuperLearner <- lrn("classif.log_reg",
id = "ProbSuperLearner")
ProbNuisanceLearner <- pipeline_stacking(ProbLearners, ProbSuperLearner) |>
as_learner()
lgr::get_logger("mlr3")$set_threshold("warn")
FitAIPW_R = DoubleMLIRM$new(Task_R,
ml_g=RegNuisanceLearner,
ml_m=ProbNuisanceLearner,
n_folds = 2,
trimming_threshold = 0.1)
FitAIPW_R$fit()
print(FitAIPW_R)================= DoubleMLIRM Object ==================
------------------ Data summary ------------------
Outcome variable: Price
Treatment variable(s): Reform
Covariates: TradeQ, Size, BuildYear, Distance
Instrument(s):
No. Observations: 14793
------------------ Score & algorithm ------------------
Score function: ATE
DML algorithm: dml2
------------------ Machine learner ------------------
ml_g: RegressionOLS.RegressionRandomForest.nop.featureunion.RegressionSuperLearner
ml_m: ProbLM.ProbRanger.nop.featureunion.ProbSuperLearner
------------------ Resampling ------------------
No. folds: 2
No. repeated sample splits: 1
Apply cross-fitting: TRUE
------------------ Fit summary ------------------
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
Reform 3.8588 0.4247 9.086 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FitAIPW_Python = DoubleMLIRM(Task_Python,
RandomForestRegressor(n_estimators = 500),
RandomForestClassifier(n_estimators = 500),
n_folds = 2,
trimming_threshold = 0.1)
FitAIPW_Python.fit()<doubleml.double_ml_irm.DoubleMLIRM object at 0x16ba7f430>
FitAIPW_Python.summary coef std err t P>|t| 2.5 % 97.5 %
Reform 4.348929 0.545113 7.978028 1.486896e-15 3.280526 5.417331