第8章 制限従属変数モデル

先に出版社サイトよりデータをダウンロードする.

# サポートファイルへのリンク
curl <- "https://www.yuhikaku.co.jp/static_files/05385_support08.zip"
# ダウンロード保存用フォルダが存在しない場合, 作成
if(!dir.exists("downloads")){
    dir.create("downloads")
}
cdestfile <- "downloads/support08.zip"
download.file(curl, cdestfile)
# データ保存用フォルダが存在しない場合, 作成
if(!dir.exists("data")){
    dir.create("data")
}
# WSL上のRで解凍すると文字化けするので、Linuxのコマンドを外部呼び出し
# Windowsの場合は別途コマンドを用いる.
if(.Platform$OS.type == "unix") {
    system(sprintf('unzip -n -Ocp932 %s -d %s', "downloads/support08.zip", "./data"))
} else {
    print("Windowsで解凍するコマンドを別途追加せよ.")
}

必要なライブラリを読み込む.

library(tidyverse)
library(estimatr)
library(margins)
library(modelsummary)
library(gt)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:wooldridge':
## 
##     cement
## The following object is masked from 'package:dplyr':
## 
##     select
library(marginaleffects)
library(pscl)
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(nnet)
library(AER)
library(sampleSelection)
## Loading required package: maxLik
## Loading required package: miscTools
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/

実証例8.1 PIAACを用いた女性の就業選択の実証例

まずはデータを読み込む.

piaac <- read.csv("data/08_第8章/piaac.csv")

プロビット/ロジットモデルの推定には, glm()を用い, 引数にfamily = binomial(link = "probit")などと指定する. 限界効果は (Rの確率密度関数を用いて) 手計算でもできるが, marginsパッケージを使うと自動で求められる (参考).

# 女性に限定し, さらに就業ダミー変数を作成
piaac_female <- piaac %>%
    filter(gender == "Female") %>%
    mutate(emp = ifelse(lfs == "Employed", 1, 0))
model_811 <- lm_robust(emp ~ educ + age + couple + child, data = piaac_female, se_type = "stata")
summary(model_811)
## 
## Call:
## lm_robust(formula = emp ~ educ + age + couple + child, data = piaac_female, 
##     se_type = "stata")
## 
## Standard error type:  HC1 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)   CI Lower CI Upper   DF
## (Intercept)  0.375932   0.105430   3.566 0.0003727  0.1691497  0.58271 1742
## educ         0.019482   0.005882   3.312 0.0009448  0.0079454  0.03102 1742
## age          0.002013   0.001166   1.726 0.0845504 -0.0002746  0.00430 1742
## couple      -0.117846   0.031481  -3.743 0.0001874 -0.1795902 -0.05610 1742
## child        0.013723   0.012861   1.067 0.2860951 -0.0115009  0.03895 1742
## 
## Multiple R-squared:  0.0137 ,    Adjusted R-squared:  0.01144 
## F-statistic: 6.665 on 4 and 1742 DF,  p-value: 2.541e-05
model_812 <- glm(emp ~ educ + age + couple + child, data = piaac_female, family = binomial(link = "probit"))
summary(model_812)
## 
## Call:
## glm(formula = emp ~ educ + age + couple + child, family = binomial(link = "probit"), 
##     data = piaac_female)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.344270   0.288020  -1.195 0.231970    
## educ         0.053671   0.016076   3.339 0.000842 ***
## age          0.005297   0.002997   1.767 0.077161 .  
## couple      -0.339106   0.097026  -3.495 0.000474 ***
## child        0.039649   0.033777   1.174 0.240461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.1  on 1746  degrees of freedom
## Residual deviance: 2239.5  on 1742  degrees of freedom
##   (958 observations deleted due to missingness)
## AIC: 2249.5
## 
## Number of Fisher Scoring iterations: 4
model_813 <- margins(model_812)
summary(model_813)
##  factor     AME     SE       z      p   lower   upper
##     age  0.0019 0.0011  1.7721 0.0764 -0.0002  0.0041
##   child  0.0145 0.0124  1.1751 0.2400 -0.0097  0.0388
##  couple -0.1243 0.0352 -3.5298 0.0004 -0.1933 -0.0553
##    educ  0.0197 0.0058  3.3702 0.0008  0.0082  0.0311
model_814 <- glm(emp ~ educ + age + couple + child, data = piaac_female, family = binomial(link = "logit"))
summary(model_814)
## 
## Call:
## glm(formula = emp ~ educ + age + couple + child, family = binomial(link = "logit"), 
##     data = piaac_female)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.578109   0.471171  -1.227 0.219837    
## educ         0.086867   0.026266   3.307 0.000942 ***
## age          0.008822   0.004886   1.806 0.070975 .  
## couple      -0.558740   0.163200  -3.424 0.000618 ***
## child        0.071606   0.057635   1.242 0.214084    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.1  on 1746  degrees of freedom
## Residual deviance: 2239.4  on 1742  degrees of freedom
##   (958 observations deleted due to missingness)
## AIC: 2249.4
## 
## Number of Fisher Scoring iterations: 4
model_815 <- margins(model_814)
summary(model_815)
##  factor     AME     SE       z      p   lower   upper
##     age  0.0020 0.0011  1.8118 0.0700 -0.0002  0.0041
##   child  0.0161 0.0129  1.2442 0.2134 -0.0093  0.0414
##  couple -0.1255 0.0362 -3.4630 0.0005 -0.1965 -0.0545
##    educ  0.0195 0.0058  3.3453 0.0008  0.0081  0.0309

表8-1 女性の就業決定

推定方法と\(\bar{R}^2\text{/疑似}R^2\)を表示する行を追加した.

models_81 <- list("(1)" = model_811,
                  "(2)" = model_812,
                  "(3)" = model_813,
                  "(4)" = model_814,
                  "(5)" = model_815)
cm <- c("educ" = "教育年数",
        "age" = "年齢",
        "couple" = "配偶者あり",
        "child" = "子供数",
        "(Intercept)" = "定数項")
glance_custom.lm_robust <- function(x) {
    out <- tibble("adj.r.squared_or_peseudo.r.squared" = x$adj.r.squared)
    return(out)
}
glance_custom.glm <- function(x) {
    peseudo.r.squared <- 1 - (x$deviance / x$null.deviance)
    out <- tibble("adj.r.squared_or_peseudo.r.squared" = peseudo.r.squared)
    return(out)
}
gm <- tribble(~raw,                 ~clean,                          ~fmt,
              "adj.r.squared_or_peseudo.r.squared",      "$\\bar{R}^2$/疑似$R^2$",                  2,
              "nobs",               "$N$",                0)
rows <- tribble(~term, ~`(1)`, ~`(2)`, ~`(3)`, ~`(4)`, ~`(5)`,
                "推定方法", "OLS", "プロビット", "限界効果", "ロジット", "限界効果")
attr(rows, 'position') <- 1
modelsummary(models_81,
             coef_map = cm,
             gof_map = gm,
             add_rows = rows,
             stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001),
             estimate = "{estimate}{stars}",
             output = "kableExtra",
             notes = "* p &lt; 0.05, ** p &lt; 0.01, *** p &lt; 0.001") %>%
    row_spec(c(1, 13), extra_css = "border-bottom: 1.5px solid") %>%
    row_spec(11, extra_css = ";border-bottom: 1.5px solid") # 32行目の下 (estimateとstatisticsの境) のみコロンが必要
&nbsp;(1) &nbsp;&nbsp;(2) &nbsp;&nbsp;(3) &nbsp;&nbsp;(4) &nbsp;&nbsp;(5)
推定方法 OLS プロビット 限界効果 ロジット 限界効果
教育年数 0.019*** 0.054*** 0.020*** 0.087*** 0.020***
(0.006) (0.016) (0.006) (0.026) (0.006)
年齢 0.002 0.005 0.002 0.009 0.002
(0.001) (0.003) (0.001) (0.005) (0.001)
配偶者あり −0.118*** −0.339*** −0.124*** −0.559*** −0.126***
(0.031) (0.097) (0.035) (0.163) (0.036)
子供数 0.014 0.040 0.015 0.072 0.016
(0.013) (0.034) (0.012) (0.058) (0.013)
定数項 0.376*** −0.344 −0.578
(0.105) (0.288) (0.471)
\(\bar{R}^2\)/疑似\(R^2\) 0.01 0.01 0.01
\(N\) 1747 1747 1747
* p &lt; 0.05, ** p &lt; 0.01, *** p &lt; 0.001

実証例8.2 PIAACを用いた仕事満足度の決定要因の分析

順序付きプロビットモデルの推定には, MASS::polrを使う. 他にもoglmx::oprobit.reg()などがある. polrで推定した順序付きプロビットモデルは, marginaleffect::avg_slopes()に渡すことで限界効果を求めることができる. ただし, avg_slopes()はデフォルトでは標準誤差の計算にt分布ではなく正規分布を仮定するため, dfを指定してt分布を使うようにする. 表の統計量欄にあるMcFaddenの擬似\(R^2\)は, pscl::pR2()を使って求めた.

# 男性に限定し, さらに仕事満足度を示す変数をファクター化し順序を与える
piaac_male <- piaac %>%
    filter(gender == "Male") %>%
    mutate(jsrev = factor(js, levels = c("Extremely dissatisfied", "Dissatisfied", "Neither satisfied nor dissatisfied","Satisfied", "Extremely satisfied"),
                          ordered = TRUE))
model_82 <- polr(jsrev ~ educ + age + couple + child, data = piaac_male, method = "probit")
# avg_slopesは標準ではt分布ではなく正規分布を仮定するため, dfを指定する
model_82_marginal <- avg_slopes(model_82, df = insight::get_df(model_82))
# %ポイント変化を求めるために一旦listに書き出し
model_82_marginal <- modelsummary(model_82_marginal, shape = term ~ group, output = "modelsummary_list", fmt = 5)
# パーセント表示に変換し, また表示のために仕事満足度順に並び替え
model_82_marginal$tidy <- model_82_marginal$tidy %>%
    mutate(estimate = estimate * 100) %>%
    mutate(std.error = std.error * 100) %>%
    mutate(conf.low = conf.low * 100) %>%
    mutate(conf.high = conf.high * 100) %>%
    mutate(group = factor(group, levels = c("Extremely dissatisfied", "Dissatisfied", "Neither satisfied nor dissatisfied","Satisfied", "Extremely satisfied"),
                          labels = c("とても不満", "不満", "どちらでもない", "満足", "とても満足"),
                          ordered = TRUE)) %>%
    arrange(group)
# 限界効果の統計量 (サンプルサイズ) は表示に不要なのでNULLを代入
model_82_marginal$glance <- NULL
models_82 <- list("モデル係数" = model_82,
                  "限界効果 (% ポイント変化)" = model_82_marginal)
cm <- c("educ" = "教育年数",
        "age" = "年齢",
        "couple" = "配偶者あり",
        "child" = "子供数",
        "Extremely dissatisfied|Dissatisfied" = "$\\mu_1$",
        "Dissatisfied|Neither satisfied nor dissatisfied" = "$\\mu_2$",
        "Neither satisfied nor dissatisfied|Satisfied" = "$\\mu_3$",
        "Satisfied|Extremely satisfied" = "$\\mu_4$")
# pscl::pR2()でMcFaddenの疑似R2を求める
glance_custom.polr <- function(x) {
    capture.output(McFadden <- pR2(x)["McFadden"]) # pR2のcat出力を抑制するためにcapture.output()で囲う
    out <- tibble("pseudo.r.squared" = McFadden)
    return(out)
}
gm <- tribble(~raw,               ~clean,           ~fmt,
              "pseudo.r.squared", "疑似$R^2$",      3,
              "nobs",             "サンプルサイズ", 0)
modelsummary(models_82,
             shape = term ~ group,
             coef_map = cm,
             gof_map = gm,
             stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001))
限界効果 (% ポイント変化)
モデル係数 とても不満 不満 どちらでもない 満足 とても満足
* p < 0.05, ** p < 0.01, *** p < 0.001
教育年数 0.030* -0.067* -0.472* -0.639* 0.710* 0.468*
(0.012) (0.033) (0.189) (0.252) (0.281) (0.188)
年齢 0.007* -0.015 -0.106* -0.143* 0.159* 0.105*
(0.003) (0.008) (0.048) (0.064) (0.071) (0.048)
配偶者あり -0.084 0.171 1.253 1.812 -1.868 -1.367
(0.151) (0.282) (2.150) (3.313) (3.162) (2.579)
子供数 -0.011 0.024 0.171 0.232 -0.258 -0.170
(0.040) (0.090) (0.626) (0.847) (0.942) (0.621)
$\mu_1$ -1.772***
(0.292)
$\mu_2$ -0.634*
(0.274)
$\mu_3$ 0.414
(0.273)
$\mu_4$ 2.027***
(0.278)
疑似$R^2$ 0.004
サンプルサイズ 1225

限界効果で配偶者ありの値だけ少しStataと異なっている (カテゴリー変数の処理の仕方の差によるものか?).

実証例8.3 PIAACを用いた就業形態決定要因の分析

多項ロジットモデルは下ではnnet::multinom()を用いたが, 他にもmlogit::mlogit()などがある.

# 女性に限定
piaac_female <- piaac %>%
    filter(gender == "Female") %>%
    mutate(empstat_edt = factor(empstat_edt, labels = c("フル", "パート", "不就業")) %>% relevel(ref = 3)) # 先にベースラインを設定
model_83 <- multinom(empstat_edt ~ educ + age + couple + child, data = piaac_female, trace = FALSE)
model_83_marginal <- avg_slopes(model_83, df = insight::get_df(model_83), type = "probs")

# pscl::pR2()でMcFaddenの疑似R2を求める
glance_custom.multinom <- function(x) {
    capture.output(McFadden <- pR2(x)["McFadden"]) # pR2のcat出力を抑制するためにcapture.output()で囲う
    out <- tibble("pseudo.r.squared" = McFadden)
    return(out)
}

# 係数が対応する目的変数を示す列名が元の多項ロジットモデルではresponse, 限界効果ではgroupと異なっているため,
# 元のモデルを限界効果に合わせてgroupという列を追加 (逆ではなぜかmodelsummaryの出力がうまくいかなかった)
model_83 <- modelsummary(model_83, output = "modelsummary_list")
model_83$tidy <- model_83$tidy %>% mutate(group = response)
models_83 <- list(" " = model_83,
                  "限界効果" = model_83_marginal)
cm <- c("educ" = "教育年数",
        "age" = "年齢",
        "couple" = "配偶者あり",
        "child" = "子供数",
        "(Intercept)" = "定数項")
gm <- tribble(~raw,               ~clean,           ~fmt,
              "pseudo.r.squared", "疑似$R^2$",      3,
              "nobs",             "サンプルサイズ", 0)
modelsummary(models_83,
             shape = term ~ group,
             coef_map = cm,
             gof_map = gm,
             stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001))
限界効果
フル パート フル パート 不就業
* p < 0.05, ** p < 0.01, *** p < 0.001
教育年数 0.186*** 0.003 0.036*** -0.017** -0.020***
(0.032) (0.030) (0.006) (0.006) (0.006)
年齢 0.008 0.007 0.001 0.001 -0.002
(0.006) (0.005) (0.001) (0.001) (0.001)
配偶者あり -0.898*** -0.541** -0.132*** -0.020 0.153***
(0.186) (0.183) (0.035) (0.034) (0.031)
子供数 0.085 0.084 0.009 0.011 -0.020
(0.067) (0.063) (0.011) (0.011) (0.013)
定数項 -2.489*** -0.208
(0.574) (0.529)
疑似$R^2$ 0.017 0.017
サンプルサイズ 1747 1747

限界効果の配偶者ありの標準誤差の値がやはりStataと異なっている.

実証例8.4 PIAACを用いた女性の労働時間決定の分析

トビットモデルはAER::tobit()を用いて推定できる. 被説明変数の下限・上限はそれぞれleft, rightパラメータで指定する. 擬似\(R^2\)は, AER::tobit()で推定したモデルにはpscl::pR2()が使えないため, 手動で計算する (参考).

# 女性に限定
piaac_female <- piaac %>%
    filter(gender == "Female")
model_84_OLS <- lm(hours ~ educ + age + couple + child, data = piaac_female, y = TRUE)
model_84_tobit <- tobit(hours ~ educ + age + couple + child, data = piaac_female, left = 0)
models_84 <- list("(1)" = model_84_OLS,
                  "(2)" = model_84_tobit)
rows <- tribble(~term, ~`(1)`, ~`(2)`,
                "推定方法", "OLS", "トービット")
attr(rows, 'position') <- 1
cm <- c("educ" = "教育年数",
        "age" = "年齢",
        "couple" = "配偶者あり",
        "child" = "子供数",
        "(Intercept)" = "定数項")
glance_custom.tobit <- function(x) {
    out <- tibble("r.squared" = 1 - x$loglik[2]/x$loglik[1],
                  "nobs_zero" = sum(as.character(x$y) == "  0-"))
    return(out)
}
glance_custom.lm <- function(x) {
    out <- tibble("nobs_zero" = sum(x$y == 0))
    return(out)
}
gm <- tribble(~raw,               ~clean,           ~fmt,
              "r.squared", "$R^2$/疑似$R^2$",      3,
              "nobs",             "$N$", 0,
              "nobs_zero",         "うち0時間", 0)
modelsummary(models_84,
             coef_map = cm,
             gof_map = gm,
             add_rows = rows,
             stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001),
             )
(1) (2)
* p < 0.05, ** p < 0.01, *** p < 0.001
推定方法 OLS トービット
教育年数 0.853*** 1.291***
(0.232) (0.352)
年齢 0.046 0.094
(0.043) (0.067)
配偶者あり -6.808*** -9.432***
(1.343) (1.995)
子供数 0.673 0.996
(0.443) (0.669)
定数項 11.987** -0.269
(4.153) (6.317)
$R^2$/疑似$R^2$ 0.022 0.003
$N$ 1736 1736
うち0時間 608 608

実証例8.5 PIAACを用いた女性賃金の決定要因の分析

先にデータを加工しておく. filter())で女性のみに限定し, drop_na()で指定した変数で欠損がある行を削除する. さらに, mutate()を用いて変数を作成する. ヘキットモデルの推定にはsampleSelection::heckit()を用いる. method引数で推定方法を指定でき, 2段階ヘキット法は"2step", 最尤法は"ml"とすればよい.

piaac_female <- piaac %>%
    filter(gender == "Female") %>%
    drop_na(educ, age, couple, child) %>%
    mutate(lwage = log(wage)) %>%
    mutate(exp = age - educ - 6) %>%
    mutate(exp2 = exp^2/100) %>%
    mutate(selected = !is.na(wage))
model_85_OLS <- lm(lwage ~ educ + exp + exp2, data = piaac_female)
model_85_OLS <- modelsummary(model_85_OLS, output = "modelsummary_list")
model_85_OLS$tidy$component = "" # modelsummaryのshapeで変形するために列を作成
model_85_2step <- heckit(selected ~ educ + exp + exp2 + couple + child, lwage ~ educ + exp + exp2, data = piaac_female, method = "2step")
model_85_2step <- modelsummary(model_85_2step, output = "modelsummary_list")
model_85_2step$tidy <- model_85_2step$tidy %>%
    mutate(component = ifelse(component == "selection", "セレクション式", "賃金式")) # componentを日本語に直し, auxiliarlyは賃金式に統合
model_85_ml <- heckit(selected ~ educ + exp + exp2 + couple + child, lwage ~ educ + exp + exp2, data = piaac_female, method = "ml")
model_85_ml <- modelsummary(model_85_ml, output = "modelsummary_list")
model_85_ml$tidy <- model_85_ml$tidy %>%
    mutate(component = ifelse(component == "selection", "セレクション式", "賃金式")) # componentを日本語に直し, auxiliarlyは賃金式に統合
models_85 <- list("OLS" = model_85_OLS,
                  "2段階ヘキット" = model_85_2step,
                  "最尤法ヘキット" = model_85_ml)
cm <- c("educ" = "教育年数",
        "exp" = "経験年数",
        "exp2" = "経験年数2乗/100",
        "couple" = "配偶者あり",
        "child" = "子供数",
        "(Intercept)" = "定数項",
        "invMillsRatio" = "逆ミルズ比",
        "rho" = "誤差項の相関")
gm <- tribble(~raw,               ~clean,           ~fmt,
              "adj.r.squared", "$\\bar{R}^2$/疑似$R^2$",      2,
              "nobs",             "$N$", 0)
modelsummary(models_85,
             coef_map = cm,
             gof_map = gm,
             stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001),
             shape = term ~ component)
2段階ヘキット 最尤法ヘキット
OLS セレクション式 賃金式 セレクション式 賃金式
* p < 0.05, ** p < 0.01, *** p < 0.001
教育年数 0.086*** 0.003 0.086*** 0.003 0.086***
(0.013) (0.017) (0.013) (0.017) (0.013)
経験年数 0.007 0.133*** 0.009 0.133*** 0.010
(0.012) (0.014) (0.034) (0.014) (0.018)
経験年数2乗/100 -0.013 -0.239*** -0.017 -0.239*** -0.018
(0.021) (0.025) (0.061) (0.025) (0.031)
配偶者あり -0.288** -0.288**
(0.094) (0.094)
子供数 0.001 0.002
(0.031) (0.031)
定数項 5.831*** -1.173*** 5.786*** -1.175*** 5.768***
(0.237) (0.299) (0.673) (0.299) (0.347)
逆ミルズ比 0.028
(0.390)
誤差項の相関 0.039 0.055
(0.220)
$\bar{R}^2$/疑似$R^2$ 0.05 0.05
$N$ 984 1747 1747

練習問題 8-4 [実証]

  1. 表8-1を参照せよ.
  2. 実証例8.5を参照せよ.