Evaluate model performance from every angle with the yardstick package. From the core metrics for regression and classification to custom metrics, this chapter walks through practical evaluation techniques end to end.
In this chapter we use yardstick to learn comprehensive evaluation methods and build a practical evaluation workflow, getting to know its main functions along the way.
yardstick is a core Tidymodels package that provides a unified API for evaluating the performance of machine learning models.
# Basic setup
library(tidymodels)
library(tidyverse)
library(yardstick)
# Example datasets (two_class_example and hpc_cv ship with yardstick and
# already contain observed classes, predicted classes, and class probabilities)
data("ames", package = "modeldata")
data("two_class_example", package = "yardstick")
data("hpc_cv", package = "yardstick")
# Regression data
ames_sample <- ames |> 
  select(Sale_Price, Lot_Area, Year_Built, Overall_Qual) |>
  mutate(log_price = log10(Sale_Price)) |>
  slice_sample(n = 1000)
# Simulate prediction results
set.seed(123)
ames_predictions <- ames_sample |>
  mutate(
    .pred = log_price + rnorm(n(), 0, 0.1),
    residuals = log_price - .pred
  )
# Classification data
classification_sample <- two_class_example |>
  rename(
    estimate = predicted,
    .pred_Class1 = Class1,
    .pred_Class2 = Class2
  )
Let's first look at the standard evaluation metrics for regression problems.
# Compute the main regression metrics
regression_metrics <- ames_predictions |>
  metrics(truth = log_price, estimate = .pred)
regression_metrics
# Individual metrics
rmse_value <- ames_predictions |> rmse(truth = log_price, estimate = .pred)
mae_value <- ames_predictions |> mae(truth = log_price, estimate = .pred)
rsq_value <- ames_predictions |> rsq(truth = log_price, estimate = .pred)
mape_value <- ames_predictions |> mape(truth = log_price, estimate = .pred)
# Summarize the results
tibble(
  Metric = c("RMSE", "MAE", "R²", "MAPE"),
  Value = c(rmse_value$.estimate, mae_value$.estimate, 
            rsq_value$.estimate, mape_value$.estimate)
)
You can also build metric sets tailored to a specific use case.
# Create a custom metric set
custom_regression_metrics <- metric_set(
  rmse,
  mae,
  rsq,
  mape,
  ccc,        # concordance correlation coefficient
  rpiq,       # ratio of performance to interquartile range
  huber_loss  # Huber loss
)
# Evaluate with the custom metric set
custom_results <- ames_predictions |>
  custom_regression_metrics(truth = log_price, estimate = .pred)
custom_results
# Evaluation by construction era
ames_grouped <- ames_predictions |>
  mutate(
    era = case_when(
      Year_Built < 1950 ~ "Pre-1950",
      Year_Built < 1980 ~ "1950-1979",
      Year_Built < 2000 ~ "1980-1999",
      TRUE ~ "2000+"
    )
  )
# Group-wise evaluation
grouped_metrics <- ames_grouped |>
  group_by(era) |>
  metrics(truth = log_price, estimate = .pred) |>
  ungroup()
# Visualization
grouped_metrics |>
  filter(.metric == "rmse") |>
  ggplot(aes(x = era, y = .estimate)) +
  geom_col(fill = "lightblue") +
  geom_text(aes(label = round(.estimate, 3)), vjust = -0.3) +
  labs(title = "RMSE by Construction Era",
       x = "Era", y = "RMSE") +
  theme_minimal()
            
# Detailed residual analysis
residual_analysis <- ames_predictions |>
  mutate(
    abs_residuals = abs(residuals),
    squared_residuals = residuals^2,
    standardized_residuals = residuals / sd(residuals)
  )
# Residual plots
p1 <- residual_analysis |>
  ggplot(aes(x = .pred, y = residuals)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(se = FALSE) +
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals")
p2 <- residual_analysis |>
  ggplot(aes(sample = standardized_residuals)) +
  stat_qq() + stat_qq_line() +
  labs(title = "Q-Q Plot of Standardized Residuals")
library(patchwork)
p1 + p2
Next, let's learn the main evaluation metrics for binary classification.
# Basic classification metrics
binary_metrics <- classification_sample |>
  metrics(truth = truth, estimate = estimate, .pred_Class1)
binary_metrics
# Confusion matrix
conf_matrix <- classification_sample |>
  conf_mat(truth = truth, estimate = estimate)
conf_matrix
# Visualize the confusion matrix
autoplot(conf_matrix, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "lightblue") +
  theme_minimal()
# Detailed classification metrics
detailed_metrics <- classification_sample |>
  summarise(
    accuracy = accuracy_vec(truth, estimate),
    precision = precision_vec(truth, estimate),
    recall = recall_vec(truth, estimate),
    f1_score = f_meas_vec(truth, estimate),
    specificity = specificity_vec(truth, estimate),
    sensitivity = sensitivity_vec(truth, estimate)
  )
detailed_metrics
# Interpreting the metrics
interpretation <- tribble(
  ~Metric, ~Formula, ~Interpretation,
  "Accuracy", "(TP + TN) / (TP + TN + FP + FN)", "全体的な正解率",
  "Precision", "TP / (TP + FP)", "陽性予測の正確さ",
  "Recall (Sensitivity)", "TP / (TP + FN)", "実際の陽性の検出率",
  "Specificity", "TN / (TN + FP)", "実際の陰性の検出率",
  "F1 Score", "2 * (Precision * Recall) / (Precision + Recall)", "精度と再現率の調和平均"
)
interpretation
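As a quick sanity check, the table's formulas can be recomputed by hand from the confusion matrix built above (a minimal sketch; note that conf_mat() stores predictions in rows and truth in columns):
# Recompute precision and recall manually from conf_matrix
cm <- conf_matrix$table
tp <- cm["Class1", "Class1"]  # predicted Class1, truly Class1
fp <- cm["Class1", "Class2"]  # predicted Class1, truly Class2
fn <- cm["Class2", "Class1"]  # predicted Class2, truly Class1
c(precision = tp / (tp + fp), recall = tp / (tp + fn))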
            
# Build the ROC curve
roc_curve_data <- classification_sample |>
  roc_curve(truth = truth, .pred_Class1)
# Plot the ROC curve
roc_plot <- roc_curve_data |>
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(linewidth = 1.2, color = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  coord_equal() +
  labs(title = "ROC Curve",
       x = "False Positive Rate (1 - Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal()
roc_plot
# Compute the AUC
auc_value <- classification_sample |>
  roc_auc(truth = truth, .pred_Class1)
auc_value
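By default, yardstick treats the first factor level as the event of interest. If your positive class is the second level, pass event_level = "second"; for example, the following yields the same AUC using the Class2 probabilities:
# Same AUC, with the second factor level treated as the event
classification_sample |>
  roc_auc(truth = truth, .pred_Class2, event_level = "second")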
            
# Build the precision-recall curve
pr_curve_data <- classification_sample |>
  pr_curve(truth = truth, .pred_Class1)
# Plot the PR curve
pr_plot <- pr_curve_data |>
  ggplot(aes(x = recall, y = precision)) +
  geom_line(linewidth = 1.2, color = "green") +
  labs(title = "Precision-Recall Curve",
       x = "Recall", y = "Precision") +
  theme_minimal()
pr_plot
# Compute the PR-AUC
pr_auc_value <- classification_sample |>
  pr_auc(truth = truth, .pred_Class1)
pr_auc_value
# Compare ROC and PR side by side
library(patchwork)
(roc_plot + labs(subtitle = paste("AUC =", round(auc_value$.estimate, 3)))) +
(pr_plot + labs(subtitle = paste("PR-AUC =", round(pr_auc_value$.estimate, 3))))
When to use ROC vs. PR curves: the ROC curve is a good default when classes are reasonably balanced, while the precision-recall curve is more informative when the positive class is rare, because it ignores the (large) pool of true negatives.
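To see the difference, here is a small simulated sketch (keeping only about 10% of the Class1 cases is an arbitrary choice for illustration) that makes the positive class rare and recomputes both areas; ROC-AUC barely moves while PR-AUC typically drops:
# Down-sample the positive class (Class1) to create artificial imbalance
set.seed(42)
imbalanced <- classification_sample |>
  filter(truth == "Class2" | runif(n()) < 0.1)
bind_rows(
  imbalanced |> roc_auc(truth = truth, .pred_Class1),
  imbalanced |> pr_auc(truth = truth, .pred_Class1)
)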
Next, we learn how to evaluate classification problems with three or more classes.
# Multiclass data: obs = observed class, pred = predicted class,
# VF:L = per-class probability columns
multiclass_sample <- hpc_cv |>
  select(obs, pred, VF:L) |>
  slice_sample(n = 500)
# Basic multiclass metrics
multiclass_metrics <- multiclass_sample |>
  metrics(truth = obs, estimate = pred, VF:L)
multiclass_metrics
# Overall agreement metrics (single values computed across all classes)
overall_metrics <- multiclass_sample |>
  summarise(
    accuracy = accuracy_vec(obs, pred),
    kappa = kap_vec(obs, pred),
    mcc = mcc_vec(obs, pred)  # Matthews correlation coefficient
  )
overall_metrics
Macro averaging computes a metric separately for each class and then averages the per-class values, while micro averaging pools the counts across classes before computing the metric; yardstick exposes both through the estimator argument.
# Compare macro and micro averaging
macro_micro_comparison <- tribble(
  ~averaging, ~precision, ~recall, ~f1,
  "macro",
  precision_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  recall_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  f_meas_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  "micro",
  precision_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro"),
  recall_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro"),
  f_meas_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro")
)
macro_micro_comparison
# Multiclass confusion matrix
multiclass_conf <- multiclass_sample |>
  conf_mat(truth = obs, estimate = pred)
multiclass_conf
# Visualize the confusion matrix
autoplot(multiclass_conf, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "darkblue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Per-class detection rate (recall): of the cases that truly belong to each
# class, how many were predicted correctly?
class_by_class_metrics <- multiclass_sample |>
  group_by(obs) |>
  summarise(
    count = n(),
    correct = sum(obs == pred),
    recall = correct / count
  ) |>
  arrange(desc(recall))
class_by_class_metrics
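yardstick can also derive a whole battery of metrics from a confusion matrix object in one call via its summary() method (macro-averaged by default for multiclass problems):
# All standard metrics derived from the confusion matrix at once
summary(multiclass_conf)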
For classification, we now look for the optimal threshold at which predicted probabilities are converted into binary class labels.
# Compute metrics at a range of thresholds
threshold_analysis <- tibble(
  threshold = seq(0.1, 0.9, by = 0.05)
) |>
  mutate(
    metrics = map(threshold, ~ {
      classification_sample |>
        mutate(
          pred_class = if_else(.pred_Class1 >= .x, "Class1", "Class2"),
          pred_class = factor(pred_class, levels = levels(truth))
        ) |>
        summarise(
          accuracy = accuracy_vec(truth, pred_class),
          precision = precision_vec(truth, pred_class),
          recall = recall_vec(truth, pred_class),
          f1 = f_meas_vec(truth, pred_class)
        )
    })
  ) |>
  unnest(metrics)
# Visualize metrics across thresholds
threshold_plot <- threshold_analysis |>
  pivot_longer(accuracy:f1, names_to = "metric", values_to = "value") |>
  ggplot(aes(x = threshold, y = value, color = metric)) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(title = "Metrics vs Threshold",
       x = "Threshold", y = "Metric Value",
       color = "Metric") +
  theme_minimal()
threshold_plot
# Identify the best threshold for each metric
optimal_thresholds <- threshold_analysis |>
  summarise(
    best_accuracy = threshold[which.max(accuracy)],
    best_f1 = threshold[which.max(f1)],
    best_precision = threshold[which.max(precision)],
    best_recall = threshold[which.max(recall)]
  )
optimal_thresholds
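If you prefer not to roll this scan by hand, the probably package (a Tidymodels companion to yardstick) provides threshold_perf() for exactly this job; a minimal sketch, assuming probably is installed:
library(probably)
# Sensitivity, specificity, J-index, etc. across a grid of thresholds
classification_sample |>
  threshold_perf(truth, .pred_Class1, thresholds = seq(0.1, 0.9, by = 0.05))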
            
# Optimal threshold via Youden's J statistic
# J = Sensitivity + Specificity - 1
youden_analysis <- tibble(
  threshold = seq(0.01, 0.99, by = 0.01)
) |>
  mutate(
    j_stat = map_dbl(threshold, ~ {
      pred_class <- if_else(classification_sample$.pred_Class1 >= .x, 
                           "Class1", "Class2")
      pred_class <- factor(pred_class, levels = levels(classification_sample$truth))
      
      sens <- sensitivity_vec(classification_sample$truth, pred_class)
      spec <- specificity_vec(classification_sample$truth, pred_class)
      
      sens + spec - 1
    })
  )
# Identify the optimal threshold
optimal_youden <- youden_analysis |>
  slice_max(j_stat, n = 1)
# Visualize the J statistic (annotate() places the label once, rather than
# overplotting it for every row as geom_text() with fixed coordinates would)
youden_plot <- youden_analysis |>
  ggplot(aes(x = threshold, y = j_stat)) +
  geom_line(linewidth = 1, color = "blue") +
  geom_vline(xintercept = optimal_youden$threshold, 
             color = "red", linetype = "dashed") +
  annotate("text",
           x = optimal_youden$threshold + 0.1,
           y = optimal_youden$j_stat,
           label = paste("Optimal =", round(optimal_youden$threshold, 3))) +
  labs(title = "Youden's J Statistic vs Threshold",
       x = "Threshold", y = "J Statistic") +
  theme_minimal()
youden_plot
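Since roc_curve() already returns sensitivity and specificity at every candidate threshold, the same optimum can also be read directly off its output, without the manual loop:
# Youden's J derived from the ROC curve data itself
classification_sample |>
  roc_curve(truth = truth, .pred_Class1) |>
  mutate(j_stat = sensitivity + specificity - 1) |>
  slice_max(j_stat, n = 1)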
You can also create your own evaluation metrics to match business requirements.
# Example custom metric: class-weighted accuracy
weighted_accuracy <- function(data, truth, estimate, class_weights = NULL, ...) {
  # Default weights (all 1)
  if (is.null(class_weights)) {
    class_weights <- rep(1, length(levels(data[[truth]])))
    names(class_weights) <- levels(data[[truth]])
  }
  
  # Build the confusion matrix (rows = truth, columns = estimate)
  cm <- table(data[[truth]], data[[estimate]])
  
  # Weight both the correct counts and the class totals, so the
  # estimate stays in [0, 1]
  weighted_correct <- sum(diag(cm) * class_weights[rownames(cm)])
  weighted_total <- sum(rowSums(cm) * class_weights[rownames(cm)])
  
  # Return the result in yardstick's tibble format
  tibble(
    .metric = "weighted_accuracy",
    .estimator = "multiclass",
    .estimate = weighted_correct / weighted_total
  )
}
# Using the custom metric
custom_weights <- c("Class1" = 2, "Class2" = 1)  # weight Class1 more heavily
weighted_result <- weighted_accuracy(
  classification_sample, 
  "truth", 
  "estimate", 
  class_weights = custom_weights
)
weighted_result
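To make a custom metric a first-class citizen that works inside metric_set() and grouped pipelines, yardstick also supports formally registered metrics. The sketch below follows the pattern from yardstick's custom-metrics vignette (API of yardstick >= 1.0; balanced_error and its helpers are illustrative names, defined here simply as 1 minus balanced accuracy):
balanced_error_vec <- function(truth, estimate, na_rm = TRUE, ...) {
  1 - bal_accuracy_vec(truth, estimate, na_rm = na_rm)
}
balanced_error <- function(data, ...) UseMethod("balanced_error")
balanced_error <- new_class_metric(balanced_error, direction = "minimize")
balanced_error.data.frame <- function(data, truth, estimate, na_rm = TRUE, ...) {
  class_metric_summarizer(
    name = "balanced_error",
    fn = balanced_error_vec,
    data = data,
    truth = !!enquo(truth),
    estimate = !!enquo(estimate),
    na_rm = na_rm
  )
}
# Usable on its own or inside metric_set()
classification_sample |> balanced_error(truth = truth, estimate = estimate)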
            
# Cost-sensitive evaluation metric
cost_sensitive_metric <- function(data, truth, estimate, 
                                  fn_cost = 1, fp_cost = 1, ...) {
  # Confusion matrix as a base table (rows = truth, columns = estimate);
  # the first factor level is treated as the positive class
  cm <- table(data[[truth]], data[[estimate]])
  
  tp <- cm[1, 1]  # true positive
  tn <- cm[2, 2]  # true negative  
  fp <- cm[2, 1]  # false positive (truth negative, predicted positive)
  fn <- cm[1, 2]  # false negative (truth positive, predicted negative)
  
  # Total misclassification cost
  total_cost <- fn * fn_cost + fp * fp_cost
  total_cases <- sum(cm)
  
  # Average cost per case
  cost_per_case <- total_cost / total_cases
  
  tibble(
    .metric = "cost_per_case",
    .estimator = "binary",
    .estimate = cost_per_case
  )
}
# Example: a false negative costs five times as much as a false positive
cost_result <- cost_sensitive_metric(
  classification_sample,
  "truth",
  "estimate", 
  fn_cost = 5,
  fp_cost = 1
)
cost_result
        
# Comprehensive evaluation function
comprehensive_evaluation <- function(data, truth_col, pred_col, pred_prob_col) {
  
  # Basic statistics
  basic_stats <- data |>
    summarise(
      n_samples = n(),
      n_classes = n_distinct(!!sym(truth_col)),
      class_balance = list(table(!!sym(truth_col)))
    )
  
  # Core metrics
  basic_metrics <- data |>
    metrics(truth = !!sym(truth_col), 
            estimate = !!sym(pred_col),
            !!sym(pred_prob_col))
  
  # Confusion matrix
  conf_matrix <- data |>
    conf_mat(truth = !!sym(truth_col), estimate = !!sym(pred_col))
  
  # ROC & PR曲線データ
  roc_data <- data |>
    roc_curve(truth = !!sym(truth_col), !!sym(pred_prob_col))
  
  pr_data <- data |>
    pr_curve(truth = !!sym(truth_col), !!sym(pred_prob_col))
  
  # Return everything as a list
  list(
    basic_stats = basic_stats,
    metrics = basic_metrics,
    confusion_matrix = conf_matrix,
    roc_curve = roc_data,
    pr_curve = pr_data
  )
}
# Usage example
evaluation_report <- comprehensive_evaluation(
  classification_sample,
  "truth",
  "estimate", 
  ".pred_Class1"
)
# Display the report
evaluation_report$basic_stats
evaluation_report$metrics
            
# Visualization helper
create_evaluation_plots <- function(eval_report) {
  # ROC curve
  roc_plot <- eval_report$roc_curve |>
    ggplot(aes(x = 1 - specificity, y = sensitivity)) +
    geom_line(linewidth = 1.2, color = "blue") +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    coord_equal() +
    labs(title = "ROC Curve", x = "FPR", y = "TPR") +
    theme_minimal()
  
  # PR curve
  pr_plot <- eval_report$pr_curve |>
    ggplot(aes(x = recall, y = precision)) +
    geom_line(linewidth = 1.2, color = "green") +
    labs(title = "Precision-Recall Curve") +
    theme_minimal()
  
  # Confusion matrix
  conf_plot <- autoplot(eval_report$confusion_matrix, type = "heatmap") +
    scale_fill_gradient(low = "white", high = "lightblue") +
    labs(title = "Confusion Matrix") +
    theme_minimal()
  
  # Bar chart of the headline metrics (for binary problems with class
  # probabilities, metrics() returns accuracy, kap, mn_log_loss and roc_auc,
  # so filter on the ones bounded by [0, 1])
  metrics_plot <- eval_report$metrics |>
    filter(.metric %in% c("accuracy", "kap", "roc_auc")) |>
    ggplot(aes(x = .metric, y = .estimate)) +
    geom_col(fill = "lightcoral") +
    geom_text(aes(label = round(.estimate, 3)), vjust = -0.3) +
    ylim(0, 1) +
    labs(title = "Key Metrics", x = "Metric", y = "Value") +
    theme_minimal()
  
  # Combine the four plots
  library(patchwork)
  (roc_plot + pr_plot) / (conf_plot + metrics_plot)
}
# Run the visualization
evaluation_plots <- create_evaluation_plots(evaluation_report)
evaluation_plots
Choose the evaluation strategy that best fits your problem type and the character of your data.
In this chapter we mastered comprehensive model evaluation with the yardstick package, from the standard regression and classification metrics through threshold tuning to custom, business-driven metrics for practical machine learning projects.
🚀 You are now a model evaluation expert!