Evaluate model performance from every angle with the yardstick package. From the core regression and classification metrics through to custom metrics, this chapter covers practical evaluation techniques end to end.
In this chapter, we use the yardstick package to learn comprehensive model evaluation methods and build a practical evaluation workflow.
Understanding yardstick's main functions is the key to effective model evaluation.
yardstick is a core Tidymodels package that provides a unified API for evaluating the performance of machine learning models.
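Before the full setup below, here is a minimal sketch of what that unified API looks like, using a tiny made-up dataset: every metric function takes a data frame plus truth and estimate columns and returns a one-row tibble with .metric, .estimator, and .estimate columns.
library(yardstick)
library(tibble)
# A toy three-row dataset, purely to show the calling convention
toy <- tibble(truth = c(1.0, 2.0, 3.0), estimate = c(1.1, 1.9, 3.2))
rmse(toy, truth = truth, estimate = estimate)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard       0.141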
# Basic setup
library(tidymodels)
library(tidyverse)
library(yardstick)
# Prepare the sample data; two_class_example and hpc_cv ship with
# yardstick and already contain model predictions
data("ames", package = "modeldata")
data("two_class_example", package = "yardstick")
data("hpc_cv", package = "yardstick")
# Regression data
ames_sample <- ames |>
  select(Sale_Price, Lot_Area, Year_Built, Overall_Qual) |>
  mutate(log_price = log10(Sale_Price)) |>
  slice_sample(n = 1000)
# Simulate prediction results
set.seed(123)
ames_predictions <- ames_sample |>
  mutate(
    .pred = log_price + rnorm(n(), 0, 0.1),
    residuals = log_price - .pred
  )
# Prepare the classification data
classification_sample <- two_class_example |>
  rename(
    estimate = predicted,
    .pred_Class1 = Class1,
    .pred_Class2 = Class2
  )
Let's start with the standard evaluation metrics for regression problems.
# Compute the main regression metrics
regression_metrics <- ames_predictions |>
  metrics(truth = log_price, estimate = .pred)
regression_metrics
# Compute individual metrics
rmse_value <- ames_predictions |> rmse(truth = log_price, estimate = .pred)
mae_value <- ames_predictions |> mae(truth = log_price, estimate = .pred)
rsq_value <- ames_predictions |> rsq(truth = log_price, estimate = .pred)
mape_value <- ames_predictions |> mape(truth = log_price, estimate = .pred)
# Summarize the results
tibble(
  Metric = c("RMSE", "MAE", "R²", "MAPE"),
  Value = c(rmse_value$.estimate, mae_value$.estimate,
            rsq_value$.estimate, mape_value$.estimate)
)
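Since each of these yardstick calls already returns a tibble with the same three columns, an equivalent, lighter-weight summary is simply to bind them:
# Equivalent summary: stack the individual metric tibbles
bind_rows(rmse_value, mae_value, rsq_value, mape_value)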
You can also build metric sets tailored to a specific use case.
# Create a custom metric set
custom_regression_metrics <- metric_set(
  rmse, mae, rsq, mape,
  ccc,        # concordance correlation coefficient
  rpiq,       # ratio of performance to interquartile range
  huber_loss  # Huber loss
)
# Evaluate with the custom metric set
custom_results <- ames_predictions |>
  custom_regression_metrics(truth = log_price, estimate = .pred)
custom_results
# Evaluation by construction era
ames_grouped <- ames_predictions |>
mutate(
era = case_when(
Year_Built < 1950 ~ "Pre-1950",
Year_Built < 1980 ~ "1950-1979",
Year_Built < 2000 ~ "1980-1999",
TRUE ~ "2000+"
)
)
# Group-wise evaluation (yardstick metrics respect group_by())
grouped_metrics <- ames_grouped |>
group_by(era) |>
metrics(truth = log_price, estimate = .pred) |>
ungroup()
# Visualize the group-wise results
grouped_metrics |>
filter(.metric == "rmse") |>
ggplot(aes(x = era, y = .estimate)) +
geom_col(fill = "lightblue") +
geom_text(aes(label = round(.estimate, 3)), vjust = -0.3) +
labs(title = "RMSE by Construction Era",
x = "Era", y = "RMSE") +
theme_minimal()
# Detailed residual analysis
residual_analysis <- ames_predictions |>
mutate(
abs_residuals = abs(residuals),
squared_residuals = residuals^2,
standardized_residuals = residuals / sd(residuals)
)
# Residual plots
p1 <- residual_analysis |>
ggplot(aes(x = .pred, y = residuals)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
geom_smooth(se = FALSE) +
labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals")
p2 <- residual_analysis |>
ggplot(aes(sample = standardized_residuals)) +
stat_qq() + stat_qq_line() +
labs(title = "Q-Q Plot of Standardized Residuals")
library(patchwork)
p1 + p2
Next, we turn to the key evaluation metrics for binary classification.
# Basic classification metrics
binary_metrics <- classification_sample |>
  metrics(truth = truth, estimate = estimate, .pred_Class1)
binary_metrics
# Confusion matrix
conf_matrix <- classification_sample |>
  conf_mat(truth = truth, estimate = estimate)
conf_matrix
# Visualize the confusion matrix
autoplot(conf_matrix, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "lightblue") +
  theme_minimal()
# Detailed classification metrics using the _vec variants
detailed_metrics <- classification_sample |>
summarise(
accuracy = accuracy_vec(truth, estimate),
precision = precision_vec(truth, estimate),
recall = recall_vec(truth, estimate),
f1_score = f_meas_vec(truth, estimate),
specificity = specificity_vec(truth, estimate),
sensitivity = sensitivity_vec(truth, estimate)
)
detailed_metrics
# Interpreting the metrics
interpretation <- tribble(
  ~Metric, ~Formula, ~Interpretation,
  "Accuracy", "(TP + TN) / (TP + TN + FP + FN)", "overall share of correct predictions",
  "Precision", "TP / (TP + FP)", "how accurate the positive predictions are",
  "Recall (Sensitivity)", "TP / (TP + FN)", "share of actual positives detected",
  "Specificity", "TN / (TN + FP)", "share of actual negatives detected",
  "F1 Score", "2 * (Precision * Recall) / (Precision + Recall)", "harmonic mean of precision and recall"
)
interpretation
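To confirm the formulas in the table against yardstick's output, here is a short sketch that pulls TP, FP, FN, and TN out of conf_matrix and recomputes precision and recall by hand. Note that conf_mat() stores predictions in rows and truth in columns, and treats the first factor level (Class1) as the positive class.
cm <- conf_matrix$table        # rows = Prediction, columns = Truth
tp <- cm["Class1", "Class1"]   # predicted positive, actually positive
fp <- cm["Class1", "Class2"]   # predicted positive, actually negative
fn <- cm["Class2", "Class1"]   # predicted negative, actually positive
tn <- cm["Class2", "Class2"]   # predicted negative, actually negative
# These should match precision_vec() and recall_vec() above
tp / (tp + fp)  # precision
tp / (tp + fn)  # recall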
# Build the ROC curve
roc_curve_data <- classification_sample |>
roc_curve(truth = truth, .pred_Class1)
# Plot the ROC curve
roc_plot <- roc_curve_data |>
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line(size = 1.2, color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
coord_equal() +
labs(title = "ROC Curve",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_minimal()
roc_plot
# Compute the AUC
auc_value <- classification_sample |>
roc_auc(truth = truth, .pred_Class1)
auc_value
# Build the precision-recall (PR) curve
pr_curve_data <- classification_sample |>
pr_curve(truth = truth, .pred_Class1)
# Plot the PR curve
pr_plot <- pr_curve_data |>
ggplot(aes(x = recall, y = precision)) +
geom_line(size = 1.2, color = "green") +
labs(title = "Precision-Recall Curve",
x = "Recall", y = "Precision") +
theme_minimal()
pr_plot
# Compute the PR-AUC
pr_auc_value <- classification_sample |>
pr_auc(truth = truth, .pred_Class1)
pr_auc_value
# Compare ROC and PR side by side
library(patchwork)
(roc_plot + labs(subtitle = paste("AUC =", round(auc_value$.estimate, 3)))) +
(pr_plot + labs(subtitle = paste("PR-AUC =", round(pr_auc_value$.estimate, 3))))
When to use ROC vs. PR curves: ROC curves are a good default when the classes are roughly balanced, whereas PR curves are more informative when the positive class is rare.
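The difference is easiest to see on imbalanced data. The sketch below artificially downsamples Class1 to create a rare positive class and compares the two AUCs; the exact numbers depend on the random sample, but PR-AUC typically drops much more sharply than ROC-AUC.
set.seed(456)
# Keep all of Class2 but only ~5% of Class1 to simulate imbalance
imbalanced_sample <- classification_sample |>
  filter(truth == "Class2" | runif(n()) < 0.05)
imbalanced_sample |> roc_auc(truth = truth, .pred_Class1)
imbalanced_sample |> pr_auc(truth = truth, .pred_Class1)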
Next, let's look at how to evaluate classification problems with three or more classes.
# Prepare the multiclass data
multiclass_sample <- hpc_cv |>
  select(obs, pred, VF:L) |>
  slice_sample(n = 500)
# Basic multiclass metrics
multiclass_metrics <- multiclass_sample |>
metrics(truth = obs, estimate = pred, VF:L)
multiclass_metrics
# Overall multiclass metrics (accuracy, Cohen's kappa, MCC)
class_metrics <- multiclass_sample |>
summarise(
accuracy = accuracy_vec(obs, pred),
kappa = kap_vec(obs, pred),
mcc = mcc_vec(obs, pred) # Matthews correlation coefficient
)
class_metrics
# Compare macro and micro averaging
macro_micro_comparison <- tribble(
  ~averaging, ~precision, ~recall, ~f1,
  "macro",
  precision_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  recall_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  f_meas_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "macro"),
  "micro",
  precision_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro"),
  recall_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro"),
  f_meas_vec(multiclass_sample$obs, multiclass_sample$pred, estimator = "micro")
)
macro_micro_comparison
# Multiclass confusion matrix
multiclass_conf <- multiclass_sample |>
conf_mat(truth = obs, estimate = pred)
multiclass_conf
# Visualize the confusion matrix
autoplot(multiclass_conf, type = "heatmap") +
scale_fill_gradient(low = "white", high = "darkblue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Per-class recall: the share of each true class predicted correctly
class_by_class_metrics <- multiclass_sample |>
  group_by(obs) |>
  summarise(
    count = n(),
    correct = sum(obs == pred),
    recall = correct / count
  ) |>
  arrange(desc(recall))
class_by_class_metrics
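Per-class precision can be derived the same way from the confusion matrix itself. A short sketch, again using the fact that conf_mat() stores predictions in rows and truth in columns:
cm_tab <- multiclass_conf$table  # rows = Prediction, columns = Truth
tibble(
  class = colnames(cm_tab),
  precision = diag(cm_tab) / rowSums(cm_tab),  # correct / predicted-as-class
  recall = diag(cm_tab) / colSums(cm_tab)      # correct / actually-in-class
)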
In classification problems, we often want to find the optimal threshold at which predicted probabilities are converted into class labels.
# Compute metrics at a range of thresholds
threshold_analysis <- tibble(
threshold = seq(0.1, 0.9, by = 0.05)
) |>
mutate(
metrics = map(threshold, ~ {
classification_sample |>
mutate(
pred_class = if_else(.pred_Class1 >= .x, "Class1", "Class2"),
pred_class = factor(pred_class, levels = levels(truth))
) |>
summarise(
accuracy = accuracy_vec(truth, pred_class),
precision = precision_vec(truth, pred_class),
recall = recall_vec(truth, pred_class),
f1 = f_meas_vec(truth, pred_class)
)
})
) |>
unnest(metrics)
# Visualize metrics across thresholds
threshold_plot <- threshold_analysis |>
pivot_longer(accuracy:f1, names_to = "metric", values_to = "value") |>
ggplot(aes(x = threshold, y = value, color = metric)) +
geom_line(size = 1) +
geom_point() +
labs(title = "Metrics vs Threshold",
x = "Threshold", y = "Metric Value",
color = "Metric") +
theme_minimal()
threshold_plot
# Identify the best threshold for each metric
optimal_thresholds <- threshold_analysis |>
summarise(
best_accuracy = threshold[which.max(accuracy)],
best_f1 = threshold[which.max(f1)],
best_precision = threshold[which.max(precision)],
best_recall = threshold[which.max(recall)]
)
optimal_thresholds
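For routine use, the probably package (assumed installed here) offers threshold_perf(), which automates this kind of threshold scan; by default it reports sensitivity, specificity, and the J-index that we compute by hand in the next step. One possible call, sketched under that assumption:
library(probably)  # assumed to be installed
classification_sample |>
  threshold_perf(
    truth = truth,
    estimate = .pred_Class1,
    thresholds = seq(0.1, 0.9, by = 0.05)
  )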
# Optimal threshold via Youden's J statistic
# J = Sensitivity + Specificity - 1
youden_analysis <- tibble(
threshold = seq(0.01, 0.99, by = 0.01)
) |>
mutate(
j_stat = map_dbl(threshold, ~ {
pred_class <- if_else(classification_sample$.pred_Class1 >= .x,
"Class1", "Class2")
pred_class <- factor(pred_class, levels = levels(classification_sample$truth))
sens <- sensitivity_vec(classification_sample$truth, pred_class)
spec <- specificity_vec(classification_sample$truth, pred_class)
sens + spec - 1
})
)
# Locate the threshold that maximizes J
optimal_youden <- youden_analysis |>
slice_max(j_stat, n = 1)
# Plot the J statistic
youden_plot <- youden_analysis |>
ggplot(aes(x = threshold, y = j_stat)) +
geom_line(size = 1, color = "blue") +
geom_vline(xintercept = optimal_youden$threshold,
color = "red", linetype = "dashed") +
geom_text(x = optimal_youden$threshold + 0.1,
y = optimal_youden$j_stat,
label = paste("Optimal =", round(optimal_youden$threshold, 3))) +
labs(title = "Youden's J Statistic vs Threshold",
x = "Threshold", y = "J Statistic") +
theme_minimal()
youden_plot
You can also create your own evaluation metrics to match business requirements.
# Example custom metric: class-weighted accuracy
weighted_accuracy <- function(data, truth, estimate, class_weights = NULL, ...) {
  # Default weights (all classes weighted equally)
  if (is.null(class_weights)) {
    class_weights <- rep(1, length(levels(data[[truth]])))
    names(class_weights) <- levels(data[[truth]])
  }
  # Build the confusion matrix (rows = truth, columns = estimate)
  cm <- table(data[[truth]], data[[estimate]])
  # Weight both the correct counts and the per-class totals
  weighted_correct <- sum(diag(cm) * class_weights[rownames(cm)])
  weighted_total <- sum(rowSums(cm) * class_weights[rownames(cm)])
  # Return the result in yardstick's tibble format
  tibble(
    .metric = "weighted_accuracy",
    .estimator = "multiclass",
    .estimate = weighted_correct / weighted_total
  )
}
# Using the custom metric
custom_weights <- c("Class1" = 2, "Class2" = 1)  # weight Class1 more heavily
weighted_result <- weighted_accuracy(
classification_sample,
"truth",
"estimate",
class_weights = custom_weights
)
weighted_result
# A cost-sensitive evaluation metric
cost_sensitive_metric <- function(data, truth, estimate,
                                  fn_cost = 1, fp_cost = 1, ...) {
  # Confusion matrix (rows = truth, columns = estimate);
  # the first factor level is treated as the positive class
  cm <- table(data[[truth]], data[[estimate]])
  tp <- cm[1, 1]  # true positives
  tn <- cm[2, 2]  # true negatives
  fp <- cm[2, 1]  # false positives
  fn <- cm[1, 2]  # false negatives
  # Total misclassification cost
  total_cost <- fn * fn_cost + fp * fp_cost
  total_cases <- sum(cm)
  # Average cost per case
  cost_per_case <- total_cost / total_cases
  tibble(
    .metric = "cost_per_case",
    .estimator = "binary",
    .estimate = cost_per_case
  )
}
# Example: a false negative costs five times as much as a false positive
cost_result <- cost_sensitive_metric(
classification_sample,
"truth",
"estimate",
fn_cost = 5,
fp_cost = 1
)
cost_result
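Because the metric returns a plain tibble, it can feed directly into decision making. A sketch reusing the objects defined above: scan thresholds for the operating point that minimizes expected cost under the 5:1 cost structure.
# Scan thresholds for the one that minimizes average cost
cost_by_threshold <- tibble(threshold = seq(0.1, 0.9, by = 0.05)) |>
  mutate(cost = map_dbl(threshold, ~ {
    recoded <- classification_sample |>
      mutate(estimate = factor(
        if_else(.pred_Class1 >= .x, "Class1", "Class2"),
        levels = levels(truth)
      ))
    cost_sensitive_metric(recoded, "truth", "estimate",
                          fn_cost = 5, fp_cost = 1)$.estimate
  }))
cost_by_threshold |> slice_min(cost, n = 1)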
# A comprehensive evaluation function
comprehensive_evaluation <- function(data, truth_col, pred_col, pred_prob_col) {
# Basic statistics
basic_stats <- data |>
summarise(
n_samples = n(),
n_classes = n_distinct(!!sym(truth_col)),
class_balance = list(table(!!sym(truth_col)))
)
# Core metrics
basic_metrics <- data |>
metrics(truth = !!sym(truth_col),
estimate = !!sym(pred_col),
!!sym(pred_prob_col))
# Confusion matrix
conf_matrix <- data |>
conf_mat(truth = !!sym(truth_col), estimate = !!sym(pred_col))
# ROC and PR curve data
roc_data <- data |>
roc_curve(truth = !!sym(truth_col), !!sym(pred_prob_col))
pr_data <- data |>
pr_curve(truth = !!sym(truth_col), !!sym(pred_prob_col))
# Return everything as a list
list(
basic_stats = basic_stats,
metrics = basic_metrics,
confusion_matrix = conf_matrix,
roc_curve = roc_data,
pr_curve = pr_data
)
}
# Usage example
evaluation_report <- comprehensive_evaluation(
classification_sample,
"truth",
"estimate",
".pred_Class1"
)
# Display the report
evaluation_report$basic_stats
evaluation_report$metrics
# Plotting helper
create_evaluation_plots <- function(eval_report) {
# ROC curve
roc_plot <- eval_report$roc_curve |>
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line(size = 1.2, color = "blue") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
coord_equal() +
labs(title = "ROC Curve", x = "FPR", y = "TPR") +
theme_minimal()
# PR curve
pr_plot <- eval_report$pr_curve |>
ggplot(aes(x = recall, y = precision)) +
geom_line(size = 1.2, color = "green") +
labs(title = "Precision-Recall Curve") +
theme_minimal()
# Confusion matrix heatmap
conf_plot <- autoplot(eval_report$confusion_matrix, type = "heatmap") +
scale_fill_gradient(low = "white", high = "lightblue") +
labs(title = "Confusion Matrix") +
theme_minimal()
# Bar chart of key metrics; note that metrics() returns accuracy, kap,
# mn_log_loss, and roc_auc for binary classification with a probability column
metrics_plot <- eval_report$metrics |>
filter(.metric %in% c("accuracy", "kap", "roc_auc")) |>
ggplot(aes(x = .metric, y = .estimate)) +
geom_col(fill = "lightcoral") +
geom_text(aes(label = round(.estimate, 3)), vjust = -0.3) +
ylim(0, 1) +
labs(title = "Key Metrics", x = "Metric", y = "Value") +
theme_minimal()
# Combine the four plots
library(patchwork)
(roc_plot + pr_plot) / (conf_plot + metrics_plot)
}
# Run the visualization
evaluation_plots <- create_evaluation_plots(evaluation_report)
evaluation_plots
Finally, choose the evaluation strategy that best fits the problem type and the characteristics of your data.
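As a starting point, here is a sketch of how such tailored strategies might look as metric sets; which metrics belong in each set is a judgment call for your project.
# Imbalanced binary classification: favor rank- and balance-aware metrics
imbalanced_metrics <- metric_set(roc_auc, pr_auc, bal_accuracy, j_index)
# Regression with outliers: favor robust error measures
robust_regression_metrics <- metric_set(mae, huber_loss, rsq)
classification_sample |>
  imbalanced_metrics(truth = truth, estimate = estimate, .pred_Class1)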
In this chapter you mastered comprehensive model evaluation with the yardstick package, building up the evaluation techniques needed for practical machine learning projects.
🚀 You are now a model evaluation expert!