# Synthetic loan-application data ----
#
# Simulates customers with demographic and credit attributes, draws a binary
# `loan_approved` outcome from a logistic model on those attributes, and only
# then blanks out a share of `income` and `credit_score` values.
#
# Two things matter about the ordering:
#   * `tibble()` rejects duplicated column names, so the missing-value
#     injection cannot be expressed as a second `income = ` / `credit_score = `
#     argument of the same `tibble()` call.
#   * Injecting NAs before the outcome is simulated would make `approval_prob`
#     NA for those rows, and `rbinom()` would then return NA outcomes.
set.seed(456)
n_customers <- 2000L

complex_data <- tibble(
  customer_id = seq_len(n_customers),
  age = sample(18:80, n_customers, replace = TRUE),
  # Log-normal income centred (on the log scale) at log(50000).
  income = exp(rnorm(n_customers, log(50000), 0.5)),
  education = sample(
    c("高校", "大学", "大学院"),
    n_customers,
    replace = TRUE,
    prob = c(0.4, 0.5, 0.1)
  ),
  job_category = sample(
    c("営業", "技術", "管理", "サービス", "その他"),
    n_customers,
    replace = TRUE
  ),
  months_employed = sample(1:240, n_customers, replace = TRUE),
  credit_score = round(rnorm(n_customers, 650, 100)),
  previous_loans = rpois(n_customers, 2)
) %>%
  mutate(
    # Linear predictor on the logit scale, mapped to a probability by plogis().
    approval_prob = plogis(
      -3 +
        (age - 30) * 0.02 +
        log(income + 1) * 0.3 +
        (credit_score - 600) * 0.01 +
        sqrt(months_employed) * 0.1 -
        previous_loans * 0.2 +
        case_when(
          education == "大学院" ~ 0.5,
          education == "大学" ~ 0.2,
          TRUE ~ 0
        )
    ),
    loan_approved = rbinom(n(), 1, approval_prob)
  ) %>%
  mutate(
    # Missingness is injected after the outcome exists: each income value is
    # blanked with probability 0.05, each credit score with probability 0.03.
    income = replace(income, runif(n()) < 0.05, NA_real_),
    credit_score = replace(credit_score, runif(n()) < 0.03, NA_real_)
  ) %>%
  # Drop the helper probability and the row identifier so that neither ends up
  # as a predictor in a `loan_approved ~ .` formula.
  select(-approval_prob, -customer_id)

print("データの概要:")
print(head(complex_data))
# Preprocessing recipe ----
#
# Builds an (untrained) recipe predicting `loan_approved` from every other
# column of `complex_data`. Step order is significant: each step sees the
# columns as left by the steps before it.
advanced_recipe <- recipe(loan_approved ~ ., data = complex_data) %>%
  # Fill gaps first so that every later step works on complete columns.
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_YeoJohnson(income) %>%
  step_mutate(
    # NOTE(review): `income` has already been Yeo-Johnson transformed at this
    # point, and dividing by 12 looks like a monthly rather than yearly
    # figure -- confirm the intended definition of this feature.
    income_per_year = income / 12,
    employment_ratio = months_employed / age,
    # Open-ended outer breaks keep scores outside 0-1000 from becoming NA;
    # binning inside that range is unchanged.
    credit_score_category = cut(
      credit_score,
      breaks = c(-Inf, 600, 700, 800, Inf),
      labels = c("Poor", "Fair", "Good", "Excellent")
    )
  ) %>%
  # `age` is binned only after `employment_ratio` has used its numeric value.
  step_discretize(age, num_breaks = 4) %>%
  step_other(job_category, threshold = 0.05) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_poly(income_per_year, degree = 2) %>%
  # Remove constant columns before scaling: normalising a zero-variance
  # column would divide by a standard deviation of 0.
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  # The correlation filter needs an explicit selector; without one it has no
  # columns to act on.
  step_corr(all_numeric_predictors(), threshold = 0.9)

print("高度な前処理レシピ:")
print(advanced_recipe)
# Train and apply the recipe ----
#
# Estimates every step of `advanced_recipe` on `complex_data`, applies the
# trained recipe to the same data, and reports how the number of feature
# columns and the per-column missing-value counts changed.
prepped_recipe <- prep(advanced_recipe, training = complex_data)
processed_data <- bake(prepped_recipe, new_data = complex_data)

# One column is subtracted from each count to exclude the outcome.
print(paste("元の特徴量数:", ncol(complex_data) - 1))
print(paste("処理後の特徴量数:", ncol(processed_data) - 1))

# Per-column NA counts, keeping only the columns that have at least one NA.
# `across()` / `where()` replace the superseded `summarise_all()` /
# `select_if()` and yield the same one-row summary.
missing_before <- complex_data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  select(where(~ any(.x > 0)))

missing_after <- processed_data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  select(where(~ any(.x > 0)))

print("前処理前の欠損値:")
print(missing_before)
print("前処理後の欠損値:")
print(missing_after)