ggplot2の真価は、実際のデータを使って美しく洞察に富んだグラフを作成することにあります。ここでは、よく使われるグラフパターンと、プロフェッショナルな仕上げのテクニックを学びましょう。
時系列データの可視化
時系列データは、ビジネスや研究において最も重要なデータタイプの一つです。トレンド、季節性、異常値を効果的に可視化する方法を見てみましょう。なお、以下の例では乱数でサンプルデータを生成するため、実行のたびに値が変わります。結果を再現したい場合は、事前に set.seed() で乱数シードを固定してください。
library(tidyverse)
library(lubridate)
# Simulated monthly sales for three products (Jan 2020 - Dec 2023), reshaped
# to long format: one row per date/product pair with the value in `sales`.
sales_ts <- tibble(
  date = seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")
) %>%
  mutate(
    # Each series is a random walk (cumulative sum of normal increments) on
    # top of a fixed baseline. Draw order a, b, c is kept so the random
    # stream is consumed exactly as before.
    product_a = 1000 + cumsum(rnorm(n(), mean = 50, sd = 20)),
    product_b = 800 + cumsum(rnorm(n(), mean = 30, sd = 15)),
    product_c = 600 + cumsum(rnorm(n(), mean = 25, sd = 12))
  ) %>%
  # Every column except `date` is a product series.
  pivot_longer(cols = -date, names_to = "product", values_to = "sales")
# Line chart of sales over time, one coloured line (with point markers) per
# product. Reads `sales_ts` (columns: date, product, sales).
timeseries_plot <- ggplot(
  sales_ts,
  aes(x = date, y = sales, color = product)
) +
  # `linewidth` (not `size`) controls line thickness since ggplot2 3.4.0;
  # passing `size` to a line geom triggers a deprecation warning.
  geom_line(linewidth = 1.2, alpha = 0.8) +
  geom_point(size = 2, alpha = 0.6) +
  scale_x_date(
    date_breaks = "6 months",
    date_labels = "%Y年%m月",
    expand = expansion(mult = c(0.02, 0.02))
  ) +
  scale_y_continuous(
    labels = scales::comma_format(suffix = "万円"),
    # No padding below the data, 10% headroom above.
    expand = expansion(mult = c(0, 0.1))
  ) +
  scale_color_manual(
    values = c(
      "product_a" = "#00ffff",
      "product_b" = "#ff00ff",
      "product_c" = "#39ff14"
    ),
    # Labels are positional: they follow the alphabetical break order
    # product_a, product_b, product_c.
    labels = c("製品A", "製品B", "製品C")
  ) +
  labs(
    title = "製品別売上高の推移",
    subtitle = "2020年1月〜2023年12月",
    x = "年月",
    y = "売上高",
    color = "製品",
    caption = "データ:社内売上システム"
  ) +
  theme_minimal() +
  theme(
    # Rotate the date labels so the half-yearly ticks do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    plot.title = element_text(size = 16, face = "bold")
  )
分布の比較と統計可視化
データの分布を理解することは、統計分析の基本です。複数グループの分布を効果的に比較する方法を学びましょう。
# Simulated survey: 250 respondents in each of four age groups, with a
# normally distributed income per group and a random satisfaction level.
survey_data <- tibble(
  age_group = rep(c("20代", "30代", "40代", "50代"), each = 250)
) %>%
  mutate(
    # One block of 250 draws per age group, in the same order as the
    # `age_group` labels above (arguments are mean, then sd).
    income = c(
      rnorm(250, 350, 80),
      rnorm(250, 450, 100),
      rnorm(250, 550, 120),
      rnorm(250, 600, 150)
    ),
    # Satisfaction is sampled uniformly, independent of age and income.
    satisfaction = sample(c("低", "中", "高"), size = n(), replace = TRUE)
  )
# Income distribution per age group: violin (density shape) + box plot
# (quartiles and outliers) + a diamond marking the group mean.
# Reads `survey_data` (columns: age_group, income).
distribution_plot <- ggplot(survey_data, aes(x = age_group, y = income)) +
  geom_violin(
    aes(fill = age_group),
    alpha = 0.7,
    # Give every violin the same maximum width regardless of group size.
    scale = "width"
  ) +
  geom_boxplot(
    width = 0.2,
    alpha = 0.9,
    outlier.color = "red",
    outlier.size = 2
  ) +
  # The box plot is filled white by default, so a plain white point drawn on
  # top of it cannot be seen. Use a fillable diamond (shape 23) with a dark
  # outline so the mean marker stays visible.
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    fill = "white",
    color = "black",
    size = 3
  ) +
  scale_fill_manual(
    values = c("#00ffff", "#39ff14", "#ff6600", "#ff00ff")
  ) +
  scale_y_continuous(
    labels = scales::comma_format(suffix = "万円")
  ) +
  labs(
    title = "年代別年収分布の比較",
    subtitle = "バイオリンプロット + 箱ひげ図 + 平均値",
    x = "年代",
    y = "年収",
    fill = "年代"
  ) +
  theme_minimal() +
  # The x axis already names each group, so the fill legend is redundant.
  theme(legend.position = "none")
相関関係とパターンの発見
散布図は変数間の関係を理解するための最も強力なツールの一つです。回帰線、信頼区間、グループ分けを組み合わせた高度な分析を行いましょう。
# mtcars with the row names promoted to a `model` column, plus two derived
# categorical columns used for colour and shape mappings.
car_data <- mtcars %>%
  as_tibble(rownames = "model") %>%
  mutate(
    # `am` is coded 1 = manual; anything else is labelled automatic.
    transmission = if_else(am == 1, "マニュアル", "オートマ"),
    # Fuel-efficiency bands by mpg: below 20, 20 up to 25, 25 and above.
    efficiency_class = case_when(
      mpg < 20 ~ "低効率",
      mpg < 25 ~ "中効率",
      TRUE ~ "高効率"
    )
  )
# Scatter plot of weight vs. fuel economy on a dark canvas, layered with
# 2D density contours, a linear fit with confidence band, and labels for
# the extreme cars. Reads `car_data` (mtcars plus model, transmission,
# efficiency_class).
correlation_plot <- ggplot(car_data, aes(x = wt, y = mpg)) +
  stat_density_2d(alpha = 0.3, color = "gray70") +
  geom_smooth(
    method = "lm",
    # Stating the formula explicitly avoids the "using formula" message.
    formula = y ~ x,
    color = "#ff00ff",
    fill = "#ff00ff",
    alpha = 0.2
  ) +
  geom_point(
    aes(color = transmission, size = hp, shape = efficiency_class),
    alpha = 0.8
  ) +
  # Label only the most extreme cars to keep the plot readable.
  geom_text(
    data = car_data %>% filter(mpg > 30 | wt > 5),
    aes(label = model),
    nudge_y = 1,
    size = 3,
    color = "white"
  ) +
  scale_color_manual(
    values = c("マニュアル" = "#00ffff", "オートマ" = "#39ff14")
  ) +
  scale_size_continuous(range = c(3, 8)) +
  labs(
    title = "自動車の重量と燃費の関係",
    subtitle = "変速機タイプ、馬力、効率クラス別分析",
    x = "重量 (1000 lbs)",
    y = "燃費 (mpg)",
    color = "変速機",
    size = "馬力",
    shape = "効率クラス"
  ) +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "gray10"),
    # theme_dark() keeps dark text, which disappears on the black plot
    # background. Switch titles, axis labels and legend text to light
    # colours, and make the legend box match the background.
    text = element_text(color = "white"),
    axis.text = element_text(color = "gray80"),
    axis.ticks = element_line(color = "gray80"),
    legend.background = element_rect(fill = "black", color = NA),
    legend.position = "bottom"
  )
ファセットによる多次元分析
ファセットは、データの複数の側面を同時に可視化する強力な機能です。Small Multiples(同じ軸・同じ形式の小さなグラフを並べて比較する手法)の原理により、複雑なパターンを理解しやすくします。
# Simulated sales for every region x quarter x product-category combination.
business_data <- expand_grid(
  region = c("東京", "大阪", "名古屋", "福岡"),
  quarter = c("Q1", "Q2", "Q3", "Q4"),
  product_category = c("電子機器", "家具", "衣料品")
) %>%
  # Sales level depends on the region; the last branch covers 福岡.
  # Every branch draws a full-length vector and case_when() picks per row,
  # so the branch order also fixes the order of the random draws.
  mutate(
    sales = case_when(
      region == "東京" ~ rnorm(n(), mean = 1500, sd = 300),
      region == "大阪" ~ rnorm(n(), mean = 1200, sd = 250),
      region == "名古屋" ~ rnorm(n(), mean = 800, sd = 200),
      TRUE ~ rnorm(n(), mean = 600, sd = 150)
    )
  ) %>%
  # Profit margin depends on the category; the last branch covers 衣料品.
  mutate(
    profit_margin = case_when(
      product_category == "電子機器" ~ runif(n(), 0.15, 0.25),
      product_category == "家具" ~ runif(n(), 0.30, 0.45),
      TRUE ~ runif(n(), 0.50, 0.70)
    )
  )
# Dodged bar chart of quarterly sales per product category, one panel per
# region, with the value printed above each bar.
# Reads `business_data` (columns: region, quarter, product_category, sales).
facet_plot <- ggplot(business_data, aes(x = quarter, y = sales)) +
  geom_col(
    aes(fill = product_category),
    position = "dodge",
    alpha = 0.8
  ) +
  geom_text(
    aes(label = scales::comma(sales, accuracy = 1), group = product_category),
    # Width 0.9 matches the default dodge width of geom_col(), so each
    # label sits centred over its own bar.
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3,
    # Labels are drawn above the bars on theme_minimal()'s white background,
    # so white text would be invisible there; use a dark colour.
    color = "gray20"
  ) +
  # Independent y axes let low-volume regions use the full panel height.
  facet_wrap(~region, scales = "free_y", ncol = 2) +
  scale_fill_manual(
    values = c("電子機器" = "#00ffff", "家具" = "#ff6600", "衣料品" = "#39ff14")
  ) +
  scale_y_continuous(
    labels = scales::comma_format(suffix = "万円"),
    # Extra headroom so labels above the tallest bars are not clipped.
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "地域別・四半期別・商品カテゴリ別売上分析",
    subtitle = "複数次元での売上パフォーマンス比較",
    x = "四半期",
    y = "売上高",
    fill = "商品カテゴリ"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 12, face = "bold"),
    legend.position = "bottom"
  )