第9章: tibble

モダンなデータフレーム操作

🏗️ 構造化データ 🔧 スマート表示 ⚡ 高速処理

🏗️ tibbleとdata.frameの違い

tibbleは、Tidyverseにおけるモダンなデータフレーム実装です。従来のdata.frameの問題点を解決し、より直感的で安全なデータ操作を可能にします。

基本的な違いの比較

data.frame

# Classic base-R data.frame
df <- data.frame(
  x = 1:1000,
  y = rnorm(1000)
)

# Printing a data.frame shows every row
print(df)

# Before R 4.0.0, strings were converted to factors automatically.
# Since R 4.0.0 the default is stringsAsFactors = FALSE, so the column
# below stays a character vector unless stringsAsFactors = TRUE is passed.
data.frame(name = "田中")

tibble

# Modern tibble holding the same two columns
tb <- tibble(
  x = seq_len(1000),
  y = rnorm(n = 1000)
)

# Printing a tibble shows only the first 10 rows
print(tb)

# Character input is kept as a character column
tibble(name = "田中")

tibbleの作成と基本操作

                    library(tibble)
                    library(dplyr)

                    # Three ways to create a tibble
                    # Method 1: tibble() builds the table column by column
                    employee_tibble <- tibble(
                      id = 1:5,
                      name = c("田中太郎", "佐藤花子", "鈴木一郎", "高橋美咲", "山田健太"),
                      age = c(28, 32, 45, 25, 39),
                      salary = c(450000, 520000, 780000, 380000, 650000),
                      department = c("営業", "開発", "マネジメント", "デザイン", "開発")
                    )

                    print(employee_tibble)

                    # Method 2: convert an existing data.frame
                    # (stringsAsFactors = FALSE is the default since R 4.0.0;
                    # it is kept explicit so older R versions behave the same)
                    df <- data.frame(
                      x = 1:3,
                      y = c("a", "b", "c"),
                      stringsAsFactors = FALSE
                    )
                    tb_converted <- as_tibble(df)
                    print(tb_converted)

                    # Method 3: tribble() lets you type the data row by row
                    small_data <- tribble(
                      ~name,     ~score, ~grade,
                      "Alice",   85,     "A",
                      "Bob",     72,     "B",
                      "Charlie", 91,     "A"
                    )
                    print(small_data)

                    # Basic information about the tibble
                    # class() returns several class names; collapse them first,
                    # otherwise paste() recycles the label and prints one string
                    # per class instead of a single line
                    print(paste("クラス:", paste(class(employee_tibble), collapse = " ")))
                    print(paste("行数:", nrow(employee_tibble)))
                    print(paste("列数:", ncol(employee_tibble)))
                    print(paste("列名:", paste(names(employee_tibble), collapse = ", ")))
                

tibble作成結果

# A tibble: 5 × 5 id name age salary department <int> <chr> <dbl> <dbl> <chr> 1 1 田中太郎 28 450000 営業 2 2 佐藤花子 32 520000 開発 3 3 鈴木一郎 45 780000 マネジメント 4 4 高橋美咲 25 380000 デザイン 5 5 山田健太 39 650000 開発 # A tibble: 3 × 2 x y <int> <chr> 1 1 a 2 2 b 3 3 c # A tibble: 3 × 3 name score grade <chr> <dbl> <chr> 1 Alice 85 A 2 Bob 72 B 3 Charlie 91 A [1] "クラス: tbl_df tbl data.frame" [1] "行数: 5" [1] "列数: 5" [1] "列名: id, name, age, salary, department"

🔧 tibbleの高度な機能

リスト列とネストしたデータ

リスト列の活用

                    # map_int() / map_dbl() come from purrr; attaching tibble and
                    # dplyr alone does not make them available
                    library(purrr)

                    # Tibble that contains list columns
                    nested_data <- tibble(
                      group = c("A", "B", "C"),
                      values = list(
                        c(1, 3, 5, 7),
                        c(2, 4, 6),
                        c(10, 20, 30, 40, 50)
                      ),
                      stats = list(
                        list(mean = 4, sd = 2.58),
                        list(mean = 4, sd = 2),
                        list(mean = 30, sd = 15.81)
                      )
                    )

                    print(nested_data)

                    # Extract one element of a list column with [[ ]]
                    print("グループAの値:")
                    print(nested_data$values[[1]])

                    # Apply a function to every element of the list column;
                    # the map_* suffix fixes the type of the resulting column
                    nested_with_summary <- nested_data %>%
                      mutate(
                        length = map_int(values, length),
                        sum_values = map_dbl(values, sum),
                        first_value = map_dbl(values, ~ .x[1]),
                        last_value = map_dbl(values, ~ .x[length(.x)])
                      )

                    print(nested_with_summary)

                    # Nested data frames: each row stores a whole tibble
                    sales_by_region <- tibble(
                      region = c("関東", "関西", "九州"),
                      data = list(
                        tibble(month = 1:3, sales = c(100, 120, 110)),
                        tibble(month = 1:3, sales = c(80, 95, 88)),
                        tibble(month = 1:3, sales = c(60, 70, 65))
                      )
                    )

                    print(sales_by_region)

                    # Summarise each nested tibble into scalar columns
                    region_summary <- sales_by_region %>%
                      mutate(
                        total_sales = map_dbl(data, ~ sum(.x$sales)),
                        avg_sales = map_dbl(data, ~ mean(.x$sales)),
                        # Percentage change from the first to the last month
                        growth_rate = map_dbl(data, ~ {
                          sales <- .x$sales
                          (sales[length(sales)] - sales[1]) / sales[1] * 100
                        })
                      )

                    print(region_summary)
                

リスト列操作結果

# A tibble: 3 × 3 group values stats <chr> <list> <list> 1 A <dbl [4]> <list [2]> 2 B <dbl [3]> <list [2]> 3 C <dbl [5]> <list [2]> [1] "グループAの値:" [1] 1 3 5 7 # A tibble: 3 × 7 group values stats length sum_values first_value last_value <chr> <list> <list> <int> <dbl> <dbl> <dbl> 1 A <dbl [4]> <list [2]> 4 16 1 7 2 B <dbl [3]> <list [2]> 3 12 2 6 3 C <dbl [5]> <list [2]> 5 150 10 50 # A tibble: 3 × 2 region data <chr> <list> 1 関東 <tibble [3 × 2]> 2 関西 <tibble [3 × 2]> 3 九州 <tibble [3 × 2]> # A tibble: 3 × 5 region data total_sales avg_sales growth_rate <chr> <list> <dbl> <dbl> <dbl> 1 関東 <tibble [3 × 2]> 330 110 10 2 関西 <tibble [3 × 2]> 263 87.7 10 3 九州 <tibble [3 × 2]> 195 65 8.33

列の追加・削除・変更

動的な列操作

                    # Add columns at a chosen position
                    # add_column() hands its arguments to tibble() and does not
                    # look names up in the piped data, so a bare `salary` would
                    # fail with "object 'salary' not found". Reference the source
                    # column explicitly instead.
                    employee_expanded <- employee_tibble %>%
                      add_column(
                        bonus = employee_tibble$salary * 0.1,
                        hire_year = c(2020, 2019, 2015, 2021, 2017),
                        .before = "department"  # insert before the department column
                      )

                    print(employee_expanded)

                    # Append a row; every column is given by name
                    new_employee <- employee_expanded %>%
                      add_row(
                        id = 6,
                        name = "林さくら",
                        age = 29,
                        salary = 480000,
                        bonus = 48000,
                        hire_year = 2022,
                        department = "マーケティング"
                      )

                    print(new_employee)

                    # Derived columns based on conditions
                    employee_with_status <- new_employee %>%
                      mutate(
                        # The reference year is fixed at 2023 in this example
                        experience_years = 2023 - hire_year,
                        # case_when() checks the conditions top to bottom
                        salary_grade = case_when(
                          salary < 400000 ~ "初級",
                          salary < 600000 ~ "中級",
                          TRUE ~ "上級"
                        ),
                        is_senior = experience_years >= 5,
                        total_compensation = salary + bonus
                      )

                    print(employee_with_status)

                    # Keep a subset of columns and sort by total compensation
                    selected_columns <- employee_with_status %>%
                      select(name, department, salary_grade, total_compensation, is_senior) %>%
                      arrange(desc(total_compensation))

                    print(selected_columns)
                

列操作結果

# A tibble: 5 × 7 id name age salary bonus hire_year department <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr> 1 1 田中太郎 28 450000 45000 2020 営業 2 2 佐藤花子 32 520000 52000 2019 開発 3 3 鈴木一郎 45 780000 78000 2015 マネジメント 4 4 高橋美咲 25 380000 38000 2021 デザイン 5 5 山田健太 39 650000 65000 2017 開発 # A tibble: 6 × 11 id name age salary bonus hire_year department experience_years salary_grade is_senior total_compensation <int> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr> <lgl> <dbl> 1 3 鈴木一郎 45 780000 78000 2015 マネジメント 8 上級 TRUE 858000 2 5 山田健太 39 650000 65000 2017 開発 6 上級 TRUE 715000 3 2 佐藤花子 32 520000 52000 2019 開発 4 中級 FALSE 572000 4 6 林さくら 29 480000 48000 2022 マーケティング 1 中級 FALSE 528000 5 1 田中太郎 28 450000 45000 2020 営業 3 中級 FALSE 495000 6 4 高橋美咲 25 380000 38000 2021 デザイン 2 初級 FALSE 418000

⚡ tibbleの表示とデバッグ機能

表示オプションのカスタマイズ

                    # Build a larger data set (seeded so the output is reproducible)
                    set.seed(123)
                    large_tibble <- tibble(
                      id = 1:100,
                      category = sample(c("A", "B", "C", "D", "E"), 100, replace = TRUE),
                      value_1 = rnorm(100, 50, 10),
                      value_2 = rnorm(100, 100, 20),
                      value_3 = rnorm(100, 25, 5),
                      long_text = paste0("これは非常に長いテキストです", 1:100, "番目のレコードを示しています"),
                      date = sample(seq(as.Date("2023-01-01"), as.Date("2023-12-31"), by = "day"), 100)
                    )

                    # Default printing
                    print("デフォルト表示:")
                    print(large_tibble)

                    # Change the number of printed rows
                    print("最初の3行のみ表示:")
                    print(large_tibble, n = 3)

                    # Show every column regardless of console width
                    print("全列表示:")
                    print(large_tibble, width = Inf)

                    # glimpse() gives a transposed overview of the structure
                    print("データ構造の概要:")
                    glimpse(large_tibble)

                    # View() opens a spreadsheet-style viewer (typically used in RStudio)
                    # View(large_tibble)

                    # Sample of a single column
                    print("カテゴリ列のサンプル:")
                    print(head(large_tibble$category, 10))

                    # Class of every column
                    # map_chr() is called through purrr:: because this chapter only
                    # attaches tibble and dplyr. It expects exactly one class name per
                    # column, which holds for the columns defined above.
                    print("列の型情報:")
                    print(purrr::map_chr(large_tibble, class))

                    # Missing-value check
                    # Insert NA into a few rows; if_else() keeps the column types strict
                    large_tibble_with_na <- large_tibble %>%
                      mutate(
                        value_1 = if_else(row_number() %in% c(5, 15, 25), NA_real_, value_1),
                        category = if_else(row_number() %in% c(10, 20), NA_character_, category)
                      )

                    # Count NA per column; across() replaces the superseded summarise_all()
                    na_summary <- large_tibble_with_na %>%
                      summarise(across(everything(), ~ sum(is.na(.x))))

                    print("欠損値の数:")
                    print(na_summary)
                

表示とデバッグ結果

# A tibble: 100 × 7 id category value_1 value_2 value_3 long_text date <int> <chr> <dbl> <dbl> <dbl> <chr> <date> 1 1 A 44.4 87.3 24.4 これは非常に長いテキストです1番… 2023-04-25 2 2 D 47.7 98.2 26.2 これは非常に長いテキストです2番… 2023-06-24 3 3 E 65.6 116. 25.3 これは非常に長いテキストです3番… 2023-03-12 4 4 A 59.1 118. 23.6 これは非常に長いテキストです4番… 2023-08-30 5 5 C 50.3 95.7 19.9 これは非常に長いテキストです5番… 2023-04-23 6 6 B 57.0 101. 27.1 これは非常に長いテキストです6番… 2023-02-06 7 7 E 49.9 99.5 28.4 これは非常に長いテキストです7番… 2023-07-12 8 8 C 72.4 135. 26.9 これは非常に長いテキストです8番… 2023-10-18 9 9 A 44.1 88.6 24.9 これは非常に長いテキストです9番… 2023-05-31 10 10 B 48.8 110. 21.7 これは非常に長いテキストです10番… 2023-12-22 # ℹ 90 more rows 最初の3行のみ表示: # A tibble: 100 × 7 id category value_1 value_2 value_3 long_text date <int> <chr> <dbl> <dbl> <dbl> <chr> <date> 1 1 A 44.4 87.3 24.4 これは非常に長いテキストです1番… 2023-04-25 2 2 D 47.7 98.2 26.2 これは非常に長いテキストです2番… 2023-06-24 3 3 E 65.6 116. 25.3 これは非常に長いテキストです3番… 2023-03-12 # ℹ 97 more rows 欠損値の数: # A tibble: 1 × 7 id category value_1 value_2 value_3 long_text date <int> <int> <int> <int> <int> <int> <int> 1 0 2 3 0 0 0 0

📊 tibbleを使った実践的なデータ分析

売上データの包括的分析

                    # Build a synthetic sales data set (seeded for reproducibility)
                    set.seed(456)
                    sales_analysis <- tibble(
                      transaction_id = 1:1000,
                      customer_id = sample(1:200, 1000, replace = TRUE),
                      product_category = sample(c("電子機器", "衣料品", "食品", "書籍", "スポーツ用品"),
                                                    1000, replace = TRUE),
                      purchase_date = sample(seq(as.Date("2023-01-01"), as.Date("2023-12-31"), by = "day"),
                                              1000, replace = TRUE),
                      amount = round(rlnorm(1000, log(5000), 1)),
                      payment_method = sample(c("クレジット", "現金", "電子マネー"),
                                                1000, replace = TRUE, prob = c(0.6, 0.2, 0.2)),
                      store_location = sample(c("東京", "大阪", "名古屋", "福岡"),
                                                1000, replace = TRUE)
                    ) %>%
                      mutate(
                        # Calendar features derived from the purchase date
                        month = lubridate::month(purchase_date),
                        quarter = lubridate::quarter(purchase_date),
                        day_of_week = lubridate::wday(purchase_date, label = TRUE),
                        amount_category = case_when(
                          amount < 2000 ~ "小額",
                          amount < 10000 ~ "中額",
                          TRUE ~ "高額"
                        )
                      )

                    print("売上データサンプル:")
                    print(head(sales_analysis))

                    # Purchase pattern per customer
                    customer_patterns <- sales_analysis %>%
                      group_by(customer_id) %>%
                      summarise(
                        total_purchases = n(),
                        total_amount = sum(amount),
                        avg_amount = mean(amount),
                        # Most frequent value within the customer's transactions
                        favorite_category = names(sort(table(product_category), decreasing = TRUE))[1],
                        preferred_payment = names(sort(table(payment_method), decreasing = TRUE))[1],
                        first_purchase = min(purchase_date),
                        last_purchase = max(purchase_date),
                        purchase_span_days = as.numeric(max(purchase_date) - min(purchase_date)),
                        .groups = "drop"
                      ) %>%
                      mutate(
                        # Use the element-wise & and | here: the conditions are whole
                        # columns. The scalar && / || raise an error for length > 1
                        # input since R 4.3 and looked only at the first row before.
                        customer_segment = case_when(
                          total_amount > 50000 & total_purchases > 10 ~ "VIP",
                          total_amount > 20000 | total_purchases > 5 ~ "優良",
                          TRUE ~ "一般"
                        )
                      )

                    print("顧客セグメント分析:")
                    print(customer_patterns %>% count(customer_segment))

                    # Monthly figures per store location
                    regional_monthly <- sales_analysis %>%
                      group_by(store_location, month) %>%
                      summarise(
                        transactions = n(),
                        total_sales = sum(amount),
                        avg_transaction = mean(amount),
                        unique_customers = n_distinct(customer_id),
                        .groups = "drop"
                      ) %>%
                      arrange(store_location, month)

                    print("地域別月次売上（上位10行）:")
                    print(head(regional_monthly, 10))

                    # Share of each payment method within a product category
                    category_payment_analysis <- sales_analysis %>%
                      group_by(product_category, payment_method) %>%
                      summarise(
                        count = n(),
                        avg_amount = mean(amount),
                        .groups = "drop"
                      ) %>%
                      group_by(product_category) %>%
                      mutate(
                        percentage = round(count / sum(count) * 100, 1)
                      ) %>%
                      # Drop the grouping so later steps do not inherit it by accident
                      ungroup() %>%
                      arrange(product_category, desc(percentage))

                    print("カテゴリ別支払い方法傾向:")
                    print(category_payment_analysis)
                

売上分析結果

売上データサンプル: # A tibble: 6 × 11 transaction_id customer_id product_category purchase_date amount payment_method store_location month quarter day_of_week amount_category <int> <int> <chr> <date> <dbl> <chr> <chr> <dbl> <int> <ord> <chr> 1 1 68 電子機器 2023-09-04 3676 クレジット 東京 9 3 Mon 中額 2 2 177 書籍 2023-11-15 1547 現金 名古屋 11 4 Wed 小額 3 3 122 食品 2023-03-15 4048 クレジット 福岡 3 1 Wed 中額 4 4 132 スポーツ用品 2023-07-20 2738 クレジット 東京 7 3 Thu 中額 5 5 168 食品 2023-12-25 2156 電子マネー 大阪 12 4 Mon 中額 6 6 188 電子機器 2023-11-05 8556 クレジット 大阪 11 4 Sun 中額 顧客セグメント分析: # A tibble: 3 × 2 customer_segment n <chr> <int> 1 VIP 14 2 一般 147 3 優良 39 地域別月次売上（上位10行）: # A tibble: 10 × 6 store_location month transactions total_sales avg_transaction unique_customers <chr> <dbl> <int> <dbl> <dbl> <int> 1 大阪 1 19 105732 5565. 18 2 大阪 2 18 80654 4481. 17 3 大阪 3 19 81965 4314. 16 4 大阪 4 22 113256 5148. 18 5 大阪 5 20 122089 6104. 18

tibbleの主要関数	機能	使用例	備考
tibble()	tibbleの作成	tibble(x = 1:3, y = c("a", "b", "c"))	列同士の参照可能
tribble()	行単位での作成	tribble(~x, ~y, 1, "a", 2, "b")	直感的な行入力
add_column()	列の追加	add_column(df, z = 1:3, .before = "y")	位置指定可能
add_row()	行の追加	add_row(df, x = 4, y = "d")	位置指定可能
glimpse()	構造確認	glimpse(df)	strの改良版

第9章の重要ポイント

モダンデータフレーム：安全で直感的なデータ操作
スマート表示：大量データの見やすい出力
リスト列：複雑なデータ構造の効率的管理
型安全性：予期しない型変換を防止
デバッグ支援：glimpse()等の強力な分析ツール
Tidyverse統合：他パッケージとのシームレス連携

📊 高度なTibble操作とモダン技法

ネストしたデータ構造とリスト列の活用

                        # Advanced data structures built on list columns
                        library(tidyverse)
                        library(broom)
                        library(modelr)

                        # This example needs a `sales_data` table with the columns
                        # region, product_category, month and sales. None is defined
                        # earlier in this chapter, so create a small reproducible
                        # sample unless the reader already has one.
                        if (!exists("sales_data")) {
                          set.seed(789)
                          sales_data <- expand_grid(
                            region = c("関東", "関西", "九州"),
                            product_category = c("電子機器", "衣料品", "食品"),
                            month = 1:12
                          ) %>%
                            mutate(sales = 100 + 5 * month + rnorm(n(), sd = 10))
                        }

                        # One row per region/category, with the raw rows nested in `data`
                        nested_sales <- sales_data %>%
                          group_by(region, product_category) %>%
                          nest() %>%
                          # nest() keeps the grouping; drop it so the steps below
                          # work on a plain tibble
                          ungroup() %>%
                          mutate(
                            # Linear regression fitted separately for each group
                            models = map(data, ~ lm(sales ~ month, data = .x)),
                            # One-row model summary (R squared, AIC, ...)
                            model_stats = map(models, broom::glance),
                            # Coefficient table
                            coefficients = map(models, broom::tidy),
                            # Fitted values and residuals for each observation
                            predictions = map(models, broom::augment)
                          )

                        # Pull out the R squared values and rank the groups
                        r_squared_analysis <- nested_sales %>%
                          unnest(model_stats) %>%
                          select(region, product_category, r.squared, p.value, AIC, BIC) %>%
                          arrange(desc(r.squared))

                        print("地域・カテゴリ別回帰分析結果（R²値上位）:")
                        print(head(r_squared_analysis, 10))
                    

並列処理とメモリ効率的なデータ操作

                        # Efficient processing of a large data set
                        library(future)
                        library(furrr)
                        library(arrow)

                        # Configure parallel workers. plan() returns the previous plan,
                        # which is kept so it can be restored once the work is done.
                        previous_plan <- plan(multisession, workers = 4)

                        # Feature engineering applied to one chunk of rows
                        chunk_process <- function(data_chunk) {
                          data_chunk %>%
                            mutate(
                              # Trailing 7-observation window: the current row plus the
                              # 6 before it. With .complete = TRUE, incomplete windows
                              # yield NA.
                              rolling_avg_7d = slider::slide_dbl(
                                sales_amount, mean, .before = 6, .complete = TRUE
                              ),
                              rolling_std_7d = slider::slide_dbl(
                                sales_amount, sd, .before = 6, .complete = TRUE
                              ),
                              # Distance from the rolling mean in rolling standard deviations
                              anomaly_score = abs(sales_amount - rolling_avg_7d) / rolling_std_7d,
                              is_anomaly = anomaly_score > 2
                            ) %>%
                            # The window restarts in every chunk, so the first 6 rows of
                            # each chunk have no complete window and are dropped here
                            filter(!is.na(rolling_avg_7d))
                        }

                        # Split the data into chunks of 1000 rows
                        # (seeded so the random sales figures are reproducible)
                        set.seed(789)
                        large_dataset <- tibble(
                          date = seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "day"),
                          sales_amount = runif(length(date), 100, 10000)
                        ) %>%
                          mutate(chunk_id = (row_number() - 1) %/% 1000 + 1) %>%
                          group_split(chunk_id)

                        # Process the chunks in parallel and row-bind the results
                        processed_chunks <- large_dataset %>%
                          future_map_dfr(chunk_process, .progress = TRUE)

                        # Restore the earlier plan so the background R sessions
                        # are not left running after this example
                        plan(previous_plan)

                        cat("処理されたデータ行数:", nrow(processed_chunks), "\n")
                        cat("異常値検出数:", sum(processed_chunks$is_anomaly, na.rm = TRUE), "\n")
                    

Tibbleのパフォーマンス最適化技法

                        # Memory usage and performance analysis
                        library(pryr)
                        library(bench)
                        library(data.table)

                        # Build a large data set (seeded so every run uses the same data)
                        set.seed(2023)
                        n <- 1000000
                        large_tibble <- tibble(
                          id = 1:n,
                          group = sample(letters[1:10], n, replace = TRUE),
                          value1 = rnorm(n),
                          value2 = rnorm(n),
                          category = sample(paste0("cat_", 1:100), n, replace = TRUE)
                        )

                        # Convert once, outside the timed expressions, so the benchmark
                        # measures the aggregation itself and not the cost of converting
                        large_df <- as.data.frame(large_tibble)
                        large_dt <- data.table::as.data.table(large_tibble)

                        # Performance comparison: tibble vs data.frame vs data.table
                        # check = FALSE because the three approaches return different
                        # classes (and aggregate() returns no count column)
                        performance_test <- bench::mark(
                          tibble_summarise = {
                            large_tibble %>%
                              group_by(group, category) %>%
                              summarise(
                                mean_v1 = mean(value1),
                                mean_v2 = mean(value2),
                                count = n(),
                                .groups = "drop"
                              )
                          },
                          data_frame_aggregate = {
                            aggregate(
                              cbind(value1, value2) ~ group + category,
                              data = large_df,
                              FUN = mean
                            )
                          },
                          data_table_approach = {
                            large_dt[, .(
                              mean_v1 = mean(value1),
                              mean_v2 = mean(value2),
                              count = .N
                            ), by = .(group, category)]
                          },
                          iterations = 5,
                          check = FALSE
                        )

                        print("パフォーマンス比較結果:")
                        print(performance_test)

                        # Memory footprint of the same data in each representation
                        memory_usage <- list(
                          tibble_size = pryr::object_size(large_tibble),
                          dataframe_size = pryr::object_size(large_df),
                          datatable_size = pryr::object_size(large_dt)
                        )

                        cat("メモリ使用量比較:\n")
                        print(memory_usage)
                    

高度なTibble操作: 動的列操作とメタプログラミング

                        # 動的な列操作とメタプログラミング技法
                        library(rlang)
                        
                        # Factory for grouped summaries.
                        #   group_vars   : character vector of grouping column names
                        #   summary_vars : character vector of columns to summarise
                        #   summary_funs : functions (or a named list of them) passed to across()
                        # Returns a function that takes a data frame and produces one row per
                        # group with "<column>_<function>" columns plus a row count.
                        create_summary_function <- function(group_vars, summary_vars, summary_funs) {
                          # Turn the column names into symbols once, when the factory is called
                          grouping_symbols <- syms(group_vars)

                          summarise_groups <- function(data) {
                            grouped <- group_by(data, !!!grouping_symbols)
                            summarise(
                              grouped,
                              across(
                                all_of(summary_vars),
                                summary_funs,
                                .names = "{.col}_{.fn}"
                              ),
                              count = n(),
                              .groups = "drop"
                            )
                          }

                          summarise_groups
                        }

                        # Example: build a reusable summariser
                        # NOTE(review): the data passed to custom_summarizer() must contain every
                        # column named here; sales_analysis as built earlier in this chapter has
                        # no `quantity` column, so all_of() would reject it.
                        custom_summarizer <- create_summary_function(
                          group_vars = c("product_category", "store_location"),
                          summary_vars = c("amount", "quantity"),
                          summary_funs = list(
                            mean = ~ mean(.x, na.rm = TRUE),
                            median = ~ median(.x, na.rm = TRUE),
                            sd = ~ sd(.x, na.rm = TRUE),
                            q75 = ~ quantile(.x, 0.75, na.rm = TRUE)
                          )
                        )
                        
                        # Add one column per entry of `conditions`.
                        # Each entry is a list with three strings:
                        #   name      - name of the new column
                        #   condition - R expression evaluated against the data
                        #   value     - R expression giving the value where the condition holds
                        # Rows where the condition does not hold get NA_real_.
                        # The strings are parsed and evaluated as code, so pass trusted input only.
                        conditional_feature_engineering <- function(data, conditions) {
                          apply_condition <- function(current, spec) {
                            predicate <- parse_expr(spec$condition)
                            replacement <- parse_expr(spec$value)
                            mutate(
                              current,
                              !!spec$name := case_when(
                                !!predicate ~ !!replacement,
                                TRUE ~ NA_real_
                              )
                            )
                          }

                          # Fold over the specifications, starting from the input data
                          Reduce(apply_condition, conditions, data)
                        }
                        
                        # Example specifications for conditional features
                        feature_conditions <- list(
                          list(
                            name = "high_value_purchase",
                            condition = "amount > quantile(amount, 0.9, na.rm = TRUE)",
                            value = "amount"
                          ),
                          list(
                            name = "weekend_premium",
                            # wday() is namespace-qualified so the expression also works
                            # when lubridate is not attached. With lubridate's default
                            # week start, 1 is Sunday and 7 is Saturday.
                            condition = "lubridate::wday(purchase_date) %in% c(1, 7)",
                            value = "amount * 1.1"
                          )
                        )

                        enhanced_data <- sales_analysis %>%
                          conditional_feature_engineering(feature_conditions)

                        print("条件付き特徴量作成結果（サンプル）:")
                        # glimpse() prints by itself and returns its input invisibly;
                        # wrapping it in print() would output the data a second time
                        glimpse(enhanced_data)
                    

実践的アドバイス

tibbleは、data.frameの改良版として、より安全で予測可能なデータ操作を提供します。特に大規模なデータ分析プロジェクトでは、tibbleの型安全性とスマート表示機能が開発効率を大幅に向上させます。リスト列とネスト構造を活用することで、複雑なデータ分析ワークフローを効率的に実装できます。