Tidy PCA

Published

2026

Lorem

# Fetch the Golub data set and load it into the session.
# download.file() does not create missing directories, so the target
# directory is created first. The download is skipped when a local copy
# already exists, so re-rendering does not need network access.
golub_url <- "https://github.com/ramhiser/datamicroarray/raw/refs/heads/master/data/golub.RData"
golub_path <- file.path("data", "golub.RData")
if (!file.exists(golub_path)) {
  dir.create(dirname(golub_path), showWarnings = FALSE, recursive = TRUE)
  # .RData is a binary format; request binary mode explicitly.
  download.file(url = golub_url, destfile = golub_path, mode = "wb")
}
load(golub_path)
library("tidyverse")
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("broom")
# Build one tibble from the `x` element of `golub`, attach the `y` element
# as a column, and put that column first.
golub_tidy <- golub[["x"]] |>
  as_tibble() |>
  mutate(y = golub[["y"]]) |>
  select(y, everything())
# PCA on every column except `y`, with centring and scaling switched on.
pca_data <- prcomp(
  select(golub_tidy, -y),
  center = TRUE,
  scale. = TRUE
)
# Per-component eigenvalue summary, plus a display label of the form
# "PC<k> (<percent>%)" that is reused as an axis title further down.
scree_data <- tidy(pca_data, matrix = "eigenvalues") |>
  mutate(
    label = str_c("PC", PC, " (", round(100 * percent, digits = 1), "%)")
  )
# Scree plot: one bar per principal component, bar height taken from the
# `percent` column, with a baseline drawn at zero.
ggplot(data = scree_data, mapping = aes(x = PC, y = percent)) +
  geom_col(colour = "black", alpha = 0.5) +
  geom_hline(yintercept = 0) +
  theme_minimal()

# Scatter plot of the first two fitted components, coloured by `y`, with
# reference lines through the origin and an ellipse per colour group.
# Axis titles reuse the labels prepared in `scree_data`.
augment(pca_data, data = golub_tidy) |>
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, colour = y)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_point() +
  stat_ellipse() +
  scale_colour_manual(values = c(ALL = "#0072B2", AML = "#D55E00")) +
  theme_minimal() +
  labs(
    x = scree_data$label[scree_data$PC == 1],
    y = scree_data$label[scree_data$PC == 2]
  )