# 1. Load Data

In this section, we read the NCI-60 dataset and inspect its structure. https://dctd.cancer.gov/drug-discovery-development/assays/high-throughput-screening-services/nci60

mtx <- read.delim(
  "/Users/mdozmorov/Documents/Work/Teaching/BIOS658.2025/static/slides/09_Clustering/data/nci60.tsv",
  sep = "\t", row.names = 1
)
dim(mtx)

## [1] 5244   61

colnames(mtx)

##  [1] "CNS"      "CNS.1"    "CNS.2"    "RENAL"    "BREAST"   "CNS.3"   
##  [7] "CNS.4"    "BREAST.1" "NSCLC"    "NSCLC.1"  "RENAL.1"  "RENAL.2" 
## [13] "RENAL.3"  "RENAL.4"  "RENAL.5"  "RENAL.6"  "RENAL.7"  "BREAST.2"
## [19] "NSCLC.2"  "RENAL.8"  "OVAR"     "MELA"     "OVAR.1"   "OVAR.2"  
## [25] "OVAR.3"   "OVAR.4"   "OVAR.5"   "NSCLC.3"  "NSCLC.4"  "NSCLC.5" 
## [31] "LEUK"     "K562B.re" "K562A.re" "LEUK.1"   "LEUK.2"   "LEUK.3"  
## [37] "LEUK.4"   "LEUK.5"   "COLON"    "COLON.1"  "COLON.2"  "COLON.3" 
## [43] "COLON.4"  "COLON.5"  "COLON.6"  "MCF7A.re" "BREAST.3" "MCF7D.re"
## [49] "BREAST.4" "NSCLC.6"  "NSCLC.7"  "NSCLC.8"  "MELA.1"   "BREAST.5"
## [55] "BREAST.6" "MELA.2"   "MELA.3"   "MELA.4"   "MELA.5"   "MELA.6"  
## [61] "MELA.7"

boxplot(mtx, main = "Expression Value Distributions")

# 2. Create Sample Annotations

Samples contain embedded group names (e.g., "K562.A.1"). We extract the group name and correct minor inconsistencies.

groups <- sapply(colnames(mtx), function(x){
  obj <- strsplit(x, ".", fixed = TRUE)
  unlist(obj)[1]
})

groups[ groups %in% c("K562A","K562B") ] <- "K562"
groups[ groups %in% c("MCF7A","MCF7D") ] <- "MCF7"

table(groups)

## groups
## BREAST    CNS  COLON   K562   LEUK   MCF7   MELA  NSCLC   OVAR  RENAL 
##      7      5      7      2      6      2      8      9      6      9

# 3. PCA (Principal Component Analysis)

We perform PCA on the scaled data and examine variance explained.

pca <- prcomp(scale(t(mtx)))
summary(pca)$importance[,1:5] %>% pander

	PC1	PC2	PC3	PC4	PC5
Standard deviation	25.05	19.45	16.74	14.81	13.58
Proportion of Variance	0.1197	0.07217	0.05343	0.04185	0.03515
Cumulative Proportion	0.1197	0.1919	0.2453	0.2871	0.3223

str(pca)

## List of 5
##  $ sdev    : num [1:61] 25.1 19.5 16.7 14.8 13.6 ...
##  $ rotation: num [1:5244, 1:61] 0.01001 0.00184 0.00162 -0.00613 0.0013 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##   .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:5244] -9.10e-18 -2.67e-17 1.82e-17 1.09e-17 -1.46e-17 ...
##   ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##  $ scale   : Named num [1:5244] 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##  $ x       : num [1:61, 1:61] 21.1 21.6 32.4 36.3 40.1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:61] "CNS" "CNS.1" "CNS.2" "RENAL" ...
##   .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"

prcomp() returns a list with five main components. Here’s what each represents.

sdev — Standard Deviations of Principal Components
- A numeric vector of length = number of PCs.
- Each value is the standard deviation of a principal component.
- Larger = more variance explained by that PC.

Square them to get eigenvalues: \(\lambda_i = \text{sdev}_i^2\)

The proportion of variance explained:

(sdev^2) / sum(sdev^2)

rotation — Loadings (Eigenvectors)
- A matrix of gene loadings:
  - Rows = variables (genes)
  - Columns = principal components (PC1…PC61)
- Values represent how much each gene contributes to a PC.
- These are the eigenvectors of the covariance/correlation matrix.
- High absolute value = gene strongly influences that PC.
center — Means Used for Centering
- A named vector of gene-wise means.
- Each gene was mean-centered before PCA: \(x_{ij}' = x_{ij} - \text{center}_i\)
scale — Scaling Factors (if scale=TRUE)
- Contains gene-wise scaling constants.
- In other words: \(x' = \frac{x - \text{center}}{\text{scale}}\)
- If scale=FALSE, this component is NULL.
x — Principal Component Scores
- A matrix of transformed coordinates (scores) in PC space.
  - Rows = samples
  - Columns = PC1…PC61
- These are the coordinates you plot.
- x[i, j] = the position of sample i on PC j.
- They are computed as: \(X_\text{scores} = X_\text{centered} \cdot \text{rotation}\)

# 4. PCA Visualization with ggplot2

We visualize the first two principal components and label each point by sample name.

colorby <- "cells"

pt <- ggplot(
  data = data.frame(pca$x, cells = groups, samples = groups),
  aes(x = PC1, y = PC2, label = samples)
) +
  theme(plot.title = element_text(face = "bold")) +
  ggtitle(paste("PCA colored by", colorby)) +
  geom_point(aes(color = groups), size = 3) +
  geom_text_repel(size = 3) +
  geom_hline(yintercept = 0, color = "gray65") +
  geom_vline(xintercept = 0, color = "gray65") +
  scale_x_continuous(
    name = paste0("PC1 (", round(summary(pca)$importance[2,1]*100, 2), "%)")
  ) +
  scale_y_continuous(
    name = paste0("PC2 (", round(summary(pca)$importance[2,2]*100, 2), "%)")
  )

pt

# 5. Additional PCA Visualizations

ggord

library(ggord)
ggord(pca, groups, arrow = NULL)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggord package.
##   Please report the issue at <https://github.com/fawda123/ggord/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggfortify

library(ggfortify)
autoplot(
  prcomp(scale(t(mtx))),
  data = data.frame(groups = groups),
  colour = "groups"
)

# 6. Add Artificial Batch Effect

We artificially square half of the samples to simulate batch effects.

mtx_with_batch <- cbind(
  mtx[, 1:round(ncol(mtx) / 2)],
  mtx[, (round(ncol(mtx) / 2) + 1):ncol(mtx)]^2
)
boxplot(mtx_with_batch, main = "Expression with Artificial Batch Effect")

batch <- c(rep(1, round(ncol(mtx)/2)), rep(2, ncol(mtx) - round(ncol(mtx)/2)))
batch

##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# 7. PCA with Batch Effect

pca <- prcomp(t(scale(mtx_with_batch)))
summary(pca)$importance[,1:5] %>% pander

	PC1	PC2	PC3	PC4	PC5
Standard deviation	15.8	15.33	14.62	13.91	13.48
Proportion of Variance	0.04813	0.04528	0.04119	0.03729	0.03502
Cumulative Proportion	0.04813	0.09341	0.1346	0.1719	0.2069

Visualization

ggplot(
  data = data.frame(pca$x, cells = groups, samples = groups,
                    batch = factor(batch)),
  aes(x = PC1, y = PC2, label = samples)
) +
  geom_point(aes(color = batch), size = 3) +
  geom_text(size = 3)

# 8. Which Covariate Explains PCs?

We test how strongly the batch variable associates with PC1–PC3.

lmp <- function(modelobject){
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL
  p
}

covariates <- c("batch")

for (covariate in covariates){
  df <- data.frame(batch = batch, pca$x)

  for (pc in c("PC1","PC2","PC3")){
    model <- lm(as.numeric(df[[pc]]) ~ factor(df[[covariate]]))
    rsq <- signif(summary(model)$adj.r.squared, 5)
    pv  <- signif(lmp(model), 5)
    print(paste(covariate, "explains", rsq, "of", pc, "p-value", pv))
  }
}

## [1] "batch explains 0.35193 of PC1 p-value 2.8222e-07"
## [1] "batch explains -0.016783 of PC2 p-value 0.92213"
## [1] "batch explains -0.006913 of PC3 p-value 0.44623"

# 9. MDS (Multidimensional Scaling)

plotMDS(
  mtx_with_batch,
  col = ifelse(batch == 1, "red", "blue"),
  main = "MDS (Before Batch Removal)"
)

# 10. t-SNE

library(Rtsne)

tsne <- Rtsne(
  (scale(t(mtx))),
  dims = 2,
  perplexity = 20,
  verbose = TRUE,
  max_iter = 500
)

## Performing PCA
## Read the 61 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.983607)!
## Learning embedding...
## Iteration 50: error is 53.837079 (50 iterations in 0.00 seconds)
## Iteration 100: error is 55.901960 (50 iterations in 0.00 seconds)
## Iteration 150: error is 53.315193 (50 iterations in 0.00 seconds)
## Iteration 200: error is 55.352412 (50 iterations in 0.00 seconds)
## Iteration 250: error is 55.316131 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.540285 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.113548 (50 iterations in 0.00 seconds)
## Iteration 400: error is 1.033850 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.856921 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.715258 (50 iterations in 0.00 seconds)
## Fitting performed in 0.01 seconds.

colors <- rainbow(length(unique(groups)))

scores <- as.data.frame(tsne$Y)
rownames(scores) <- colnames(mtx)
colnames(scores) <- c("Comp.1", "Comp.2")
scores$groups <- groups

ggplot(scores, aes(x = Comp.1, y = Comp.2,
                   label = groups, color = groups)) +
  geom_point(size = 2) +
  geom_text_repel(size = 2) +
  geom_hline(yintercept = 0, color = "gray65") +
  geom_vline(xintercept = 0, color = "gray65") +
  theme(plot.title = element_text(face="bold"))

# 11. 3D PCA + Animated GIF

scores <- as.data.frame(pca$x)

scatterplot3d(
  scores[,c("PC1","PC2","PC3")],
  color = as.numeric(factor(batch))
)

Optional: Rotation Frames for GIF (Mac/Linux)

rename <- function(i){
  if (i < 10)  return(paste0("000", i, "plot.png"))
  if (i < 100) return(paste0("00",  i, "plot.png"))
  paste0("0", i, "plot.png")
}

frames <- 360
for (i in 1:frames){
  name <- rename(i)
  png(name)
  s3d <- scatterplot3d(
    scores[,c("PC1","PC2","PC3")],
    main = paste("Angle", i),
    angle = i,
    pch = 19, cex.symbols = 0.5,
    color = as.numeric(factor(batch))
  )
  coords <- s3d$xyz.convert(scores[,c("PC1","PC2","PC3")])
  text(coords$x, coords$y, labels = groups, pos = 2, cex = 0.7)
  dev.off()
}

system('magick convert *.png -delay 1 -loop 0 3d.gif')
system("rm *.png")

Clustering and Dimensionality Reduction Demo

Mikhail Dozmorov