# 1. Load Data

In this section, we read the NCI-60 dataset and inspect its structure. https://dctd.cancer.gov/drug-discovery-development/assays/high-throughput-screening-services/nci60

mtx <- read.delim(
  "/Users/mdozmorov/Documents/Work/Teaching/BIOS658.2025/static/slides/09_Clustering/data/nci60.tsv",
  sep = "\t", row.names = 1
)
dim(mtx)
## [1] 5244   61
colnames(mtx)
##  [1] "CNS"      "CNS.1"    "CNS.2"    "RENAL"    "BREAST"   "CNS.3"   
##  [7] "CNS.4"    "BREAST.1" "NSCLC"    "NSCLC.1"  "RENAL.1"  "RENAL.2" 
## [13] "RENAL.3"  "RENAL.4"  "RENAL.5"  "RENAL.6"  "RENAL.7"  "BREAST.2"
## [19] "NSCLC.2"  "RENAL.8"  "OVAR"     "MELA"     "OVAR.1"   "OVAR.2"  
## [25] "OVAR.3"   "OVAR.4"   "OVAR.5"   "NSCLC.3"  "NSCLC.4"  "NSCLC.5" 
## [31] "LEUK"     "K562B.re" "K562A.re" "LEUK.1"   "LEUK.2"   "LEUK.3"  
## [37] "LEUK.4"   "LEUK.5"   "COLON"    "COLON.1"  "COLON.2"  "COLON.3" 
## [43] "COLON.4"  "COLON.5"  "COLON.6"  "MCF7A.re" "BREAST.3" "MCF7D.re"
## [49] "BREAST.4" "NSCLC.6"  "NSCLC.7"  "NSCLC.8"  "MELA.1"   "BREAST.5"
## [55] "BREAST.6" "MELA.2"   "MELA.3"   "MELA.4"   "MELA.5"   "MELA.6"  
## [61] "MELA.7"
boxplot(mtx, main = "Expression Value Distributions")


# 2. Create Sample Annotations

Samples contain embedded group names (e.g., "K562.A.1"). We extract the group name and correct minor inconsistencies.

groups <- sapply(colnames(mtx), function(x){
  obj <- strsplit(x, ".", fixed = TRUE)
  unlist(obj)[1]
})

groups[ groups %in% c("K562A","K562B") ] <- "K562"
groups[ groups %in% c("MCF7A","MCF7D") ] <- "MCF7"

table(groups)
## groups
## BREAST    CNS  COLON   K562   LEUK   MCF7   MELA  NSCLC   OVAR  RENAL 
##      7      5      7      2      6      2      8      9      6      9

# 3. PCA (Principal Component Analysis)

We perform PCA on the scaled data and examine variance explained.

pca <- prcomp(scale(t(mtx)))
summary(pca)$importance[,1:5] %>% pander
  PC1 PC2 PC3 PC4 PC5
Standard deviation 25.05 19.45 16.74 14.81 13.58
Proportion of Variance 0.1197 0.07217 0.05343 0.04185 0.03515
Cumulative Proportion 0.1197 0.1919 0.2453 0.2871 0.3223
str(pca)
## List of 5
##  $ sdev    : num [1:61] 25.1 19.5 16.7 14.8 13.6 ...
##  $ rotation: num [1:5244, 1:61] 0.01001 0.00184 0.00162 -0.00613 0.0013 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##   .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:5244] -9.10e-18 -2.67e-17 1.82e-17 1.09e-17 -1.46e-17 ...
##   ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##  $ scale   : Named num [1:5244] 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
##  $ x       : num [1:61, 1:61] 21.1 21.6 32.4 36.3 40.1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:61] "CNS" "CNS.1" "CNS.2" "RENAL" ...
##   .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"

prcomp() returns a list with five main components. Here’s what each represents.

Square them to get eigenvalues: \(\lambda_i = \text{sdev}_i^2\)

The proportion of variance explained:

(sdev^2) / sum(sdev^2)

# 4. PCA Visualization with ggplot2

We visualize the first two principal components and label each point by sample name.

colorby <- "cells"

pt <- ggplot(
  data = data.frame(pca$x, cells = groups, samples = groups),
  aes(x = PC1, y = PC2, label = samples)
) +
  theme(plot.title = element_text(face = "bold")) +
  ggtitle(paste("PCA colored by", colorby)) +
  geom_point(aes(color = groups), size = 3) +
  geom_text_repel(size = 3) +
  geom_hline(yintercept = 0, color = "gray65") +
  geom_vline(xintercept = 0, color = "gray65") +
  scale_x_continuous(
    name = paste0("PC1 (", round(summary(pca)$importance[2,1]*100, 2), "%)")
  ) +
  scale_y_continuous(
    name = paste0("PC2 (", round(summary(pca)$importance[2,2]*100, 2), "%)")
  )

pt


# 5. Additional PCA Visualizations

ggord

library(ggord)
ggord(pca, groups, arrow = NULL)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggord package.
##   Please report the issue at <https://github.com/fawda123/ggord/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggfortify

library(ggfortify)
autoplot(
  prcomp(scale(t(mtx))),
  data = data.frame(groups = groups),
  colour = "groups"
)


# 6. Add Artificial Batch Effect

We artificially square half of the samples to simulate batch effects.

mtx_with_batch <- cbind(
  mtx[, 1:round(ncol(mtx) / 2)],
  mtx[, (round(ncol(mtx) / 2) + 1):ncol(mtx)]^2
)
boxplot(mtx_with_batch, main = "Expression with Artificial Batch Effect")

batch <- c(rep(1, round(ncol(mtx)/2)), rep(2, ncol(mtx) - round(ncol(mtx)/2)))
batch
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# 7. PCA with Batch Effect

pca <- prcomp(t(scale(mtx_with_batch)))
summary(pca)$importance[,1:5] %>% pander
  PC1 PC2 PC3 PC4 PC5
Standard deviation 15.8 15.33 14.62 13.91 13.48
Proportion of Variance 0.04813 0.04528 0.04119 0.03729 0.03502
Cumulative Proportion 0.04813 0.09341 0.1346 0.1719 0.2069

Visualization

ggplot(
  data = data.frame(pca$x, cells = groups, samples = groups,
                    batch = factor(batch)),
  aes(x = PC1, y = PC2, label = samples)
) +
  geom_point(aes(color = batch), size = 3) +
  geom_text(size = 3)


# 8. Which Covariate Explains PCs?

We test how strongly the batch variable associates with PC1–PC3.

lmp <- function(modelobject){
  f <- summary(modelobject)$fstatistic
  p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
  attributes(p) <- NULL
  p
}

covariates <- c("batch")

for (covariate in covariates){
  df <- data.frame(batch = batch, pca$x)

  for (pc in c("PC1","PC2","PC3")){
    model <- lm(as.numeric(df[[pc]]) ~ factor(df[[covariate]]))
    rsq <- signif(summary(model)$adj.r.squared, 5)
    pv  <- signif(lmp(model), 5)
    print(paste(covariate, "explains", rsq, "of", pc, "p-value", pv))
  }
}
## [1] "batch explains 0.35193 of PC1 p-value 2.8222e-07"
## [1] "batch explains -0.016783 of PC2 p-value 0.92213"
## [1] "batch explains -0.006913 of PC3 p-value 0.44623"

# 9. MDS (Multidimensional Scaling)

plotMDS(
  mtx_with_batch,
  col = ifelse(batch == 1, "red", "blue"),
  main = "MDS (Before Batch Removal)"
)


# 10. t-SNE

library(Rtsne)

tsne <- Rtsne(
  (scale(t(mtx))),
  dims = 2,
  perplexity = 20,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 61 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.983607)!
## Learning embedding...
## Iteration 50: error is 53.837079 (50 iterations in 0.00 seconds)
## Iteration 100: error is 55.901960 (50 iterations in 0.00 seconds)
## Iteration 150: error is 53.315193 (50 iterations in 0.00 seconds)
## Iteration 200: error is 55.352412 (50 iterations in 0.00 seconds)
## Iteration 250: error is 55.316131 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.540285 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.113548 (50 iterations in 0.00 seconds)
## Iteration 400: error is 1.033850 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.856921 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.715258 (50 iterations in 0.00 seconds)
## Fitting performed in 0.01 seconds.
colors <- rainbow(length(unique(groups)))

scores <- as.data.frame(tsne$Y)
rownames(scores) <- colnames(mtx)
colnames(scores) <- c("Comp.1", "Comp.2")
scores$groups <- groups

ggplot(scores, aes(x = Comp.1, y = Comp.2,
                   label = groups, color = groups)) +
  geom_point(size = 2) +
  geom_text_repel(size = 2) +
  geom_hline(yintercept = 0, color = "gray65") +
  geom_vline(xintercept = 0, color = "gray65") +
  theme(plot.title = element_text(face="bold"))


# 11. 3D PCA + Animated GIF

scores <- as.data.frame(pca$x)

scatterplot3d(
  scores[,c("PC1","PC2","PC3")],
  color = as.numeric(factor(batch))
)

Optional: Rotation Frames for GIF (Mac/Linux)

rename <- function(i){
  if (i < 10)  return(paste0("000", i, "plot.png"))
  if (i < 100) return(paste0("00",  i, "plot.png"))
  paste0("0", i, "plot.png")
}

frames <- 360
for (i in 1:frames){
  name <- rename(i)
  png(name)
  s3d <- scatterplot3d(
    scores[,c("PC1","PC2","PC3")],
    main = paste("Angle", i),
    angle = i,
    pch = 19, cex.symbols = 0.5,
    color = as.numeric(factor(batch))
  )
  coords <- s3d$xyz.convert(scores[,c("PC1","PC2","PC3")])
  text(coords$x, coords$y, labels = groups, pos = 2, cex = 0.7)
  dev.off()
}

system('magick convert *.png -delay 1 -loop 0 3d.gif')
system("rm *.png")