In this section, we read the NCI-60 dataset and inspect its structure. https://dctd.cancer.gov/drug-discovery-development/assays/high-throughput-screening-services/nci60
mtx <- read.delim(
"/Users/mdozmorov/Documents/Work/Teaching/BIOS658.2025/static/slides/09_Clustering/data/nci60.tsv",
sep = "\t", row.names = 1
)
dim(mtx)
## [1] 5244 61
colnames(mtx)
## [1] "CNS" "CNS.1" "CNS.2" "RENAL" "BREAST" "CNS.3"
## [7] "CNS.4" "BREAST.1" "NSCLC" "NSCLC.1" "RENAL.1" "RENAL.2"
## [13] "RENAL.3" "RENAL.4" "RENAL.5" "RENAL.6" "RENAL.7" "BREAST.2"
## [19] "NSCLC.2" "RENAL.8" "OVAR" "MELA" "OVAR.1" "OVAR.2"
## [25] "OVAR.3" "OVAR.4" "OVAR.5" "NSCLC.3" "NSCLC.4" "NSCLC.5"
## [31] "LEUK" "K562B.re" "K562A.re" "LEUK.1" "LEUK.2" "LEUK.3"
## [37] "LEUK.4" "LEUK.5" "COLON" "COLON.1" "COLON.2" "COLON.3"
## [43] "COLON.4" "COLON.5" "COLON.6" "MCF7A.re" "BREAST.3" "MCF7D.re"
## [49] "BREAST.4" "NSCLC.6" "NSCLC.7" "NSCLC.8" "MELA.1" "BREAST.5"
## [55] "BREAST.6" "MELA.2" "MELA.3" "MELA.4" "MELA.5" "MELA.6"
## [61] "MELA.7"
boxplot(mtx, main = "Expression Value Distributions")
Samples contain embedded group names (e.g., "K562.A.1").
We extract the group name and correct minor inconsistencies.
groups <- sapply(colnames(mtx), function(x){
obj <- strsplit(x, ".", fixed = TRUE)
unlist(obj)[1]
})
groups[ groups %in% c("K562A","K562B") ] <- "K562"
groups[ groups %in% c("MCF7A","MCF7D") ] <- "MCF7"
table(groups)
## groups
## BREAST CNS COLON K562 LEUK MCF7 MELA NSCLC OVAR RENAL
## 7 5 7 2 6 2 8 9 6 9
We perform PCA on the scaled data and examine variance explained.
pca <- prcomp(scale(t(mtx)))
summary(pca)$importance[,1:5] %>% pander
| Â | PC1 | PC2 | PC3 | PC4 | PC5 |
|---|---|---|---|---|---|
| Standard deviation | 25.05 | 19.45 | 16.74 | 14.81 | 13.58 |
| Proportion of Variance | 0.1197 | 0.07217 | 0.05343 | 0.04185 | 0.03515 |
| Cumulative Proportion | 0.1197 | 0.1919 | 0.2453 | 0.2871 | 0.3223 |
str(pca)
## List of 5
## $ sdev : num [1:61] 25.1 19.5 16.7 14.8 13.6 ...
## $ rotation: num [1:5244, 1:61] 0.01001 0.00184 0.00162 -0.00613 0.0013 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
## .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:5244] -9.10e-18 -2.67e-17 1.82e-17 1.09e-17 -1.46e-17 ...
## ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
## $ scale : Named num [1:5244] 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:5244] "GENE6683X" "GENE2811X" "GENE3496X" "GENE4158X" ...
## $ x : num [1:61, 1:61] 21.1 21.6 32.4 36.3 40.1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:61] "CNS" "CNS.1" "CNS.2" "RENAL" ...
## .. ..$ : chr [1:61] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
prcomp() returns a list with five main components.
Here’s what each represents.
sdev — Standard Deviations of Principal
Components
Square them to get eigenvalues: \(\lambda_i = \text{sdev}_i^2\)
The proportion of variance explained:
(sdev^2) / sum(sdev^2)
rotation — Loadings (Eigenvectors)
center — Means Used for Centering
scale — Scaling Factors (if
scale=TRUE)
scale=FALSE, this component is
NULL.x — Principal Component Scores
x[i, j] = the position of sample i on PC
j.We visualize the first two principal components and label each point by sample name.
colorby <- "cells"
pt <- ggplot(
data = data.frame(pca$x, cells = groups, samples = groups),
aes(x = PC1, y = PC2, label = samples)
) +
theme(plot.title = element_text(face = "bold")) +
ggtitle(paste("PCA colored by", colorby)) +
geom_point(aes(color = groups), size = 3) +
geom_text_repel(size = 3) +
geom_hline(yintercept = 0, color = "gray65") +
geom_vline(xintercept = 0, color = "gray65") +
scale_x_continuous(
name = paste0("PC1 (", round(summary(pca)$importance[2,1]*100, 2), "%)")
) +
scale_y_continuous(
name = paste0("PC2 (", round(summary(pca)$importance[2,2]*100, 2), "%)")
)
pt
library(ggord)
ggord(pca, groups, arrow = NULL)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggord package.
## Please report the issue at <https://github.com/fawda123/ggord/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(ggfortify)
autoplot(
prcomp(scale(t(mtx))),
data = data.frame(groups = groups),
colour = "groups"
)
We artificially square half of the samples to simulate batch effects.
mtx_with_batch <- cbind(
mtx[, 1:round(ncol(mtx) / 2)],
mtx[, (round(ncol(mtx) / 2) + 1):ncol(mtx)]^2
)
boxplot(mtx_with_batch, main = "Expression with Artificial Batch Effect")
batch <- c(rep(1, round(ncol(mtx)/2)), rep(2, ncol(mtx) - round(ncol(mtx)/2)))
batch
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
pca <- prcomp(t(scale(mtx_with_batch)))
summary(pca)$importance[,1:5] %>% pander
| Â | PC1 | PC2 | PC3 | PC4 | PC5 |
|---|---|---|---|---|---|
| Standard deviation | 15.8 | 15.33 | 14.62 | 13.91 | 13.48 |
| Proportion of Variance | 0.04813 | 0.04528 | 0.04119 | 0.03729 | 0.03502 |
| Cumulative Proportion | 0.04813 | 0.09341 | 0.1346 | 0.1719 | 0.2069 |
ggplot(
data = data.frame(pca$x, cells = groups, samples = groups,
batch = factor(batch)),
aes(x = PC1, y = PC2, label = samples)
) +
geom_point(aes(color = batch), size = 3) +
geom_text(size = 3)
We test how strongly the batch variable associates with PC1–PC3.
lmp <- function(modelobject){
f <- summary(modelobject)$fstatistic
p <- pf(f[1], f[2], f[3], lower.tail = FALSE)
attributes(p) <- NULL
p
}
covariates <- c("batch")
for (covariate in covariates){
df <- data.frame(batch = batch, pca$x)
for (pc in c("PC1","PC2","PC3")){
model <- lm(as.numeric(df[[pc]]) ~ factor(df[[covariate]]))
rsq <- signif(summary(model)$adj.r.squared, 5)
pv <- signif(lmp(model), 5)
print(paste(covariate, "explains", rsq, "of", pc, "p-value", pv))
}
}
## [1] "batch explains 0.35193 of PC1 p-value 2.8222e-07"
## [1] "batch explains -0.016783 of PC2 p-value 0.92213"
## [1] "batch explains -0.006913 of PC3 p-value 0.44623"
plotMDS(
mtx_with_batch,
col = ifelse(batch == 1, "red", "blue"),
main = "MDS (Before Batch Removal)"
)
library(Rtsne)
tsne <- Rtsne(
(scale(t(mtx))),
dims = 2,
perplexity = 20,
verbose = TRUE,
max_iter = 500
)
## Performing PCA
## Read the 61 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.983607)!
## Learning embedding...
## Iteration 50: error is 53.837079 (50 iterations in 0.00 seconds)
## Iteration 100: error is 55.901960 (50 iterations in 0.00 seconds)
## Iteration 150: error is 53.315193 (50 iterations in 0.00 seconds)
## Iteration 200: error is 55.352412 (50 iterations in 0.00 seconds)
## Iteration 250: error is 55.316131 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.540285 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.113548 (50 iterations in 0.00 seconds)
## Iteration 400: error is 1.033850 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.856921 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.715258 (50 iterations in 0.00 seconds)
## Fitting performed in 0.01 seconds.
colors <- rainbow(length(unique(groups)))
scores <- as.data.frame(tsne$Y)
rownames(scores) <- colnames(mtx)
colnames(scores) <- c("Comp.1", "Comp.2")
scores$groups <- groups
ggplot(scores, aes(x = Comp.1, y = Comp.2,
label = groups, color = groups)) +
geom_point(size = 2) +
geom_text_repel(size = 2) +
geom_hline(yintercept = 0, color = "gray65") +
geom_vline(xintercept = 0, color = "gray65") +
theme(plot.title = element_text(face="bold"))
scores <- as.data.frame(pca$x)
scatterplot3d(
scores[,c("PC1","PC2","PC3")],
color = as.numeric(factor(batch))
)
rename <- function(i){
if (i < 10) return(paste0("000", i, "plot.png"))
if (i < 100) return(paste0("00", i, "plot.png"))
paste0("0", i, "plot.png")
}
frames <- 360
for (i in 1:frames){
name <- rename(i)
png(name)
s3d <- scatterplot3d(
scores[,c("PC1","PC2","PC3")],
main = paste("Angle", i),
angle = i,
pch = 19, cex.symbols = 0.5,
color = as.numeric(factor(batch))
)
coords <- s3d$xyz.convert(scores[,c("PC1","PC2","PC3")])
text(coords$x, coords$y, labels = groups, pos = 2, cex = 0.7)
dev.off()
}
system('magick convert *.png -delay 1 -loop 0 3d.gif')
system("rm *.png")