—title: “MA plot and TMM normalization — step-by-step demo” author: “Mikhail Dozmorov” date: “2025-09-23” output: html_document: toc: true number_sections: true —
This document:
edgeR::calcNormFactors().# Install if needed:
# if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("edgeR")
library(edgeR)
library(ggplot2)
library(gridExtra)
set.seed(2024)
G <- 10000 # genes
S <- 6 # samples
# baseline expression (varying gene means)
base_means <- rgamma(G, shape = 2, scale = 50)
# library size multipliers
lib_factors <- c(1.0, 0.9, 1.1, 1.2, 0.95, 1.4)
lib_sizes <- round(lib_factors * 10e6) # raw sequencing depths
# simulate counts per-sample via NB (some overdispersion)
dispersion <- 20
counts <- matrix(0, nrow = G, ncol = S)
for (s in 1:S){
mu <- base_means * lib_factors[s]
counts[,s] <- rnbinom(G, mu = mu, size = dispersion)
}
colnames(counts) <- paste0("S", 1:S)
rownames(counts) <- paste0("gene", seq_len(G))
# inject composition bias: sample 6 has a small set of hugely expressed genes
set.seed(999)
hi <- sample(1:G, 100)
counts[hi, 6] <- counts[hi, 6] * 50
# quick library sizes
lib_summary <- colSums(counts)
lib_summary
## S1 S2 S3 S4 S5 S6
## 1000237 899395 1104535 1199387 951491 2053329
For two libraries (k and r) we compute:
Genes with zero counts in either library are omitted for the plot/for TMM calculation.
# helper: compute log2-CPM avoiding -Inf (using edgeR::cpm with prior.count)
log2cpm_mat <- function(counts_mat, lib_sizes = colSums(counts_mat), prior.count = 0.5){
# use edgeR cpm (returns counts-per-million)
cpm_mat <- edgeR::cpm(counts_mat, lib.size = lib_sizes, prior.count = prior.count)
log2(cpm_mat + 1e-6) # tiny offset for numerical safety (visualization)
}
# helper: make MA data (M, A) for two columns k and r
make_MA <- function(counts, k, r, lib_sizes = colSums(counts), prior.count = 0.5){
yk <- counts[, k]; yr <- counts[, r]
Nk <- lib_sizes[k]; Nr <- lib_sizes[r]
# remove genes with zero in either (TMM trims these anyway)
valid <- (yk > 0) & (yr > 0)
yk_v <- yk[valid]; yr_v <- yr[valid]
M <- log2( (yk_v / Nk) / (yr_v / Nr) )
A <- 0.5 * log2( (yk_v / Nk) * (yr_v / Nr) )
data.frame(gene = rownames(counts)[valid], M = M, A = A, yk = yk_v, yr = yr_v)
}
# plotting function
plot_MA <- function(ma_df, title = "MA plot", highlight = NULL){
p <- ggplot(ma_df, aes(x = A, y = M)) +
geom_hex(bins = 80) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = title, x = "A = 0.5 * log2( (p_k)*(p_r) )", y = "M = log2( p_k / p_r )") +
theme_minimal()
if (!is.null(highlight)) {
p <- p + geom_point(data = ma_df[rownames(ma_df) %in% highlight, , drop = FALSE],
aes(x = A, y = M), color = "orange", size = 1)
}
p
}
Pick a reference sample. edgeR chooses the sample whose upper quartile CPM is closest to the mean upper quartile; we replicate that to pick the ref.
# choose reference as edgeR would (upper-quartile closeness)
uq <- apply(edgeR::cpm(counts, prior.count = 0.5), 2, function(x) quantile(x, 0.75))
ref <- which.min(abs(uq - mean(uq)))
ref
## S3
## 3
# plot MA: sample 6 vs reference
ma_6_vs_ref <- make_MA(counts, k = 6, r = ref)
p_ma_before <- plot_MA(ma_6_vs_ref, title = paste0("MA: sample 6 vs ref (S", ref, ") — BEFORE TMM"))
p_ma_before
Interpretation: See a global shift away from \(M=0\) if composition bias exists (we injected such bias into sample 6).
Steps implemented below (mirrors the algorithm in Robinson & Oshlack, 2010):
Compute \(M_g\) and \(A_g\) for genes with positive counts in both libraries.
Trim genes with extreme \(M\) (default 30% at each tail) and extreme \(A\) (default 5% at each tail). The double trim keeps genes that survive both trims.
Compute approximate variance of \(M_g\) by delta method (for a binomial proportion):
\[ \operatorname{Var}(M_g) \approx \left(\frac{1}{y_{gk}} - \frac{1}{N_k}\right) + \left(\frac{1}{y_{gr}} - \frac{1}{N_r}\right). \]
Then weights = inverse variance.
Weighted mean of kept \(M_g\) gives the log2 normalization factor for sample \(k\) vs reference \(r\): \(\widehat{\log_2 f_k} = \dfrac{\sum_{g\in G^*} w_g M_g}{\sum_{g\in G^*} w_g}\). Then \(f_k = 2^{\widehat{\log_2 f_k}}\).
# parameters
trim.M <- 0.30 # 30% trimming for M
trim.A <- 0.05 # 5% trimming for A
# function to compute manual TMM factor for sample k relative to ref r
calc_TMM_manual <- function(counts, k, r, lib_sizes = colSums(counts),
trim.M = 0.30, trim.A = 0.05){
yk <- counts[,k]; yr <- counts[,r]
Nk <- lib_sizes[k]; Nr <- lib_sizes[r]
valid <- (yk > 0) & (yr > 0)
yk_v <- yk[valid]; yr_v <- yr[valid]
M <- log2( (yk_v / Nk) / (yr_v / Nr) )
A <- 0.5 * log2( (yk_v / Nk) * (yr_v / Nr) )
# compute delta-method approximate variance of M
varM <- (1 / yk_v - 1 / Nk) + (1 / yr_v - 1 / Nr)
wg <- 1 / varM
# trimming
m_low <- quantile(M, probs = trim.M); m_high <- quantile(M, probs = 1 - trim.M)
a_low <- quantile(A, probs = trim.A); a_high <- quantile(A, probs = 1 - trim.A)
keep <- (M > m_low) & (M < m_high) & (A > a_low) & (A < a_high) & is.finite(wg)
# weighted mean of M over kept genes
w <- wg[keep]
wm <- if (sum(w) > 0) sum( w * M[keep] ) / sum(w) else NA_real_
# log2 factor -> linear factor
list(log2_factor = wm, factor = 2^wm, keep_indices = which(valid)[keep],
M = M, A = A, wg = wg)
}
# compute for sample 6 relative to chosen ref
tmm6 <- calc_TMM_manual(counts, k = 6, r = ref, lib_sizes = colSums(counts),
trim.M = trim.M, trim.A = trim.A)
tmm6$log2_factor
## [1] -0.5393223
tmm6$factor
## [1] 0.688094
Print the result and show the trimmed genes on the MA plot (orange):
# highlight trimmed genes on MA plot (showing kept genes)
kept_genes <- rownames(counts)[tmm6$keep_indices]
p_ma_before + geom_point(data = subset(ma_6_vs_ref, gene %in% kept_genes),
aes(x = A, y = M), color = "cyan", size = 0.6) +
ggtitle(sprintf("MA: S6 vs S%d — BEFORE TMM\nmanual log2 factor = %.4f (factor %.3f)",
ref, tmm6$log2_factor, tmm6$factor))
ref and then scale the manual factors to
have geometric mean = 1 so they are comparable to edgeR
output.edgeR::calcNormFactors() and compare.# manual factors for all samples relative to ref
manual_log2 <- numeric(S)
manual_fac <- numeric(S)
keep_list <- vector("list", S)
for (k in 1:S){
if (k == ref){
manual_log2[k] <- 0
manual_fac[k] <- 1
keep_list[[k]] <- integer(0)
} else {
res <- calc_TMM_manual(counts, k = k, r = ref, lib_sizes = colSums(counts),
trim.M = trim.M, trim.A = trim.A)
manual_log2[k] <- res$log2_factor
manual_fac[k] <- res$factor
keep_list[[k]] <- rownames(counts)[res$keep_indices]
}
}
# scale manual factors so that geometric mean = 1 (edgeR reports factors with geo mean ~1)
gm <- exp(mean(log(manual_fac)))
manual_norm_factors <- manual_fac / gm
# edgeR calcNormFactors (specify refColumn to match our chosen ref if desired)
dge <- DGEList(counts = counts)
dge$samples$lib.size <- colSums(counts)
# specify refColumn so edgeR computes factors relative to that ref (for close comparability)
edgef <- calcNormFactors(dge, method = "TMM", refColumn = ref)
edge_factors <- edgef$samples$norm.factors
# report
data.frame(sample = colnames(counts),
lib.size = colSums(counts),
manual_factor = manual_norm_factors,
edgeR_factor = edge_factors)
## sample lib.size manual_factor edgeR_factor
## S1 S1 1000237 1.064607 1.0644877
## S2 S2 899395 1.064210 1.0642979
## S3 S3 1104535 1.064482 1.0645199
## S4 S4 1199387 1.063999 1.0639378
## S5 S5 951491 1.063941 1.0639617
## S6 S6 2053329 0.732464 0.7324873
Interpretation: The manual_factor and
edgeR_factor are very close. Small differences can arise
due to tie handling and small numeric details.
To visualize the effect of normalization on MA, we compute CPM using
effective library sizes:
effective_libsize = lib.size * norm.factor. We’ll use
edgeR’s normalized factors for this visualization.
# effective library sizes using edgeR's factors
eff_libsizes_edgeR <- colSums(counts) * edge_factors
# MA between sample 6 and ref after applying TMM (via effective lib sizes)
ma_after <- make_MA(counts, k = 6, r = ref, lib_sizes = eff_libsizes_edgeR)
p_ma_after <- plot_MA(ma_after, title = paste0("MA: sample 6 vs ref (S", ref, ") — AFTER TMM (edgeR factors)"))
grid.arrange(p_ma_before, p_ma_after, ncol = 2)
Interpretation: After TMM normalization, the bulk of M values should be centered near zero (dashed red line), demonstrating removal of the global composition bias.
log2cpm_before <- log2(edgeR::cpm(counts, lib.size = colSums(counts), prior.count = 0.5) + 1e-6)
log2cpm_after <- log2(edgeR::cpm(counts, lib.size = eff_libsizes_edgeR, prior.count = 0.5) + 1e-6)
df_before <- data.frame(value = as.vector(log2cpm_before), sample = rep(colnames(counts), each = nrow(counts)), stage = "before")
df_after <- data.frame(value = as.vector(log2cpm_after), sample = rep(colnames(counts), each = nrow(counts)), stage = "after")
df_both <- rbind(df_before, df_after)
ggplot(df_both, aes(x = sample, y = value, fill = sample)) +
geom_boxplot(outlier.size = 0.4) +
facet_wrap(~stage) +
theme_minimal() +
labs(title = "log2-CPM distributions: before vs after TMM", y = "log2(CPM)")
edgeR.calcNormFactors() in
practice. Manual implementation is useful for teaching and
validation.