We simulate raw counts for two samples (e.g., Control and Treatment).
set.seed(123)
n_genes <- 1000
# Skewed distribution of mean expression (many low, few high)
lambdas <- round(rlnorm(n_genes, meanlog = 1, sdlog = 1.2))
# Simulate counts for two samples
counts <- data.frame(
Gene = paste0("Gene", 1:n_genes),
Control = rpois(n_genes, lambdas),
Treatment = rpois(n_genes, lambdas * runif(n_genes, 0.8, 1.2)) # add noise
)
head(counts, 10)
## Gene Control Treatment
## 1 Gene1 0 0
## 2 Gene2 1 1
## 3 Gene3 13 15
## 4 Gene4 3 4
## 5 Gene5 3 3
## 6 Gene6 13 15
## 7 Gene7 7 3
## 8 Gene8 0 0
## 9 Gene9 1 1
## 10 Gene10 2 0
RNA-seq counts vary over several orders of magnitude. Let’s visualize:
library(ggplot2)
library(dplyr)
library(tidyr)
# Reshape to long format
counts_long <- counts %>%
pivot_longer(cols = c("Control", "Treatment"),
names_to = "Sample",
values_to = "Counts")
# Density plot (log scale helps with skewed counts)
ggplot(counts_long, aes(x = Counts, fill = Sample)) +
geom_density(alpha = 0.4) +
scale_fill_manual(values = c("steelblue", "tomato")) +
# scale_x_log10() +
theme_bw() +
labs(title = "Distribution of RNA-seq counts",
x = "Counts",
y = "Density")
We use log2(x + 1) to avoid taking log of zero.
log_counts <- counts
log_counts$Control <- log2(counts$Control + 1)
log_counts$Treatment <- log2(counts$Treatment + 1)
head(log_counts)
## Gene Control Treatment
## 1 Gene1 0.000000 0.000000
## 2 Gene2 1.000000 1.000000
## 3 Gene3 3.807355 4.000000
## 4 Gene4 2.000000 2.321928
## 5 Gene5 2.000000 2.000000
## 6 Gene6 3.807355 4.000000
Compare the distribution of counts before and after transformation.
par(mfrow = c(1,2))
boxplot(counts$Control, counts$Treatment,
names = c("Control","Treatment"),
main = "Raw counts", col = "lightgray")
boxplot(log_counts$Control, log_counts$Treatment,
names = c("Control","Treatment"),
main = "Log2-transformed counts", col = "lightgreen")
Compute fold changes before and after log2 transformation.
counts$FoldChange <- counts$Treatment / (counts$Control + 1)
log_counts$Log2FC <- log_counts$Treatment - log_counts$Control
head(counts[, c("Gene","Control","Treatment","FoldChange")])
## Gene Control Treatment FoldChange
## 1 Gene1 0 0 0.000000
## 2 Gene2 1 1 0.500000
## 3 Gene3 13 15 1.071429
## 4 Gene4 3 4 1.000000
## 5 Gene5 3 3 0.750000
## 6 Gene6 13 15 1.071429
head(log_counts[, c("Gene","Control","Treatment","Log2FC")])
## Gene Control Treatment Log2FC
## 1 Gene1 0.000000 0.000000 0.0000000
## 2 Gene2 1.000000 1.000000 0.0000000
## 3 Gene3 3.807355 4.000000 0.1926451
## 4 Gene4 2.000000 2.321928 0.3219281
## 5 Gene5 2.000000 2.000000 0.0000000
## 6 Gene6 3.807355 4.000000 0.1926451
Fold change (FC): ratio of raw counts.
Log2 fold change (log2FC): difference between log2-transformed counts.
Easier to interpret:
plot(counts$FoldChange, log_counts$Log2FC,
xlab = "Fold Change (ratio)",
ylab = "Log2 Fold Change",
pch = 19, col = "purple")
abline(h = c(-1,0,1), lty = 2, col = "gray")
library(ggplot2)
library(dplyr)
library(tidyr)
# log2-transform (+1 to avoid log(0))
log_counts <- counts %>%
mutate(Control = log2(Control + 1),
Treatment = log2(Treatment + 1))
# sqrt-transform
sqrt_counts <- counts %>%
mutate(Control = sqrt(Control),
Treatment = sqrt(Treatment))
# Convert to long format
counts_long <- counts %>%
select(Control, Treatment) %>%
pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
mutate(Type = "Raw")
log_counts_long <- log_counts %>%
select(Control, Treatment) %>%
pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
mutate(Type = "Log2 (+1)")
sqrt_counts_long <- sqrt_counts %>%
select(Control, Treatment) %>%
pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
mutate(Type = "Sqrt")
# Combine all
all_counts_long <- bind_rows(counts_long, log_counts_long, sqrt_counts_long)
# Plot densities
ggplot(all_counts_long, aes(x = Counts, fill = Sample)) +
geom_density(alpha = 0.4) +
facet_wrap(~Type, scales = "free", ncol = 3) +
scale_fill_manual(values = c("steelblue", "tomato")) +
theme_bw() +
labs(title = "Effect of transformations on RNA-seq counts",
x = "Counts",
y = "Density")
Conclusion: