1 Example: Raw RNA-seq counts

We simulate raw counts for two samples (e.g., Control and Treatment).

set.seed(123)
n_genes <- 1000

# Skewed distribution of mean expression (many low, few high)
lambdas <- round(rlnorm(n_genes, meanlog = 1, sdlog = 1.2))

# Simulate counts for two samples
counts <- data.frame(
  Gene = paste0("Gene", 1:n_genes),
  Control = rpois(n_genes, lambdas),
  Treatment = rpois(n_genes, lambdas * runif(n_genes, 0.8, 1.2)) # add noise
)

head(counts, 10)
##      Gene Control Treatment
## 1   Gene1       0         0
## 2   Gene2       1         1
## 3   Gene3      13        15
## 4   Gene4       3         4
## 5   Gene5       3         3
## 6   Gene6      13        15
## 7   Gene7       7         3
## 8   Gene8       0         0
## 9   Gene9       1         1
## 10 Gene10       2         0

2 Raw counts are highly skewed

RNA-seq counts vary over several orders of magnitude. Let’s visualize:

library(ggplot2)
library(dplyr)
library(tidyr)

# Reshape to long format
counts_long <- counts %>%
  pivot_longer(cols = c("Control", "Treatment"),
               names_to = "Sample",
               values_to = "Counts")

# Density plot (log scale helps with skewed counts)
ggplot(counts_long, aes(x = Counts, fill = Sample)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(values = c("steelblue", "tomato")) +
  # scale_x_log10() +
  theme_bw() +
  labs(title = "Distribution of RNA-seq counts",
       x = "Counts",
       y = "Density")

3 Log2 transformation

We use log2(x + 1) to avoid taking log of zero.

log_counts <- counts
log_counts$Control <- log2(counts$Control + 1)
log_counts$Treatment <- log2(counts$Treatment + 1)
head(log_counts)
##    Gene  Control Treatment
## 1 Gene1 0.000000  0.000000
## 2 Gene2 1.000000  1.000000
## 3 Gene3 3.807355  4.000000
## 4 Gene4 2.000000  2.321928
## 5 Gene5 2.000000  2.000000
## 6 Gene6 3.807355  4.000000

4 Effect of log2 transformation

Compare the distribution of counts before and after transformation.

par(mfrow = c(1,2))
boxplot(counts$Control, counts$Treatment,
        names = c("Control","Treatment"),
        main = "Raw counts", col = "lightgray")
boxplot(log_counts$Control, log_counts$Treatment,
        names = c("Control","Treatment"),
        main = "Log2-transformed counts", col = "lightgreen")

5 Fold changes

Compute fold changes before and after log2 transformation.

counts$FoldChange <- counts$Treatment / (counts$Control + 1)
log_counts$Log2FC <- log_counts$Treatment - log_counts$Control

head(counts[, c("Gene","Control","Treatment","FoldChange")])
##    Gene Control Treatment FoldChange
## 1 Gene1       0         0   0.000000
## 2 Gene2       1         1   0.500000
## 3 Gene3      13        15   1.071429
## 4 Gene4       3         4   1.000000
## 5 Gene5       3         3   0.750000
## 6 Gene6      13        15   1.071429
head(log_counts[, c("Gene","Control","Treatment","Log2FC")])
##    Gene  Control Treatment    Log2FC
## 1 Gene1 0.000000  0.000000 0.0000000
## 2 Gene2 1.000000  1.000000 0.0000000
## 3 Gene3 3.807355  4.000000 0.1926451
## 4 Gene4 2.000000  2.321928 0.3219281
## 5 Gene5 2.000000  2.000000 0.0000000
## 6 Gene6 3.807355  4.000000 0.1926451

6 Visualization of fold changes

plot(counts$FoldChange, log_counts$Log2FC,
     xlab = "Fold Change (ratio)",
     ylab = "Log2 Fold Change",
     pch = 19, col = "purple")
abline(h = c(-1,0,1), lty = 2, col = "gray")

7 Square root transformation

library(ggplot2)
library(dplyr)
library(tidyr)

# log2-transform (+1 to avoid log(0))
log_counts <- counts %>%
  mutate(Control = log2(Control + 1),
         Treatment = log2(Treatment + 1))

# sqrt-transform
sqrt_counts <- counts %>%
  mutate(Control = sqrt(Control),
         Treatment = sqrt(Treatment))

# Convert to long format
counts_long <- counts %>%
  select(Control, Treatment) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
  mutate(Type = "Raw")

log_counts_long <- log_counts %>%
  select(Control, Treatment) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
  mutate(Type = "Log2 (+1)")

sqrt_counts_long <- sqrt_counts %>%
  select(Control, Treatment) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Counts") %>%
  mutate(Type = "Sqrt")

# Combine all
all_counts_long <- bind_rows(counts_long, log_counts_long, sqrt_counts_long)

# Plot densities
ggplot(all_counts_long, aes(x = Counts, fill = Sample)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~Type, scales = "free", ncol = 3) +
  scale_fill_manual(values = c("steelblue", "tomato")) +
  theme_bw() +
  labs(title = "Effect of transformations on RNA-seq counts",
       x = "Counts",
       y = "Density")

Conclusion: