1 Purpose

This document reads six processed expression datasets (Excel workbooks), merges them by probe ID, visualizes batch effects with sample-wise boxplots and sample-sample correlograms, applies ComBat and SVA batch correction, compares correlation summaries across stages, and writes the ComBat-corrected matrix together with phenodata to a single Excel workbook.

2 0. Packages & paths

# Install packages if you don't have them (uncomment)
# install.packages(c("readxl","tidyverse","matrixStats","corrplot","pheatmap","reshape2"))
# if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager")
# BiocManager::install(c("sva","limma"))

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library(corrplot)
## corrplot 0.95 loaded
library(pheatmap)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
## Loading required package: BiocParallel
library(limma)

Set path to your folder (update if needed):

base_dir <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"
files <- c("E-MTAB-1361_processed.xlsx",
           "GSE12102_processed.xlsx",
           "GSE13433_processed.xlsx",
           "GSE20196_processed.xlsx",
           "GSE32569_processed.xlsx",
           "GSE66533_processed.xlsx")
files <- file.path(base_dir, files)
files
## [1] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx"
## [2] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx"   
## [3] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx"   
## [4] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx"   
## [5] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx"   
## [6] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx"
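
Before reading, an optional existence check can catch a mistyped base_dir early; a minimal sketch using the files vector above:

# Optional sanity check: stop early if any file path is wrong
stopifnot(all(file.exists(files)))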

3 1. Read files and extract expression matrices

Each file is expected to have a sheet named "Expression", whose last two columns are probe_id and gene_symbol. We will read that sheet from each file, coerce the sample columns to numeric, keep probe_id for merging, and store gene_symbol as an attribute for later mapping.

read_expr <- function(path) {
  # Read the "Expression" sheet
  df <- readxl::read_excel(path, sheet = "Expression")
  df <- as.data.frame(df) # ensure data.frame
  # Expect last two cols to be probe_id and gene_symbol:
  n <- ncol(df)
  probe_col <- names(df)[n-1]
  gene_symbol_col <- names(df)[n]
  if (!all(c(probe_col, gene_symbol_col) %in% c("probe_id","gene_symbol"))) {
    # if columns are named differently, attempt to rename them accordingly:
    # We assume user said last two columns are probe_id and gene_symbol
    names(df)[(n-1):n] <- c("probe_id","gene_symbol")
  }
  # Move probe_id to first column (for easier merges)
  # Sample columns: everything except last two
  sample_df <- df[, setdiff(names(df), c("probe_id","gene_symbol")), drop = FALSE]
  # Convert sample columns to numeric (in case they are character)
  sample_df <- as.data.frame(lapply(sample_df, function(x) as.numeric(as.character(x))))
  # Add probe_id
  out <- cbind(probe_id = df$probe_id, sample_df)
  # Return: data.frame with probe_id and sample columns, and gene_symbol for later mapping
  attr(out, "gene_symbol") <- df$gene_symbol
  out
}

# Read all datasets into list
datasets <- list()
dataset_names <- basename(files) %>% tools::file_path_sans_ext()
for (i in seq_along(files)) {
  message("Reading: ", files[i])
  datasets[[ dataset_names[i] ]] <- read_expr(files[i])
}
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx
# Build a merged table by probe_id (full join)
merged <- datasets[[1]]
for (i in 2:length(datasets)) {
  merged <- full_join(merged, datasets[[i]], by = "probe_id")
}

# mergedf <- function(x, y) { merge(x, y, by = "probe_id") }
# merged <- Reduce(mergedf, datasets)
# 
# merged <- Reduce(function(x, y) merge(x, y, by = "probe_id") , datasets)

# Inspect merged structure
cat("Merged dimensions (rows probes x cols samples+probe):", dim(merged), "\n")
## Merged dimensions (rows probes x cols samples+probe): 54675 174
head(merged[,1:6])
##    probe_id X20100429_PF_01.CEL X20100429_PF_02.CEL X20100429_PF_03.CEL
## 1 1007_s_at           10.847742           10.424886           10.403364
## 2   1053_at            8.387543            7.620070            7.737997
## 3    117_at            5.404436            4.727790            5.454780
## 4    121_at            7.714166            7.606236            7.340357
## 5 1255_g_at            3.469701            3.448380            3.445810
## 6   1294_at            9.354246            9.292688            8.933648
##   X20100429_PF_06.CEL X20100429_PF_07.CEL
## 1           10.703058           10.075131
## 2            7.453109            7.513165
## 3            6.596276            7.138303
## 4            9.471454            7.644979
## 5            3.526868            3.212224
## 6            8.708730            8.898202
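
Because the datasets come from different platforms, the full join can introduce NAs where a probe is absent from a given study. A quick optional check of missingness per sample column, using the merged table above:

# Count NAs introduced by the full join, per sample column
na_per_sample <- colSums(is.na(merged[, setdiff(names(merged), "probe_id"), drop = FALSE]))
summary(na_per_sample)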

4 2. Build expression matrix (genes × samples) and batch vector

We will derive sample names prefixed with their dataset of origin, build a batch vector recording each sample's source dataset, and convert the merged table to a numeric genes × samples matrix.

# probe ids
probes <- merged$probe_id

# expression matrix: all columns except probe_id (there may be NA columns if different datasets had different sample names)
expr_df <- merged %>% select(-probe_id)

# Keep only numeric columns (some merges may produce non-numeric columns)
is_num <- sapply(expr_df, is.numeric)
expr_df <- expr_df[, is_num, drop = FALSE]

# Set colnames: may currently be duplicate or NA if two datasets used same sample names.
# We'll ensure uniqueness by prefixing with dataset name where necessary:
# Build vector of original sample names by scanning each dataset and recording their source
sample_names <- c()
sample_batches <- c()
for (nm in dataset_names) {
  d <- datasets[[nm]]
  # sample columns in d (excluding probe_id)
  sn <- names(d)[names(d) != "probe_id"]
  # prefix with dataset short name to avoid collisions
  sn_pref <- paste0(nm, "_", sn)
  sample_names <- c(sample_names, sn_pref)
  sample_batches <- c(sample_batches, rep(nm, length(sn)))
}

# If sample_names length doesn't match expr_df columns, try to align by position:
if (length(sample_names) != ncol(expr_df)) {
  # fallback: assign generic sample names
  sample_names <- paste0("Sample", seq_len(ncol(expr_df)))
  sample_batches <- rep("batch_unknown", length(sample_names))
  warning("Could not derive sample names from datasets; using generic names. Check merging logic.")
}

colnames(expr_df) <- sample_names
rownames(expr_df) <- probes

# Convert to matrix
expr_mat <- as.matrix(expr_df)
mode(expr_mat) <- "numeric"

# Build batch factor matching columns
batch <- factor(sample_batches)
names(batch) <- colnames(expr_mat)

cat("Expression matrix dims:", dim(expr_mat), "\n")
## Expression matrix dims: 54675 173
table(batch)
## batch
## E-MTAB-1361_processed    GSE12102_processed    GSE13433_processed 
##                    16                    37                    16 
##    GSE20196_processed    GSE32569_processed    GSE66533_processed 
##                    34                    12                    58

5 3. Filter rows (optional) and log2 transform for visualization

Remove probes with all NA or zero, then log2(x + 1) transform for downstream visualization.

# Remove probes with all NA or zero across samples
valid <- rowSums(!is.na(expr_mat)) >= 1 & rowSums(expr_mat, na.rm = TRUE) > 0
expr_mat_filt <- expr_mat[valid, , drop = FALSE]

# Replace remaining NAs with 0 for visualization (a simplification; keep this in mind downstream)
expr_mat_filt[is.na(expr_mat_filt)] <- 0

# Log2 transform (+1)
log2_mat <- log2(expr_mat_filt + 1)

dim(log2_mat)
## [1] 54675   173

6 4. Boxplots colored by batch (raw / log2)

We melt the matrix and plot sample-wise boxplots, coloring boxes by batch.

library(tidyr)
library(dplyr)

df_long <- log2_mat %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])  # attach batch info

# order samples by batch for easier visual grouping
# sample_order <- names(batch)[order(batch)]
# df_long$sample <- factor(df_long$sample, levels = sample_order)

ggplot(df_long, aes(x = sample, y = log2expr, color = batch, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(
    title = "Log2(expression + 1) — raw merged data",
    x = "Sample",
    y = "log2(expr + 1)"
  )

7 5. Correlogram (top 1000 most variable genes)

Compute per-gene variance, select top 1000 variable genes, compute sample–sample Pearson correlation, display correlogram and print summary stats (off-diagonal correlations).

# compute row variances
rv <- matrixStats::rowVars(log2_mat)
ord <- order(rv, decreasing = TRUE)
topN <- min(1000, sum(!is.na(rv)))
top_idx <- ord[1:topN]

mat_top <- log2_mat[top_idx, , drop = FALSE]

# sample-sample correlation
cor_mat <- cor(mat_top, use = "pairwise.complete.obs", method = "pearson")

# correlogram
corrplot::corrplot(cor_mat, method = "color", type = "upper", 
                   tl.pos = 'n', addgrid.col = NA, outline = FALSE, add = FALSE,
                   tl.cex = 0.7, tl.col = "black", addCoef.col="black", number.cex=0.6)

pheatmap::pheatmap(
  cor_mat,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

# summary statistics of off-diagonal correlations
off_diag <- cor_mat[upper.tri(cor_mat)]
cat("Correlation summary (top", topN, "genes):\n")
## Correlation summary (top 1000 genes):
summary(off_diag)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.17504  0.06151  0.14316  0.22682  0.29663  0.99559
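
As a rough gauge of the batch effect, we can also compare mean within-batch and between-batch correlations; a sketch using the objects above (it assumes the columns of cor_mat are in the same order as batch):

# Within- vs between-batch mean correlation (off-diagonal entries only)
stopifnot(identical(colnames(cor_mat), names(batch)))
same_batch <- outer(as.character(batch), as.character(batch), "==")
ut <- upper.tri(cor_mat)
cat("Mean within-batch correlation: ", round(mean(cor_mat[ut & same_batch]), 3), "\n")
cat("Mean between-batch correlation:", round(mean(cor_mat[ut & !same_batch]), 3), "\n")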

8 6. ComBat batch correction

We use sva::ComBat() to adjust for known batches. We do not provide an explicit model matrix (mod = NULL) because there are no additional biological covariates here. If you have biological covariates (e.g., condition), include them in mod so ComBat does not remove that signal.
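
For illustration, if a biological grouping were available, mod could be built as in the sketch below; the condition factor here is made up purely for demonstration, and the ComBat call mirroring the one in the next chunk is left commented so it is not evaluated.

# Hypothetical sketch: 'condition' is an illustrative factor, not real phenotype data
condition <- factor(rep(c("tumor", "normal"), length.out = ncol(log2_mat)))
mod_bio <- model.matrix(~ condition)
head(mod_bio)
# combat_with_mod <- sva::ComBat(dat = log2_mat, batch = batch_vec, mod = mod_bio)  # not run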

# Prepare data for ComBat: ComBat expects genes x samples (matrix)
combat_input <- as.matrix(log2_mat)  # currently genes × samples

# Use ComBat (parametric empirical Bayes) to adjust for batch
batch_vec <- as.character(batch)
mod <- NULL  # no biological covariates in this example; modify if you have them

combat_corrected <- sva::ComBat(dat = combat_input, batch = batch_vec, mod = mod, par.prior = TRUE, prior.plots = FALSE)
## Found6batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# sanity check dims
dim(combat_corrected)
## [1] 54675   173

9 7. Visualize ComBat results: boxplots and correlogram

library(tidyr)
library(dplyr)

df_combat <- combat_corrected %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])

# Boxplots after ComBat
ggplot(df_combat, aes(x = sample, y = log2expr, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(title = "Log2(expr + 1) — after ComBat", x = "Sample", y = "log2(expr + 1)")

# Correlogram after ComBat
mat_top_combat <- combat_corrected[top_idx, , drop = FALSE]
cor_mat_combat <- cor(mat_top_combat, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(
  cor_mat_combat,
  method = "color",
  type = "upper",
  tl.pos = "n",          # disable text labels
  addCoef.col = "black",
  number.cex = 0.6
)

pheatmap::pheatmap(
  cor_mat_combat,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

off_diag_combat <- cor_mat_combat[upper.tri(cor_mat_combat)]
cat("ComBat-corrected correlation summary (top", topN, "genes):\n")
## ComBat-corrected correlation summary (top 1000 genes):
summary(off_diag_combat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.07934  0.38089  0.45003  0.44590  0.51794  0.97292

10 8. SVA: estimate surrogate variables and remove their effects

We will estimate surrogate variables with sva::svaseq() and then remove them via limma::removeBatchEffect() (which regresses out covariates / surrogate variables). In a typical analysis you would include known biological variables in the mod matrix so that svaseq finds variation orthogonal to your variables of interest.
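
Note that svaseq() was designed for sequencing counts; for log-scale microarray intensities, sva::sva() (same mod / mod0 interface) is the more conventional choice. A minimal sketch for reference, using the objects built in the chunk below (left commented so it is not evaluated here):

# Alternative sketch: surrogate variable analysis on log-scale data with sva::sva()
# sv_obj  <- sva::sva(dat = log2_mat, mod = mod, mod0 = mod0, n.sv = n.sv)
# sva_alt <- limma::removeBatchEffect(log2_mat, covariates = sv_obj$sv, design = mod)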

# Prepare data and design matrices for svaseq
# svaseq() expects a genes x samples matrix on a count-like scale; ideally you would
# supply raw counts (or follow the recommendations in the sva vignette).
# Here we use the merged, filtered expression matrix as a demonstration.
dat_for_sva <- as.matrix(expr_mat_filt)

# dat_for_sva <- combat_input  # alternative: the log2-transformed matrix (genes x samples)

# Create a placeholder treatment variable by splitting each batch in half
# (first half = "control", second half = "treatment"); replace with real
# phenotype labels if you have them. Assigning by index keeps the result
# aligned with the original sample order.
treatment <- character(length(batch))
for (idx in split(seq_along(batch), batch)) {
  half <- floor(length(idx) / 2)
  treatment[idx] <- rep(c("control", "treatment"), c(half, length(idx) - half))
}

# Known biological covariates
mod <- model.matrix(~treatment, data = data.frame(treatment = treatment))

mod0 <- model.matrix(~1, data = data.frame(batch = batch))  # null model (intercept-only)
# Estimate number of surrogate variables automatically (Leek method)
n.sv <- num.sv(dat_for_sva, mod, method = "leek")
cat("Estimated number of surrogate variables:", n.sv, "\n")
## Estimated number of surrogate variables: 4
# Run svaseq to estimate surrogate variables
svseq <- svaseq(dat = dat_for_sva, mod = mod, mod0 = mod0, n.sv = n.sv)
## Number of significant surrogate variables is:  4 
## Iteration (out of 5 ):1  2  3  4  5
str(svseq$sv)
##  num [1:173, 1:4] -0.0793 -0.0828 -0.0797 -0.0767 -0.0763 ...
# Remove surrogate variable effects using limma::removeBatchEffect (treat SVs as covariates).
# If you want to protect a biological effect of interest, also pass design = mod.
sv_mat <- svseq$sv
sva_corrected <- limma::removeBatchEffect(dat_for_sva, covariates = sv_mat)
## design matrix of interest not specified. Assuming a one-group experiment.
# Visualize boxplots after SVA correction
df_sva <- sva_corrected %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])

ggplot(df_sva, aes(x = sample, y = log2expr, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(title = sprintf("Expression — after SVA correction (n.sv=%d)", n.sv),
       x = "Sample", y = "expression")

# Correlogram after SVA
mat_top_sva <- sva_corrected[top_idx, , drop = FALSE]
cor_mat_sva <- cor(mat_top_sva, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(cor_mat_sva, method = "color", type = "upper", tl.cex = 0.7, tl.pos = "n")

pheatmap::pheatmap(
  cor_mat_sva,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

off_diag_sva <- cor_mat_sva[upper.tri(cor_mat_sva)]
cat("SVA-corrected correlation summary (top", topN, "genes):\n")
## SVA-corrected correlation summary (top 1000 genes):
summary(off_diag_sva)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.04122  0.32613  0.39813  0.39653  0.46703  0.99220

11 9. Summary table of correlation summaries

summary_df <- data.frame(
  stage = c("raw", "ComBat", "SVA"),
  mean_cor = c(mean(off_diag), mean(off_diag_combat), mean(off_diag_sva)),
  median_cor = c(median(off_diag), median(off_diag_combat), median(off_diag_sva)),
  min_cor = c(min(off_diag), min(off_diag_combat), min(off_diag_sva)),
  max_cor = c(max(off_diag), max(off_diag_combat), max(off_diag_sva))
)
summary_df
##    stage  mean_cor median_cor     min_cor   max_cor
## 1    raw 0.2268240  0.1431615 -0.17503699 0.9955896
## 2 ComBat 0.4458957  0.4500257 -0.07934069 0.9729193
## 3    SVA 0.3965255  0.3981312 -0.04121872 0.9921982
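
The same comparison can be shown graphically; a small sketch using the off-diagonal correlation vectors computed above:

# Distributions of sample-sample correlations at each stage
cor_long <- data.frame(
  stage = factor(rep(c("raw", "ComBat", "SVA"),
                     times = c(length(off_diag), length(off_diag_combat), length(off_diag_sva))),
                 levels = c("raw", "ComBat", "SVA")),
  correlation = c(off_diag, off_diag_combat, off_diag_sva)
)
ggplot(cor_long, aes(x = stage, y = correlation, fill = stage)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  labs(title = "Off-diagonal sample-sample correlations by stage", x = NULL, y = "Pearson r")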

12 10. Notes & recommendations

Finally, we map gene symbols back onto the ComBat-corrected matrix, read the "Phenodata" sheet from each file, and write everything into a single Excel workbook (combat_corrected.xlsx).

library(readxl)
library(dplyr)
library(writexl)

# Folder with data
data_folder <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"

files <- c(
  "E-MTAB-1361_processed.xlsx",
  "GSE12102_processed.xlsx",
  "GSE13433_processed.xlsx",
  "GSE20196_processed.xlsx",
  "GSE32569_processed.xlsx",
  "GSE66533_processed.xlsx"
)

# Dataset names (same order)
dataset_names <- c("E-MTAB-1361", "GSE12102", "GSE13433", "GSE20196", "GSE32569", "GSE66533")

# 1. Add probe_id and gene_symbol columns back to combat_corrected.
# Map gene symbols by probe_id rather than relying on row order, and keep
# probe_id as a column so it survives the Excel export (write_xlsx drops rownames).
gene_map <- setNames(attr(datasets[[1]], "gene_symbol"), datasets[[1]]$probe_id)
combat_corrected_df <- as.data.frame(combat_corrected)
combat_corrected_df <- cbind(probe_id = rownames(combat_corrected_df),
                             combat_corrected_df,
                             gene_symbol = unname(gene_map[rownames(combat_corrected_df)]))

# Add to the list
datasets[["combat_corrected"]] <- combat_corrected_df

# 2. Read Phenodata sheets
phenodata_list <- lapply(files, function(f) {
  readxl::read_excel(file.path(data_folder, f), sheet = "Phenodata")
})

# Name phenodata list by dataset names
names(phenodata_list) <- dataset_names

# Append to datasets list
datasets <- c(datasets, phenodata_list)

# 3. Save full list into Excel
writexl::write_xlsx(datasets, path = file.path(data_folder, "combat_corrected.xlsx"))
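
A quick optional check that the workbook was written with the expected sheets:

# List sheet names in the saved workbook
readxl::excel_sheets(file.path(data_folder, "combat_corrected.xlsx"))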