1 Purpose

This document reads six processed expression datasets (Excel workbooks), merges them by probe ID, visualizes batch effects with sample-wise boxplots and sample-sample correlograms, applies ComBat and SVA batch correction, compares correlation summaries across stages, and writes the ComBat-corrected matrix together with phenodata to a single Excel workbook.

2 0. Packages & paths

# Install packages if you don't have them (uncomment)
# install.packages(c("readxl","tidyverse","matrixStats","corrplot","pheatmap","reshape2"))
# if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager")
# BiocManager::install(c("sva","limma"))

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library(corrplot)
## corrplot 0.95 loaded
library(pheatmap)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
## 
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
## 
##     rowSds, rowVars
## Loading required package: BiocParallel
library(limma)

Set path to your folder (update if needed):

base_dir <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"
files <- c("E-MTAB-1361_processed.xlsx",
           "GSE12102_processed.xlsx",
           "GSE13433_processed.xlsx",
           "GSE20196_processed.xlsx",
           "GSE32569_processed.xlsx",
           "GSE66533_processed.xlsx")
files <- file.path(base_dir, files)
files
## [1] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx"
## [2] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx"   
## [3] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx"   
## [4] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx"   
## [5] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx"   
## [6] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx"
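
Before reading, an optional existence check can catch a mistyped base_dir early; a minimal sketch using the files vector above:

# Optional sanity check: stop early if any file path is wrong
stopifnot(all(file.exists(files)))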

3 1. Read files and extract expression matrices

Each file is expected to have a sheet named "Expression", whose last two columns are probe_id and gene_symbol. We will read that sheet from each file, coerce the sample columns to numeric, keep probe_id for merging, and store gene_symbol as an attribute for later mapping.

read_expr <- function(path) {
  # Read the "Expression" sheet
  df <- readxl::read_excel(path, sheet = "Expression")
  df <- as.data.frame(df) # ensure data.frame
  # Expect last two cols to be probe_id and gene_symbol:
  n <- ncol(df)
  probe_col <- names(df)[n-1]
  gene_symbol_col <- names(df)[n]
  if (!all(c(probe_col, gene_symbol_col) %in% c("probe_id","gene_symbol"))) {
    # if columns are named differently, attempt to rename them accordingly:
    # We assume user said last two columns are probe_id and gene_symbol
    names(df)[(n-1):n] <- c("probe_id","gene_symbol")
  }
  # Move probe_id to first column (for easier merges)
  # Sample columns: everything except last two
  sample_df <- df[, setdiff(names(df), c("probe_id","gene_symbol")), drop = FALSE]
  # Convert sample columns to numeric (in case they are character)
  sample_df <- as.data.frame(lapply(sample_df, function(x) as.numeric(as.character(x))))
  # Add probe_id
  out <- cbind(probe_id = df$probe_id, sample_df)
  # Return: data.frame with probe_id and sample columns, and gene_symbol for later mapping
  attr(out, "gene_symbol") <- df$gene_symbol
  out
}

# Read all datasets into list
datasets <- list()
dataset_names <- basename(files) %>% tools::file_path_sans_ext()
for (i in seq_along(files)) {
  message("Reading: ", files[i])
  datasets[[ dataset_names[i] ]] <- read_expr(files[i])
}
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx
# Build a merged table by probe_id (full join)
merged <- datasets[[1]]
for (i in 2:length(datasets)) {
  merged <- full_join(merged, datasets[[i]], by = "probe_id")
}

# mergedf <- function(x, y) { merge(x, y, by = "probe_id") }
# merged <- Reduce(mergedf, datasets)
# 
# merged <- Reduce(function(x, y) merge(x, y, by = "probe_id") , datasets)

# Inspect merged structure
cat("Merged dimensions (rows probes x cols samples+probe):", dim(merged), "\n")
## Merged dimensions (rows probes x cols samples+probe): 54675 174
head(merged[,1:6])
##    probe_id X20100429_PF_01.CEL X20100429_PF_02.CEL X20100429_PF_03.CEL
## 1 1007_s_at           10.847742           10.424886           10.403364
## 2   1053_at            8.387543            7.620070            7.737997
## 3    117_at            5.404436            4.727790            5.454780
## 4    121_at            7.714166            7.606236            7.340357
## 5 1255_g_at            3.469701            3.448380            3.445810
## 6   1294_at            9.354246            9.292688            8.933648
##   X20100429_PF_06.CEL X20100429_PF_07.CEL
## 1           10.703058           10.075131
## 2            7.453109            7.513165
## 3            6.596276            7.138303
## 4            9.471454            7.644979
## 5            3.526868            3.212224
## 6            8.708730            8.898202
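
Because the datasets come from different platforms, the full join can introduce NAs where a probe is absent from a given study. A quick optional check of missingness per sample column, using the merged table above:

# Count NAs introduced by the full join, per sample column
na_per_sample <- colSums(is.na(merged[, setdiff(names(merged), "probe_id"), drop = FALSE]))
summary(na_per_sample)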

4 2. Build expression matrix (genes × samples) and batch vector

We will derive sample names prefixed with their dataset of origin, build a batch vector recording each sample's source dataset, and convert the merged table to a numeric genes × samples matrix.

# probe ids
probes <- merged$probe_id

# expression matrix: all columns except probe_id (there may be NA columns if different datasets had different sample names)
expr_df <- merged %>% select(-probe_id)

# Keep only numeric columns (some merges may produce non-numeric columns)
is_num <- sapply(expr_df, is.numeric)
expr_df <- expr_df[, is_num, drop = FALSE]

# Set colnames: may currently be duplicate or NA if two datasets used same sample names.
# We'll ensure uniqueness by prefixing with dataset name where necessary:
# Build vector of original sample names by scanning each dataset and recording their source
sample_names <- c()
sample_batches <- c()
for (nm in dataset_names) {
  d <- datasets[[nm]]
  # sample columns in d (excluding probe_id)
  sn <- names(d)[names(d) != "probe_id"]
  # prefix with dataset short name to avoid collisions
  sn_pref <- paste0(nm, "_", sn)
  sample_names <- c(sample_names, sn_pref)
  sample_batches <- c(sample_batches, rep(nm, length(sn)))
}

# If sample_names length doesn't match expr_df columns, try to align by position:
if (length(sample_names) != ncol(expr_df)) {
  # fallback: assign generic sample names
  sample_names <- paste0("Sample", seq_len(ncol(expr_df)))
  sample_batches <- rep("batch_unknown", length(sample_names))
  warning("Could not derive sample names from datasets; using generic names. Check merging logic.")
}

colnames(expr_df) <- sample_names
rownames(expr_df) <- probes

# Convert to matrix
expr_mat <- as.matrix(expr_df)
mode(expr_mat) <- "numeric"

# Build batch factor matching columns
batch <- factor(sample_batches)
names(batch) <- colnames(expr_mat)

cat("Expression matrix dims:", dim(expr_mat), "\n")
## Expression matrix dims: 54675 173
table(batch)
## batch
## E-MTAB-1361_processed    GSE12102_processed    GSE13433_processed 
##                    16                    37                    16 
##    GSE20196_processed    GSE32569_processed    GSE66533_processed 
##                    34                    12                    58

5 3. Filter rows (optional) and log2 transform for visualization

Remove probes with all NA or zero, then log2(x + 1) transform for downstream visualization.

# Remove probes with all NA or zero across samples
valid <- rowSums(!is.na(expr_mat)) >= 1 & rowSums(expr_mat, na.rm = TRUE) > 0
expr_mat_filt <- expr_mat[valid, , drop = FALSE]

# Replace remaining NAs with 0 for visualization (a simplification; keep this in mind downstream)
expr_mat_filt[is.na(expr_mat_filt)] <- 0

# Log2 transform (+1)
log2_mat <- log2(expr_mat_filt + 1)

dim(log2_mat)
## [1] 54675   173

6 4. Boxplots colored by batch (raw / log2)

We melt the matrix and plot sample-wise boxplots, coloring boxes by batch.

library(tidyr)
library(dplyr)

df_long <- log2_mat %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])  # attach batch info

# order samples by batch for easier visual grouping
# sample_order <- names(batch)[order(batch)]
# df_long$sample <- factor(df_long$sample, levels = sample_order)

ggplot(df_long, aes(x = sample, y = log2expr, color = batch, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(
    title = "Log2(expression + 1) — raw merged data",
    x = "Sample",
    y = "log2(expr + 1)"
  )

7 5. Correlogram (top 1000 most variable genes)

Compute per-gene variance, select top 1000 variable genes, compute sample–sample Pearson correlation, display correlogram and print summary stats (off-diagonal correlations).

# compute row variances
rv <- matrixStats::rowVars(log2_mat)
ord <- order(rv, decreasing = TRUE)
topN <- min(1000, sum(!is.na(rv)))
top_idx <- ord[1:topN]

mat_top <- log2_mat[top_idx, , drop = FALSE]

# sample-sample correlation
cor_mat <- cor(mat_top, use = "pairwise.complete.obs", method = "pearson")

# correlogram
corrplot::corrplot(cor_mat, method = "color", type = "upper", 
                   tl.pos = 'n', addgrid.col = NA, outline = FALSE, add = FALSE,
                   tl.cex = 0.7, tl.col = "black", addCoef.col="black", number.cex=0.6)

pheatmap::pheatmap(
  cor_mat,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

# summary statistics of off-diagonal correlations
off_diag <- cor_mat[upper.tri(cor_mat)]
cat("Correlation summary (top", topN, "genes):\n")
## Correlation summary (top 1000 genes):
summary(off_diag)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.17504  0.06151  0.14316  0.22682  0.29663  0.99559
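
As a rough gauge of the batch effect, we can also compare mean within-batch and between-batch correlations; a sketch using the objects above (it assumes the columns of cor_mat are in the same order as batch):

# Within- vs between-batch mean correlation (off-diagonal entries only)
stopifnot(identical(colnames(cor_mat), names(batch)))
same_batch <- outer(as.character(batch), as.character(batch), "==")
ut <- upper.tri(cor_mat)
cat("Mean within-batch correlation: ", round(mean(cor_mat[ut & same_batch]), 3), "\n")
cat("Mean between-batch correlation:", round(mean(cor_mat[ut & !same_batch]), 3), "\n")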

8 6. ComBat batch correction

We use sva::ComBat() to adjust for known batches. We do not provide an explicit model matrix (mod = NULL) because there are no additional biological covariates here. If you have biological covariates (e.g., condition), include them in mod so ComBat does not remove that signal.
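
For illustration, if a biological grouping were available, mod could be built as in the sketch below; the condition factor here is made up purely for demonstration, and the ComBat call mirroring the one in the next chunk is left commented so it is not evaluated.

# Hypothetical sketch: 'condition' is an illustrative factor, not real phenotype data
condition <- factor(rep(c("tumor", "normal"), length.out = ncol(log2_mat)))
mod_bio <- model.matrix(~ condition)
head(mod_bio)
# combat_with_mod <- sva::ComBat(dat = log2_mat, batch = batch_vec, mod = mod_bio)  # not run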

# Prepare data for ComBat: ComBat expects genes x samples (matrix)
combat_input <- as.matrix(log2_mat)  # currently genes × samples

# Use ComBat (parametric empirical Bayes) to adjust for batch
batch_vec <- as.character(batch)
mod <- NULL  # no biological covariates in this example; modify if you have them

combat_corrected <- sva::ComBat(dat = combat_input, batch = batch_vec, mod = mod, par.prior = TRUE, prior.plots = FALSE)
## Found6batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# sanity check dims
dim(combat_corrected)
## [1] 54675   173

9 7. Visualize ComBat results: boxplots and correlogram

library(tidyr)
library(dplyr)

df_combat <- combat_corrected %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])

# Boxplots after ComBat
ggplot(df_combat, aes(x = sample, y = log2expr, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(title = "Log2(expr + 1) — after ComBat", x = "Sample", y = "log2(expr + 1)")

# Correlogram after ComBat
mat_top_combat <- combat_corrected[top_idx, , drop = FALSE]
cor_mat_combat <- cor(mat_top_combat, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(
  cor_mat_combat,
  method = "color",
  type = "upper",
  tl.pos = "n",          # disable text labels
  addCoef.col = "black",
  number.cex = 0.6
)

pheatmap::pheatmap(
  cor_mat_combat,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

off_diag_combat <- cor_mat_combat[upper.tri(cor_mat_combat)]
cat("ComBat-corrected correlation summary (top", topN, "genes):\n")
## ComBat-corrected correlation summary (top 1000 genes):
summary(off_diag_combat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.07934  0.38089  0.45003  0.44590  0.51794  0.97292

10 8. SVA: estimate surrogate variables and remove their effects

We will estimate surrogate variables with sva::svaseq() and then remove them via limma::removeBatchEffect() (which regresses out covariates / surrogate variables). In a typical analysis you would include known biological variables in the mod matrix so that svaseq finds variation orthogonal to your variables of interest.
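
Note that svaseq() was designed for sequencing counts; for log-scale microarray intensities, sva::sva() (same mod / mod0 interface) is the more conventional choice. A minimal sketch for reference, using the objects built in the chunk below (left commented so it is not evaluated here):

# Alternative sketch: surrogate variable analysis on log-scale data with sva::sva()
# sv_obj  <- sva::sva(dat = log2_mat, mod = mod, mod0 = mod0, n.sv = n.sv)
# sva_alt <- limma::removeBatchEffect(log2_mat, covariates = sv_obj$sv, design = mod)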

# Prepare data and design matrices for svaseq
# svaseq() expects a genes x samples matrix on a count-like scale; ideally you would
# supply raw counts (or follow the recommendations in the sva vignette).
# Here we use the merged, filtered expression matrix as a demonstration.
dat_for_sva <- as.matrix(expr_mat_filt)

# dat_for_sva <- combat_input  # alternative: the log2-transformed matrix (genes x samples)

# Create a placeholder treatment variable by splitting each batch in half
# (first half = "control", second half = "treatment"); replace with real
# phenotype labels if you have them. Assigning by index keeps the result
# aligned with the original sample order.
treatment <- character(length(batch))
for (idx in split(seq_along(batch), batch)) {
  half <- floor(length(idx) / 2)
  treatment[idx] <- rep(c("control", "treatment"), c(half, length(idx) - half))
}

# Known biological covariates
mod <- model.matrix(~treatment, data = data.frame(treatment = treatment))

mod0 <- model.matrix(~1, data = data.frame(batch = batch))  # null model (intercept-only)
# Estimate number of surrogate variables automatically (Leek method)
n.sv <- num.sv(dat_for_sva, mod, method = "leek")
cat("Estimated number of surrogate variables:", n.sv, "\n")
## Estimated number of surrogate variables: 4
# Run svaseq to estimate surrogate variables
svseq <- svaseq(dat = dat_for_sva, mod = mod, mod0 = mod0, n.sv = n.sv)
## Number of significant surrogate variables is:  4 
## Iteration (out of 5 ):1  2  3  4  5
str(svseq$sv)
##  num [1:173, 1:4] -0.0793 -0.0828 -0.0797 -0.0767 -0.0763 ...
# Remove surrogate variable effects using limma::removeBatchEffect (treat SVs as covariates).
# If you want to protect a biological effect of interest, also pass design = mod.
sv_mat <- svseq$sv
sva_corrected <- limma::removeBatchEffect(dat_for_sva, covariates = sv_mat)
## design matrix of interest not specified. Assuming a one-group experiment.
# Visualize boxplots after SVA correction
df_sva <- sva_corrected %>%
  as.data.frame() %>%
  tibble::rownames_to_column("probe_id") %>%
  pivot_longer(
    cols = -probe_id,
    names_to = "sample",
    values_to = "log2expr"
  ) %>%
  mutate(batch = batch[match(sample, names(batch))])

ggplot(df_sva, aes(x = sample, y = log2expr, fill = batch)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),   # remove sample labels
    axis.ticks.x = element_blank()   # remove ticks if you also want them gone
  ) +
  labs(title = sprintf("Expression — after SVA correction (n.sv=%d)", n.sv),
       x = "Sample", y = "expression")

# Correlogram after SVA
mat_top_sva <- sva_corrected[top_idx, , drop = FALSE]
cor_mat_sva <- cor(mat_top_sva, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(cor_mat_sva, method = "color", type = "upper", tl.cex = 0.7, tl.pos = "n")

pheatmap::pheatmap(
  cor_mat_sva,
  cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
  show_rownames = FALSE, show_colnames = FALSE, # no labels
  color = colorRampPalette(c("blue", "white", "red"))(100)
)

off_diag_sva <- cor_mat_sva[upper.tri(cor_mat_sva)]
cat("SVA-corrected correlation summary (top", topN, "genes):\n")
## SVA-corrected correlation summary (top 1000 genes):
summary(off_diag_sva)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.04122  0.32613  0.39813  0.39653  0.46703  0.99220

11 9. Summary table of correlation summaries

summary_df <- data.frame(
  stage = c("raw", "ComBat", "SVA"),
  mean_cor = c(mean(off_diag), mean(off_diag_combat), mean(off_diag_sva)),
  median_cor = c(median(off_diag), median(off_diag_combat), median(off_diag_sva)),
  min_cor = c(min(off_diag), min(off_diag_combat), min(off_diag_sva)),
  max_cor = c(max(off_diag), max(off_diag_combat), max(off_diag_sva))
)
summary_df
##    stage  mean_cor median_cor     min_cor   max_cor
## 1    raw 0.2268240  0.1431615 -0.17503699 0.9955896
## 2 ComBat 0.4458957  0.4500257 -0.07934069 0.9729193
## 3    SVA 0.3965255  0.3981312 -0.04121872 0.9921982
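
The same comparison can be shown graphically; a small sketch using the off-diagonal correlation vectors computed above:

# Distributions of sample-sample correlations at each stage
cor_long <- data.frame(
  stage = factor(rep(c("raw", "ComBat", "SVA"),
                     times = c(length(off_diag), length(off_diag_combat), length(off_diag_sva))),
                 levels = c("raw", "ComBat", "SVA")),
  correlation = c(off_diag, off_diag_combat, off_diag_sva)
)
ggplot(cor_long, aes(x = stage, y = correlation, fill = stage)) +
  geom_boxplot(outlier.size = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  theme_bw() +
  labs(title = "Off-diagonal sample-sample correlations by stage", x = NULL, y = "Pearson r")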

12 10. Notes & recommendations

Finally, we map gene symbols back onto the ComBat-corrected matrix, read the "Phenodata" sheet from each file, and write everything into a single Excel workbook (combat_corrected.xlsx).

library(readxl)
library(dplyr)
library(writexl)

# Folder with data
data_folder <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"

files <- c(
  "E-MTAB-1361_processed.xlsx",
  "GSE12102_processed.xlsx",
  "GSE13433_processed.xlsx",
  "GSE20196_processed.xlsx",
  "GSE32569_processed.xlsx",
  "GSE66533_processed.xlsx"
)

# Dataset names (same order)
dataset_names <- c("E-MTAB-1361", "GSE12102", "GSE13433", "GSE20196", "GSE32569", "GSE66533")

# 1. Add probe_id and gene_symbol columns back to combat_corrected.
# Map gene symbols by probe_id rather than relying on row order, and keep
# probe_id as a column so it survives the Excel export (write_xlsx drops rownames).
gene_map <- setNames(attr(datasets[[1]], "gene_symbol"), datasets[[1]]$probe_id)
combat_corrected_df <- as.data.frame(combat_corrected)
combat_corrected_df <- cbind(probe_id = rownames(combat_corrected_df),
                             combat_corrected_df,
                             gene_symbol = unname(gene_map[rownames(combat_corrected_df)]))

# Add to the list
datasets[["combat_corrected"]] <- combat_corrected_df

# 2. Read Phenodata sheets
phenodata_list <- lapply(files, function(f) {
  readxl::read_excel(file.path(data_folder, f), sheet = "Phenodata")
})

# Name phenodata list by dataset names
names(phenodata_list) <- dataset_names

# Append to datasets list
datasets <- c(datasets, phenodata_list)

# 3. Save full list into Excel
writexl::write_xlsx(datasets, path = file.path(data_folder, "combat_corrected.xlsx"))
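
A quick optional check that the workbook was written with the expected sheets:

# List sheet names in the saved workbook
readxl::excel_sheets(file.path(data_folder, "combat_corrected.xlsx"))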