This document:
"Expression") and merges them by
probe_id.# Install packages if you don't have them (uncomment)
# install.packages(c("readxl","tidyverse","matrixStats","corrplot","pheatmap","reshape2"))
# if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager")
# BiocManager::install(c("sva","limma"))
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(matrixStats)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
library(corrplot)
## corrplot 0.95 loaded
library(pheatmap)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
##
## rowSds, rowVars
## Loading required package: BiocParallel
library(limma)
Set path to your folder (update if needed):
base_dir <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"
files <- c("E-MTAB-1361_processed.xlsx",
"GSE12102_processed.xlsx",
"GSE13433_processed.xlsx",
"GSE20196_processed.xlsx",
"GSE32569_processed.xlsx",
"GSE66533_processed.xlsx")
files <- file.path(base_dir, files)
files
## [1] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx"
## [2] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx"
## [3] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx"
## [4] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx"
## [5] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx"
## [6] "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx"
Each file is expected to have a sheet named
"Expression", with the last two columns
probe_id and gene_symbol. We will:
probe_id to first column,read_expr <- function(path) {
# Read the "Expression" sheet
df <- readxl::read_excel(path, sheet = "Expression")
df <- as.data.frame(df) # ensure data.frame
# Expect last two cols to be probe_id and gene_symbol:
n <- ncol(df)
probe_col <- names(df)[n-1]
gene_symbol_col <- names(df)[n]
if (!all(c(probe_col, gene_symbol_col) %in% c("probe_id","gene_symbol"))) {
# if columns are named differently, attempt to rename them accordingly:
# We assume user said last two columns are probe_id and gene_symbol
names(df)[(n-1):n] <- c("probe_id","gene_symbol")
}
# Move probe_id to first column (for easier merges)
# Sample columns: everything except last two
sample_df <- df[, setdiff(names(df), c("probe_id","gene_symbol")), drop = FALSE]
# Convert sample columns to numeric (in case they are character)
sample_df <- as.data.frame(lapply(sample_df, function(x) as.numeric(as.character(x))))
# Add probe_id
out <- cbind(probe_id = df$probe_id, sample_df)
# Return: data.frame with probe_id and sample columns, and gene_symbol for later mapping
attr(out, "gene_symbol") <- df$gene_symbol
out
}
# Read all datasets into list
datasets <- list()
dataset_names <- basename(files) %>% tools::file_path_sans_ext()
for (i in seq_along(files)) {
message("Reading: ", files[i])
datasets[[ dataset_names[i] ]] <- read_expr(files[i])
}
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/E-MTAB-1361_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE12102_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE13433_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE20196_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE32569_processed.xlsx
## Reading: /Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO/GSE66533_processed.xlsx
# Build a merged table by probe_id (full join)
merged <- datasets[[1]]
for (i in 2:length(datasets)) {
merged <- full_join(merged, datasets[[i]], by = "probe_id")
}
# mergedf <- function(x, y) { merge(x, y, by = "probe_id") }
# merged <- Reduce(mergedf, datasets)
#
# merged <- Reduce(function(x, y) merge(x, y, by = "probe_id") , datasets)
# Inspect merged structure
cat("Merged dimensions (rows probes x cols samples+probe):", dim(merged), "\n")
## Merged dimensions (rows probes x cols samples+probe): 54675 174
head(merged[,1:6])
## probe_id X20100429_PF_01.CEL X20100429_PF_02.CEL X20100429_PF_03.CEL
## 1 1007_s_at 10.847742 10.424886 10.403364
## 2 1053_at 8.387543 7.620070 7.737997
## 3 117_at 5.404436 4.727790 5.454780
## 4 121_at 7.714166 7.606236 7.340357
## 5 1255_g_at 3.469701 3.448380 3.445810
## 6 1294_at 9.354246 9.292688 8.933648
## X20100429_PF_06.CEL X20100429_PF_07.CEL
## 1 10.703058 10.075131
## 2 7.453109 7.513165
## 3 6.596276 7.138303
## 4 9.471454 7.644979
## 5 3.526868 3.212224
## 6 8.708730 8.898202
We will:
probe_id as rownames,batch vector: sample columns originating from
same Excel file share batch label.# probe ids
probes <- merged$probe_id
# expression matrix: all columns except probe_id (there may be NA columns if different datasets had different sample names)
expr_df <- merged %>% select(-probe_id)
# Keep only numeric columns (some merges may produce non-numeric columns)
is_num <- sapply(expr_df, is.numeric)
expr_df <- expr_df[, is_num, drop = FALSE]
# Set colnames: may currently be duplicate or NA if two datasets used same sample names.
# We'll ensure uniqueness by prefixing with dataset name where necessary:
# Build vector of original sample names by scanning each dataset and recording their source
sample_names <- c()
sample_batches <- c()
for (nm in dataset_names) {
d <- datasets[[nm]]
# sample columns in d (excluding probe_id)
sn <- names(d)[names(d) != "probe_id"]
# prefix with dataset short name to avoid collisions
sn_pref <- paste0(nm, "_", sn)
sample_names <- c(sample_names, sn_pref)
sample_batches <- c(sample_batches, rep(nm, length(sn)))
}
# If sample_names length doesn't match expr_df columns, try to align by position:
if (length(sample_names) != ncol(expr_df)) {
# fallback: assign generic sample names
sample_names <- paste0("Sample", seq_len(ncol(expr_df)))
sample_batches <- rep("batch_unknown", length(sample_names))
warning("Could not derive sample names from datasets; using generic names. Check merging logic.")
}
colnames(expr_df) <- sample_names
rownames(expr_df) <- probes
# Convert to matrix
expr_mat <- as.matrix(expr_df)
mode(expr_mat) <- "numeric"
# Build batch factor matching columns
batch <- factor(sample_batches)
names(batch) <- colnames(expr_mat)
cat("Expression matrix dims:", dim(expr_mat), "\n")
## Expression matrix dims: 54675 173
table(batch)
## batch
## E-MTAB-1361_processed GSE12102_processed GSE13433_processed
## 16 37 16
## GSE20196_processed GSE32569_processed GSE66533_processed
## 34 12 58
Remove probes with all NA or zero, then log2(x + 1) transform for downstream visualization.
# Remove probes with all NA or zero across samples
valid <- rowSums(!is.na(expr_mat)) >= 1 & rowSums(expr_mat, na.rm = TRUE) > 0
expr_mat_filt <- expr_mat[valid, , drop = FALSE]
# Replace remaining NAs with small value (0) for visualization (but keep note)
expr_mat_filt[is.na(expr_mat_filt)] <- 0
# Log2 transform (+1)
log2_mat <- log2(expr_mat_filt + 1)
dim(log2_mat)
## [1] 54675 173
We melt the matrix and plot sample-wise boxplots, coloring boxes by batch.
library(tidyr)
library(dplyr)
df_long <- log2_mat %>%
as.data.frame() %>%
tibble::rownames_to_column("probe_id") %>%
pivot_longer(
cols = -probe_id,
names_to = "sample",
values_to = "log2expr"
) %>%
mutate(batch = batch[match(sample, names(batch))])
# attach batch info
df_long$batch <- batch[match(df_long$sample, names(batch))]
# order samples by batch for easier visual grouping
# sample_order <- names(batch)[order(batch)]
# df_long$sample <- factor(df_long$sample, levels = sample_order)
ggplot(df_long, aes(x = sample, y = log2expr, color = batch, fill = batch)) +
geom_boxplot(outlier.size = 0.3) +
scale_fill_brewer(palette = "Set2") +
theme_bw() +
theme(
axis.text.x = element_blank(), # remove sample labels
axis.ticks.x = element_blank() # remove ticks if you also want them gone
) +
labs(
title = "Log2(expression + 1) — raw merged data",
x = "Sample",
y = "log2(expr + 1)"
)
Compute per-gene variance, select top 1000 variable genes, compute sample–sample Pearson correlation, display correlogram and print summary stats (off-diagonal correlations).
# compute row variances
rv <- matrixStats::rowVars(log2_mat)
ord <- order(rv, decreasing = TRUE)
topN <- min(1000, sum(!is.na(rv)))
top_idx <- ord[1:topN]
mat_top <- log2_mat[top_idx, , drop = FALSE]
# sample-sample correlation
cor_mat <- cor(mat_top, use = "pairwise.complete.obs", method = "pearson")
# correlogram
corrplot::corrplot(cor_mat, method = "color", type = "upper",
tl.pos = 'n', addgrid.col = NA, outline = FALSE, add = FALSE,
tl.cex = 0.7, tl.col = "black", addCoef.col="black", number.cex=0.6)
pheatmap::pheatmap(
cor_mat,
cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
show_rownames = FALSE, show_colnames = FALSE, # no labels
color = colorRampPalette(c("blue", "white", "red"))(100)
)
# summary statistics of off-diagonal correlations
off_diag <- cor_mat[upper.tri(cor_mat)]
cat("Correlation summary (top", topN, "genes):\n")
## Correlation summary (top 1000 genes):
summary(off_diag)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.17504 0.06151 0.14316 0.22682 0.29663 0.99559
We will use sva::ComBat() to adjust for known batches.
We do not provide an explicit model matrix mod (i.e. mod =
NULL) because we have no additional biological covariates here. If you
have biological covariates (e.g., condition) you should include them in
mod.
# Prepare data for ComBat: ComBat expects genes x samples (matrix)
combat_input <- as.matrix(log2_mat) # currently genes Ă— samples
# Use ComBat (parametric empirical Bayes) to adjust for batch
batch_vec <- as.character(batch)
mod <- NULL # no biological covariates in this example; modify if you have them
combat_corrected <- sva::ComBat(dat = combat_input, batch = batch_vec, mod = mod, par.prior = TRUE, prior.plots = FALSE)
## Found6batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
# sanity check dims
dim(combat_corrected)
## [1] 54675 173
library(tidyr)
library(dplyr)
df_combat <- combat_corrected %>%
as.data.frame() %>%
tibble::rownames_to_column("probe_id") %>%
pivot_longer(
cols = -probe_id,
names_to = "sample",
values_to = "log2expr"
) %>%
mutate(batch = batch[match(sample, names(batch))])
# Boxplots after ComBat
ggplot(df_combat, aes(x = sample, y = log2expr, fill = batch)) +
geom_boxplot(outlier.size = 0.3) +
scale_fill_brewer(palette = "Set2") +
theme_bw() +
theme(
axis.text.x = element_blank(), # remove sample labels
axis.ticks.x = element_blank() # remove ticks if you also want them gone
) +
labs(title = "Log2(expr + 1) — after ComBat", x = "Sample", y = "log2(expr + 1)")
# Correlogram after ComBat
mat_top_combat <- combat_corrected[top_idx, , drop = FALSE]
cor_mat_combat <- cor(mat_top_combat, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(
cor_mat_combat,
method = "color",
type = "upper",
tl.pos = "n", # disable text labels
addCoef.col = "black",
number.cex = 0.6
)
pheatmap::pheatmap(
cor_mat_combat,
cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
show_rownames = FALSE, show_colnames = FALSE, # no labels
color = colorRampPalette(c("blue", "white", "red"))(100)
)
off_diag_combat <- cor_mat_combat[upper.tri(cor_mat_combat)]
cat("ComBat-corrected correlation summary (top", topN, "genes):\n")
## ComBat-corrected correlation summary (top 1000 genes):
summary(off_diag_combat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.07934 0.38089 0.45003 0.44590 0.51794 0.97292
We will estimate surrogate variables with sva::svaseq()
and then remove them via limma::removeBatchEffect() (which
regresses out covariates / surrogate variables). In a typical analysis
you would include known biological variables in the mod
matrix so that svaseq finds variation orthogonal to your variables of
interest.
# Prepare data and design matrices for svaseq
dat_for_sva <- as.matrix(expr_mat_filt) # use non-log counts ideally for svaseq; but svaseq can accept log-scale too in tutorial
# For svaseq, the function expects (genes x samples) on a scale where counts make sense.
# If you have raw counts, consider using voom/limma or the recommendation in sva vignette.
# Here we'll use the log2-transformed data as a demonstration.
# dat_for_sva <- combat_input # use the previously log2 matrix (genes x samples)
# Create a treatment variable by splitting each batch
treatment <- unlist(lapply(split(seq_along(batch), batch), function(idx) {
n <- length(idx)
half <- floor(n / 2)
# Assign "control" to the first half, "treatment" to the rest
group <- rep("control", n)
group[(half + 1):n] <- "treatment"
group
}))
# Known biological covariates
mod <- model.matrix(~treatment, data = data.frame(treatment = treatment))
mod0 <- model.matrix(~1, data = data.frame(batch = batch)) # null model (intercept-only)
# Estimate number of surrogate variables automatically (Leek method)
n.sv <- num.sv(dat_for_sva, mod, method = "leek")
cat("Estimated number of surrogate variables:", n.sv, "\n")
## Estimated number of surrogate variables: 4
# Run svaseq to estimate surrogate variables
svseq <- svaseq(dat = dat_for_sva, mod = mod, mod0 = mod0, n.sv = n.sv)
## Number of significant surrogate variables is: 4
## Iteration (out of 5 ):1 2 3 4 5
str(svseq$sv)
## num [1:173, 1:4] -0.0793 -0.0828 -0.0797 -0.0767 -0.0763 ...
# Remove surrogate variable effects using limma::removeBatchEffect (treat sv's as covariates)
sv_mat <- svseq$sv
sva_corrected <- limma::removeBatchEffect(dat_for_sva, covariates = sv_mat)
## design matrix of interest not specified. Assuming a one-group experiment.
# Visualize boxplots after SVA correction
df_sva <- sva_corrected %>%
as.data.frame() %>%
tibble::rownames_to_column("probe_id") %>%
pivot_longer(
cols = -probe_id,
names_to = "sample",
values_to = "log2expr"
) %>%
mutate(batch = batch[match(sample, names(batch))])
ggplot(df_sva, aes(x = sample, y = log2expr, fill = batch)) +
geom_boxplot(outlier.size = 0.3) +
scale_fill_brewer(palette = "Set2") +
theme_bw() +
theme(
axis.text.x = element_blank(), # remove sample labels
axis.ticks.x = element_blank() # remove ticks if you also want them gone
) +
labs(title = sprintf("Log2(expr + 1) — after SVA correction (n.sv=%d)", n.sv),
x = "Sample", y = "log2(expr + 1)")
# Correlogram after SVA
mat_top_sva <- sva_corrected[top_idx, , drop = FALSE]
cor_mat_sva <- cor(mat_top_sva, use = "pairwise.complete.obs", method = "pearson")
corrplot::corrplot(cor_mat_sva, method = "color", type = "upper", tl.cex = 0.7, tl.pos = "n")
pheatmap::pheatmap(
cor_mat_sva,
cluster_rows = FALSE, cluster_cols = FALSE, # no clustering
show_rownames = FALSE, show_colnames = FALSE, # no labels
color = colorRampPalette(c("blue", "white", "red"))(100)
)
off_diag_sva <- cor_mat_sva[upper.tri(cor_mat_sva)]
cat("SVA-corrected correlation summary (top", topN, "genes):\n")
## SVA-corrected correlation summary (top 1000 genes):
summary(off_diag_sva)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.04122 0.32613 0.39813 0.39653 0.46703 0.99220
summary_df <- data.frame(
stage = c("raw", "ComBat", "SVA"),
mean_cor = c(mean(off_diag), mean(off_diag_combat), mean(off_diag_sva)),
median_cor = c(median(off_diag), median(off_diag_combat), median(off_diag_sva)),
min_cor = c(min(off_diag), min(off_diag_combat), min(off_diag_sva)),
max_cor = c(max(off_diag), max(off_diag_combat), max(off_diag_sva))
)
summary_df
## stage mean_cor median_cor min_cor max_cor
## 1 raw 0.2268240 0.1431615 -0.17503699 0.9955896
## 2 ComBat 0.4458957 0.4500257 -0.07934069 0.9729193
## 3 SVA 0.3965255 0.3981312 -0.04121872 0.9921982
mod (so you
don’t remove biological signal).mod,
svaseq will try to find variation orthogonal to those
variables.edgeR/DESeq2 with appropriate normalization
(TMM, median ratios) and then use
svaseq/ComBat as needed on transformed
expression values (e.g., voom output or log-CPM).library(readxl)
library(dplyr)
library(writexl)
# Folder with data
data_folder <- "/Users/mdozmorov/Documents/Work/VCU_work/Tony/misc/2025-09-20.Tony_GEO"
files <- c(
"E-MTAB-1361_processed.xlsx",
"GSE12102_processed.xlsx",
"GSE13433_processed.xlsx",
"GSE20196_processed.xlsx",
"GSE32569_processed.xlsx",
"GSE66533_processed.xlsx"
)
# Dataset names (same order)
dataset_names <- c("E-MTAB-1361", "GSE12102", "GSE13433", "GSE20196", "GSE32569", "GSE66533")
# 1. Add gene_symbol column back to combat_corrected
gene_symbols <- attr(datasets[[1]], "gene_symbol")
combat_corrected_df <- as.data.frame(combat_corrected)
combat_corrected_df$gene_symbol <- gene_symbols
# Add to the list
datasets[["combat_corrected"]] <- combat_corrected_df
# 2. Read Phenodata sheets
phenodata_list <- lapply(files, function(f) {
readxl::read_excel(file.path(data_folder, f), sheet = "Phenodata")
})
# Name phenodata list by dataset names
names(phenodata_list) <- dataset_names
# Append to datasets list
datasets <- c(datasets, phenodata_list)
# 3. Save full list into Excel
writexl::write_xlsx(datasets, path = file.path(data_folder, "combat_corrected.xlsx"))