Load Xenium data
(Step 1)
outputFolders <- list.files(path = baseDir, pattern = "outs")
outputFolders
## [1] "Xenium_Prime__Human_Ovary_FF__outs"
# Summary
if (file.exists(file.path(outDir, "01_summary.RDS"))) {
summary <- readRDS(file.path(outDir, "01_summary.RDS"))
} else {
summaryL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
summary <- read.csv(file.path(baseDir, outputFolder, "metrics_summary.csv"))
return(summary)
})
summary <- Reduce(rbind, summaryL)
saveRDS(summary, file.path(outDir, "01_summary.RDS"))
}
# Cells
if (file.exists(file.path(outDir, "02_cells.RDS"))) {
cells <- readRDS(file.path(outDir, "02_cells.RDS"))
} else {
cellsL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
cells <- read_parquet(file.path(baseDir, outputFolder, "cells.parquet"), as_data_frame = TRUE)
cells$region <- sapply(str_split(outputFolder, "__"), "[[", 2)
return(cells)
})
cells <- Reduce(rbind, cellsL)
cells$region <- factor(cells$region, levels = regionNames)
saveRDS(cells, file.path(outDir, "02_cells.RDS"))
}
# Transcripts
if (file.exists(file.path(outDir, "03_txL.RDS"))) {
txL <- readRDS(file.path(outDir, "03_txL.RDS"))
} else {
txL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
tx <- read_parquet(file.path(baseDir, outputFolder, "transcripts.parquet"), as_data_frame = TRUE)
return(tx)
})
names(txL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
txL <- txL[regionNames]
saveRDS(txL, file.path(outDir, "03_txL.RDS"))
}
# Expression profile
if (file.exists(file.path(outDir, "04_exprL.RDS"))) {
exprL <- readRDS(file.path(outDir, "04_exprL.RDS"))
} else {
exprL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
mat <- TENxMatrix(file.path(baseDir, outputFolder, "cell_feature_matrix.h5"), "matrix")
return(mat)
})
names(exprL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
exprL <- exprL[regionNames]
saveRDS(exprL, file.path(outDir, "04_exprL.RDS"))
}
# Feature annotation
if (file.exists(file.path(outDir, "05_featuresAnnot.RDS"))) {
featuresAnnot <- readRDS(file.path(outDir, "05_featuresAnnot.RDS"))
} else {
featuresAnnot <- read.delim(file.path(baseDir, outputFolders[1], "cell_feature_matrix", "features.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)
saveRDS(featuresAnnot, file.path(outDir, "05_featuresAnnot.RDS"))
}
Exclude unassigned Tx
(Step 2)
- The following capture is an example case to show Tx that
are unassigned
if (file.exists(file.path(outDir, "06_txAssignedL.RDS"))) {
txAssignedL <- readRDS(file.path(outDir, "06_txAssignedL.RDS"))
} else {
txAssignedL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
tx <- read_parquet(file.path(baseDir, outputFolder, "out.parquet"), as_data_frame = TRUE)
return(tx)
})
# txAssignedL <- lapply(seq_along(txL), function(idx) {
# tx <- txL[[idx]]
# return(tx[which(tx$cell_id != "UNASSIGNED"), ])
# })
names(txAssignedL) <- regionNames
saveRDS(txAssignedL, file.path(outDir, "06_txAssignedL.RDS"))
}
Human_Ovary_FF |
2,698,510,033 |
2,419,097,382 |
279,412,651 |
0.1035 |
QC (Step 3)
Cell area and
detected Tx
- The following violin plots show the distribution of the cell area or
detected Tx across regions
- X-axis: sample
- Y-axis: area in micrometer square or total detected Tx
- Dot: cell
ggplot(data = cells, aes(x = region, y = log10(cell_area + 1), fill = region)) +
geom_violin(position = dodge, size = 0) +
geom_boxplot(width = 0.1, position = dodge, fill = "white") +
scale_fill_manual(values = regionCols) +
labs(
x = "",
y = "Cell area, log10"
) +
theme_bw() +
theme(
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
legend.position = "none",
text = element_text(size = 12)
)
ggplot(data = cells, aes(x = region, y = log10(transcript_counts + 1), fill = region)) +
geom_violin(position = dodge, size = 0) +
geom_boxplot(width = 0.1, position = dodge, fill = "white") +
scale_fill_manual(values = regionCols) +
labs(
x = "",
y = "Tx counts, log10"
) +
theme_bw() +
theme(
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
legend.position = "none",
text = element_text(size = 12)
)
Detect ouliters in terms of area
- Detect under segmentated (when more than one cells segmented
as one) cells using Grubb’s test
- A given area of the cell is an outlier when Grubb’s test p-value is
less than 0.05
- Note that a centroid diameter, or cell diameter, is preset in Xenium
for a specific tissue type
Human_Ovary_FF |
1,157,659 |
1,014 |
1,156,645 |
for (region in regionNames) {
subCells <- cells[which(cells$region == region), ]
subCells$cell_area <- round(subCells$cell_area, 3)
testRes <- grubbsTestRec(subCells$cell_area)
areaThreshold <- max(testRes$Area[!testRes$Outlier])
cat("#### ", region, "\n")
cat("\n")
plot(density(testRes$Area), sub="Cell area, µm2", main="")
abline(v = areaThreshold, col="red", lty=2)
cat("\n\n")
}
Human_Ovary_FF
Count and Features per cell
- Exclude cells with few (<20) Tx detected
- The following scatter plots show the relationship between nCount and
nFeature in linear (L) or log-scale (R)
- X-axis: nCount (total number of detected Tx within a cell,
depth)
- Y-axis: nFeature (number of detected Tx in each cell, coverage)
- Dot: cell
Human_Ovary_FF |
1,157,659 |
1,014 |
2,918 |
1,153,727 |
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
expr <- expr[panelGenesIdx, ]
cellDf <- data.frame(
nCount = apply(as.matrix(expr), 2, sum),
nFeature = apply(as.matrix(expr), 2, function(x) length(which(x > 0)))
)
cellDf <- densityColors(df = cellDf, cols = heatCols)
cellDf$criteria <- "Include"
cellDf$criteria[which(cellDf$nCount < nCountThre)] <- "Exclude"
cellDfInc <- cellDf[which(cellDf$criteria == "Include"), ]
cellDfExc <- cellDf[which(cellDf$criteria == "Exclude"), ]
cat("#### ", exprName, "\n")
cat("\n")
par(mfrow = c(2, 2))
plot(cellDf$nCount, cellDf$nFeature, col = cellDf$Col, xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, nFeatureMax))
plot(log10(cellDf$nCount + 1), log10(cellDf$nFeature + 1), col = cellDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax)), ylim = c(0, log10(nFeatureMax)))
plot(cellDfInc$nCount, cellDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, max(cellDfInc$nFeature)))
points(cellDfExc$nCount, cellDfExc$nFeature, col = "red", pch = 20)
legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
plot(log10(cellDfInc$nCount + 1), log10(cellDfInc$nFeature + 1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax + 1)), ylim = c(0, log10(nFeatureMax)))
points(log10(cellDfExc$nCount + 1), log10(cellDfExc$nFeature + 1), col = "red", pch = 20)
legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
cat("\n\n")
}
Human_Ovary_FF
Controls
Background signals
- Exclude a cell when a background signal (proportion) is
greater than 0.05
- Proportion = Number of negative probe detected / Number of (Tx + Neg
probe) detected
if (file.exists(file.path(outDir, "14_excludeCells.RDS"))) {
excludeCells <- readRDS(file.path(outDir, "14_excludeCells.RDS"))
} else {
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
# negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))
signalDf <- data.frame(
CellID = colnames(expr),
Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum)
# negCodewords = apply(expr[negCodewordsIdx,], 2, sum)
)
signalDf <- signalDf[which(signalDf$negProbes != 0), ]
signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)
buff <- data.frame(
Model = exprName,
Cells = signalDf$CellID[which(signalDf$Prop > negProbePropThre)],
Criteria = "High_background"
)
excludeCells <- rbind(excludeCells, buff)
}
excludeCells$Criteria <- factor(excludeCells$Criteria, levels=c("Large_cell_area", "Low_nCount", "High_background"))
saveRDS(excludeCells, file.path(outDir, "14_excludeCells.RDS"))
}
if (file.exists(file.path(outDir, "15_excDf.RDS"))) {
excDf <- readRDS(file.path(outDir, "15_excDf.RDS"))
} else {
buff1 <- excludeCells %>% group_by(Model, Criteria) %>% dplyr::count() %>% dplyr::rename(ExcludeCells = n) %>% dplyr::summarise(across(where(is.numeric), sum))
buff2 <- reshape2::dcast(buff1, formula = Model ~ Criteria)
excDf <- data.frame(
Model = buff2$Model,
Total_cells = as.numeric(summary[,"num_cells_detected"]),
Large_cell_area = as.numeric(buff2$Large_cell_area),
Low_nCount = as.numeric(buff2$Low_nCount),
High_background = as.numeric(buff2$High_background),
Remainders = as.numeric(summary[,"num_cells_detected"] - as.numeric(apply(buff2[,c(2:4)], 1, sum)))
)
rownames(excDf) <- excDf$Model
excDf <- excDf[regionNames,]
rownames(excDf) <- NULL
total <- as.numeric(apply(excDf[,c(2:6)], 2, sum))
totalDf <- data.frame(Model = "Total", Total_cells = total[1], Large_cell_area = total[2], Low_nCount = total[3], High_background = total[4], Remainders = total[5])
excDf <- rbind(excDf, totalDf)
saveRDS(excDf, file.path(outDir, "15_excDf.RDS"))
}
knitr::kable(excDf)
Human_Ovary_FF |
1,157,659 |
1,014 |
2,918 |
2 |
1,153,725 |
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
signalDf <- data.frame(
CellID = colnames(expr),
Genes = apply(as.matrix(expr[panelGenesIdx, ]), 2, sum),
negProbes = apply(as.matrix(expr[negProbesIdx, ]), 2, sum)
)
signalDf <- signalDf[which(signalDf$negProbes != 0), ]
signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)
cat("##### ", exprName, "\n")
cat("\n")
hist(signalDf$Prop, breaks = seq(0, max(signalDf$Prop) + 0.01, 0.01), xlab = "# Neg probes / Total detected molecules", sub = paste0(nrow(signalDf), " cells that capture at least one Neg Probe"), main = exprName)
abline(v = negProbePropThre, col = "red", lty = 2)
cat("\n\n")
}
Human_Ovary_FF
Positive probes
- Does not apply any filters but check the distribution of the
expression levels of the positive controls
- Very few cells detected positive controls
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
posProbesIdx <- which(str_detect(rownames(expr), "^Intergenic_Region"))
# negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))
signalDf <- data.frame(
CellID = colnames(expr),
# Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum),
posProbes = apply(as.matrix(expr)[posProbesIdx,], 2, sum)
)
cat("##### ", exprName, "\n")
cat("\n")
hist(signalDf$posProbes, xlab="Number of the positive controls detected", ylab="Number of cells", sub = paste0(length(which(signalDf$posProbes > 0)), " cells that capture at least one Pos Probe"), breaks = seq(0, max(signalDf$posProbes), 0.2), main = exprName)
cat("\n\n")
}
Human_Ovary_FF
Phred-like quality
score distribution
- 10X Genomics recommend to include Tx quality that is greater than 20
and the threshold in this study is 20
- 20 (99%, an error rate of 1 in 100)
- 30 (99.9%, an error rate of 1 in 1000)
- 40 (99.99%, an error rate of 1 in 10000)
- Low qual Tx are likely the negative controls
if (file.exists(file.path(outDir, "16_qvDf.RDS"))) {
qvDf <- readRDS(file.path(outDir, "16_qvDf.RDS"))
} else {
qvDf <- c()
for (idx in seq_along(regionOrder)) {
txName <- names(txAssignedL)[idx]
txAssigned <- txAssignedL[[idx]]
buff <- data.frame(cut(txAssigned$qv, breaks = c(0, 20, 30, 40)) %>% table)
buff$Sample <- txName
qvDf <- rbind(qvDf, buff)
}
colnames(qvDf) <- c("Quality_score", "Freq", "Sample")
saveRDS(qvDf, file.path(outDir, "16_qvDf.RDS"))
}
knitr::kable(reshape2::dcast(qvDf, Sample ~ Quality_score, value.var = c("Freq")))
Human_Ovary_FF |
490,810,495 |
788,861,443 |
1,139,425,444 |
QC Summary
- Number of cells after QC for the downstream
analysis
- Analysis ready expression profiles (before and after QC)
if (file.exists(file.path(outDir, "17_exprL.RDS"))) {
exprL <- readRDS(file.path(outDir, "17_exprL.RDS"))
} else {
exprL <- lapply(seq_along(exprL), function(idx) {
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
expr <- expr[panelGenesIdx, ]
return(expr)
})
names(exprL) <- regionOrder
saveRDS(exprL, file.path(outDir, "17_exprL.RDS"))
}
if (file.exists(file.path(outDir, "18_analysisReady_expression.RDS"))){
arExprL <- readRDS(file.path(outDir, "18_analysisReady_expression.RDS"))
} else {
geneAnnot <- featuresAnnot$V2 # gene symbol
names(geneAnnot) <- featuresAnnot$V1 # ensembl
arExprL <- lapply(seq_along(exprL), function(idx) { # analysis-ready expression profiles
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
exclude <- excludeCells[which(excludeCells$Sample == exprName),]
if (any(colnames(expr) %in% exclude$Cells)) {
expr <- expr[, -which(colnames(expr) %in% exclude$Cells)]
}
rownames(expr) <- geneAnnot[rownames(expr)]
return(expr)
})
names(arExprL) <- regionOrder
saveRDS(arExprL, file.path(outDir, "18_analysisReady_expression.RDS"))
}
Human_Ovary_FF |
1,157,659 |
3,934 |
0.0034 |
1,153,725 |
- QC parameters used in this report
nCount |
20 |
Minimum number of detected Tx within a cell |
negProbe proportion |
0.05 |
Maximum proportion of the background signal |
quality score |
20 |
Minimum Tx Phred-like quality score |
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-conda-linux-gnu
## Running under: Rocky Linux 8.8 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /home/heewon.seo/.local/miniconda3/envs/xenium/lib/libopenblasp-r0.3.27.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Edmonton
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] outliers_0.15 ggridges_0.5.6 ggplot2_3.5.1
## [4] stringr_1.5.1 dplyr_1.1.4 HDF5Array_1.32.1
## [7] rhdf5_2.48.0 DelayedArray_0.30.1 SparseArray_1.4.8
## [10] S4Arrays_1.4.1 abind_1.4-8 IRanges_2.38.1
## [13] S4Vectors_0.42.1 MatrixGenerics_1.16.0 matrixStats_1.4.1
## [16] BiocGenerics_0.50.0 Matrix_1.6-5 arrow_17.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.5 xfun_0.48 bslib_0.8.0
## [4] lattice_0.22-6 rhdf5filters_1.16.0 vctrs_0.6.5
## [7] tools_4.4.1 generics_0.1.3 tibble_3.2.1
## [10] fansi_1.0.6 highr_0.11 pkgconfig_2.0.3
## [13] KernSmooth_2.23-24 assertthat_0.2.1 lifecycle_1.0.4
## [16] compiler_4.4.1 farver_2.1.2 munsell_0.5.1
## [19] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.10
## [22] pillar_1.9.0 crayon_1.5.3 jquerylib_0.1.4
## [25] cachem_1.1.0 tidyselect_1.2.1 digest_0.6.37
## [28] stringi_1.8.4 reshape2_1.4.4 purrr_1.0.2
## [31] labeling_0.4.3 fastmap_1.2.0 grid_4.4.1
## [34] colorspace_2.1-1 cli_3.6.3 magrittr_2.0.3
## [37] utf8_1.2.4 withr_3.0.1 scales_1.3.0
## [40] bit64_4.5.2 rmarkdown_2.28 XVector_0.44.0
## [43] bit_4.5.0 evaluate_1.0.1 knitr_1.48
## [46] rlang_1.1.4 Rcpp_1.0.13 glue_1.8.0
## [49] jsonlite_1.8.9 R6_2.5.1 Rhdf5lib_1.26.0
## [52] plyr_1.8.9 zlibbioc_1.50.0