Load Xenium data
(Step 1)
outputFolders <- list.files(path = baseDir, pattern = "outs")
outputFolders
## [1] "Xenium_Prime__Mouse_Brain_Coronal_FF__outs"
# Summary
summaryL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
summary <- read.csv(file.path(baseDir, outputFolder, "metrics_summary.csv"))
return(summary)
})
summary <- Reduce(rbind, summaryL)
# Cells
cellsL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
cells <- read_parquet(file.path(baseDir, outputFolder, "cells.parquet"), as_data_frame = TRUE)
cells$region <- sapply(str_split(outputFolder, "__"), "[[", 2)
return(cells)
})
cells <- Reduce(rbind, cellsL)
cells$region <- factor(cells$region, levels = regionNames)
# Transcripts
txL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
tx <- read_parquet(file.path(baseDir, outputFolder, "transcripts.parquet"), as_data_frame = TRUE)
return(tx)
})
names(txL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
txL <- txL[regionNames]
# Expression profile
exprL <- lapply(seq_along(outputFolders), function(idx) {
outputFolder <- outputFolders[idx]
mat <- TENxMatrix(file.path(baseDir, outputFolder, "cell_feature_matrix.h5"), "matrix")
return(mat)
})
names(exprL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
exprL <- exprL[regionNames]
# Feature annotation
featuresAnnot <- read.delim(file.path(baseDir, outputFolders[1], "cell_feature_matrix", "features.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)
QC (Step 3)
Cell area and
detected Tx
- The following violin plots show the distribution of the cell area or
detected Tx across regions
- X-axis: sample
- Y-axis: area in micrometer square or total detected Tx
- Dot: cell
ggplot(data = cells, aes(x = region, y = log10(cell_area + 1), fill = region)) +
geom_violin(position = dodge, size = 0) +
geom_boxplot(width = 0.1, position = dodge, fill = "white") +
scale_fill_manual(values = regionCols) +
labs(
x = "",
y = "Cell area, log10"
) +
theme_bw() +
theme(
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
legend.position = "none",
text = element_text(size = 12)
)
ggplot(data = cells, aes(x = region, y = log10(transcript_counts + 1), fill = region)) +
geom_violin(position = dodge, size = 0) +
geom_boxplot(width = 0.1, position = dodge, fill = "white") +
scale_fill_manual(values = regionCols) +
labs(
x = "",
y = "Tx counts, log10"
) +
theme_bw() +
theme(
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
legend.position = "none",
text = element_text(size = 12)
)
Detect ouliters in terms of area
- Detect under segmentated (when more than one cells segmented
as one) cells using Grubb’s test
- A given area of the cell is an outlier when Grubb’s test p-value is
less than 0.05
- Note that a centroid diameter, or cell diameter, is preset in Xenium
for a specific tissue type
Mouse_Brain_Coronal_FF |
63,173 |
78 |
63,095 |
for (region in regionNames) {
subCells <- cells[which(cells$region == region), ]
subCells$cell_area <- round(subCells$cell_area, 3)
testRes <- grubbsTestRec(subCells$cell_area)
areaThreshold <- max(testRes$Area[!testRes$Outlier])
cat("#### ", region, "\n")
cat("\n")
plot(density(testRes$Area), sub="Cell area, µm2", main="")
abline(v = areaThreshold, col="red", lty=2)
cat("\n\n")
}
Mouse_Brain_Coronal_FF
Detected Tx across cells
- Number of detected Tx across cells at the FOV
level
- In other words, some Tx detected only in one cell, while others
detected in many cells
- The following ridge plot shows the distribution of the detected Tx
in each FOV
- X-axis: number of cells that captures a given Tx
- Y-axis: density
txDfL <- lapply(seq_along(txAssignedL), function(idx) {
txName <- names(txAssignedL)[idx]
txAssigned <- txAssignedL[[idx]]
txDf <- as.data.frame(txAssigned %>% group_by(feature_name, fov_name) %>% dplyr::count() %>% dplyr::rename(cells = n))
txDf$model <- txName
return(txDf)
})
txDf <- Reduce(rbind, txDfL)
head(txDf)
## feature_name fov_name cells model
## 1 A1cf A10 5 Mouse_Brain_Coronal_FF
## 2 A1cf A11 5 Mouse_Brain_Coronal_FF
## 3 A1cf A12 13 Mouse_Brain_Coronal_FF
## 4 A1cf A13 7 Mouse_Brain_Coronal_FF
## 5 A1cf A14 6 Mouse_Brain_Coronal_FF
## 6 A1cf A15 17 Mouse_Brain_Coronal_FF
for (region in regionNames) {
cat("#### ", region, "\n")
cat("\n")
regDf <- txDf[which(txDf$model == region), ]
rp <- ridgePlot(df = regDf, x = "cells", y = "fov_name", title = region, xLbl = "nCells", yLbl = "FOV", scaleTrans = "log10")
print(rp)
cat("\n\n")
}
Mouse_Brain_Coronal_FF
Count and Features per cell
- Exclude cells with few (<20) Tx detected
- The following scatter plots show the relationship between nCount and
nFeature in linear (L) or log-scale (R)
- X-axis: nCount (total number of detected Tx within a cell,
depth)
- Y-axis: nFeature (number of detected Tx in each cell, coverage)
- Dot: cell
Mouse_Brain_Coronal_FF |
63,173 |
78 |
38 |
63,057 |
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
expr <- expr[panelGenesIdx, ]
cellDf <- data.frame(
nCount = apply(as.matrix(expr), 2, sum),
nFeature = apply(as.matrix(expr), 2, function(x) length(which(x > 0)))
)
cellDf <- densityColors(df = cellDf, cols = heatCols)
cellDf$criteria <- "Include"
cellDf$criteria[which(cellDf$nCount < nCountThre)] <- "Exclude"
cellDfInc <- cellDf[which(cellDf$criteria == "Include"), ]
cellDfExc <- cellDf[which(cellDf$criteria == "Exclude"), ]
cat("#### ", exprName, "\n")
cat("\n")
par(mfrow = c(2, 2))
plot(cellDf$nCount, cellDf$nFeature, col = cellDf$Col, xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, nFeatureMax))
plot(log10(cellDf$nCount + 1), log10(cellDf$nFeature + 1), col = cellDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax)), ylim = c(0, log10(nFeatureMax)))
plot(cellDfInc$nCount, cellDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, max(cellDfInc$nFeature)))
points(cellDfExc$nCount, cellDfExc$nFeature, col = "red", pch = 20)
legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
plot(log10(cellDfInc$nCount + 1), log10(cellDfInc$nFeature + 1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax + 1)), ylim = c(0, log10(nFeatureMax)))
points(log10(cellDfExc$nCount + 1), log10(cellDfExc$nFeature + 1), col = "red", pch = 20)
legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
cat("\n\n")
}
Mouse_Brain_Coronal_FF
Controls
Background signals
- Exclude a cell when a background signal (proportion) is
greater than 0.05
- Proportion = Number of negative probe detected / Number of (Tx + Neg
probe) detected
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
# negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))
signalDf <- data.frame(
CellID = colnames(expr),
Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum)
# negCodewords = apply(expr[negCodewordsIdx,], 2, sum)
)
signalDf <- signalDf[which(signalDf$negProbes != 0), ]
signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)
buff <- data.frame(
Model = exprName,
Cells = signalDf$CellID[which(signalDf$Prop > negProbePropThre)],
Criteria = "High_background"
)
excludeCells <- rbind(excludeCells, buff)
}
excludeCells$Criteria <- factor(excludeCells$Criteria, levels=c("Large_cell_area", "Low_nCount", "High_background"))
buff1 <- excludeCells %>% group_by(Model, Criteria) %>% dplyr::count() %>% dplyr::rename(ExcludeCells = n) %>% dplyr::summarise(across(where(is.numeric), sum))
buff2 <- reshape2::dcast(buff1, formula = Model ~ Criteria)
excDf <- data.frame(
Model = buff2$Model,
Total_cells = as.numeric(summary[,"num_cells_detected"]),
Large_cell_area = as.numeric(buff2$Large_cell_area),
Low_nCount = as.numeric(buff2$Low_nCount),
High_background = as.numeric(buff2$High_background),
Remainders = as.numeric(summary[,"num_cells_detected"] - as.numeric(apply(buff2[,c(2:4)], 1, sum)))
)
rownames(excDf) <- excDf$Model
excDf <- excDf[regionNames,]
rownames(excDf) <- NULL
total <- as.numeric(apply(excDf[,c(2:6)], 2, sum))
totalDf <- data.frame(Model = "Total", Total_cells = total[1], Large_cell_area = total[2], Low_nCount = total[3], High_background = total[4], Remainders = total[5])
excDf <- rbind(excDf, totalDf)
knitr::kable(excDf)
Mouse_Brain_Coronal_FF |
63,173 |
78 |
38 |
1 |
63,056 |
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
signalDf <- data.frame(
CellID = colnames(expr),
Genes = apply(as.matrix(expr[panelGenesIdx, ]), 2, sum),
negProbes = apply(as.matrix(expr[negProbesIdx, ]), 2, sum)
)
signalDf <- signalDf[which(signalDf$negProbes != 0), ]
signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)
cat("##### ", exprName, "\n")
cat("\n")
hist(signalDf$Prop, breaks = seq(0, max(signalDf$Prop) + 0.01, 0.01), xlab = "# Neg probes / Total detected molecules", sub = paste0(nrow(signalDf), " cells that capture at least one Neg Probe"), main = exprName)
abline(v = negProbePropThre, col = "red", lty = 2)
cat("\n\n")
}
Mouse_Brain_Coronal_FF
Positive probes
- Does not apply any filters but check the distribution of the
expression levels of the positive controls
- Very few cells detected positive controls
for (idx in seq_along(regionNames)) {
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
posProbesIdx <- which(str_detect(rownames(expr), "^Intergenic_Region"))
# negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))
signalDf <- data.frame(
CellID = colnames(expr),
# Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum),
posProbes = apply(as.matrix(expr)[posProbesIdx,], 2, sum)
)
cat("##### ", exprName, "\n")
cat("\n")
hist(signalDf$posProbes, xlab="Number of the positive controls detected", ylab="Number of cells", sub = paste0(length(which(signalDf$posProbes > 0)), " cells that capture at least one Pos Probe"), breaks = seq(0, max(signalDf$posProbes), 0.2), main = exprName)
cat("\n\n")
}
Mouse_Brain_Coronal_FF
Phred-like quality
score distribution
- 10X Genomics recommend to include Tx quality that is greater than 20
and the threshold in this study is 20
- 20 (99%, an error rate of 1 in 100)
- 30 (99.9%, an error rate of 1 in 1000)
- 40 (99.99%, an error rate of 1 in 10000)
- Low qual Tx are likely the negative controls
qvDf <- c()
for (idx in seq_along(regionOrder)) {
txName <- names(txAssignedL)[idx]
txAssigned <- txAssignedL[[idx]]
buff <- data.frame(cut(txAssigned$qv, breaks = c(0, 20, 30, 40)) %>% table)
buff$Sample <- txName
qvDf <- rbind(qvDf, buff)
}
colnames(qvDf) <- c("Quality_score", "Freq", "Sample")
knitr::kable(reshape2::dcast(qvDf, Sample ~ Quality_score, value.var = c("Freq")))
Mouse_Brain_Coronal_FF |
31,480,964 |
39,412,607 |
46,892,687 |
QC Summary
- Number of cells after QC for the downstream
analysis
- Analysis ready expression profiles (before and after QC)
exprL <- lapply(seq_along(exprL), function(idx) {
expr <- exprL[[idx]]
panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
expr <- expr[panelGenesIdx, ]
return(expr)
})
names(exprL) <- regionOrder
geneAnnot <- featuresAnnot$V2 # gene symbol
names(geneAnnot) <- featuresAnnot$V1 # ensembl
arExprL <- lapply(seq_along(exprL), function(idx) { # analysis-ready expression profiles
exprName <- names(exprL)[idx]
expr <- exprL[[idx]]
exclude <- excludeCells[which(excludeCells$Sample == exprName),]
if (any(colnames(expr) %in% exclude$Cells)) {
expr <- expr[, -which(colnames(expr) %in% exclude$Cells)]
}
rownames(expr) <- geneAnnot[rownames(expr)]
return(expr)
})
names(arExprL) <- regionOrder
saveRDS(arExprL, file.path(outDir, "01_analysisReady_expression.RDS"))
Mouse_Brain_Coronal_FF |
63,173 |
117 |
0.003 |
63,056 |
- QC parameters used in this report
nCount |
20 |
Minimum number of detected Tx within a cell |
negProbe proportion |
0.05 |
Maximum proportion of the background signal |
quality score |
20 |
Minimum Tx Phred-like quality score |
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## time zone: America/Edmonton
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] outliers_0.15 ggridges_0.5.6 ggplot2_3.5.1
## [4] stringr_1.5.1 dplyr_1.1.4 HDF5Array_1.32.1
## [7] rhdf5_2.48.0 DelayedArray_0.30.1 SparseArray_1.4.8
## [10] S4Arrays_1.4.1 abind_1.4-8 IRanges_2.38.1
## [13] S4Vectors_0.42.1 MatrixGenerics_1.16.0 matrixStats_1.4.1
## [16] BiocGenerics_0.50.0 Matrix_1.7-0 arrow_17.0.0.1
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.5 xfun_0.47 bslib_0.8.0
## [4] lattice_0.22-6 rhdf5filters_1.16.0 vctrs_0.6.5
## [7] tools_4.4.1 generics_0.1.3 tibble_3.2.1
## [10] fansi_1.0.6 highr_0.11 pkgconfig_2.0.3
## [13] KernSmooth_2.23-24 assertthat_0.2.1 lifecycle_1.0.4
## [16] compiler_4.4.1 farver_2.1.2 munsell_0.5.1
## [19] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.10
## [22] pillar_1.9.0 crayon_1.5.3 jquerylib_0.1.4
## [25] cachem_1.1.0 tidyselect_1.2.1 digest_0.6.37
## [28] stringi_1.8.4 reshape2_1.4.4 purrr_1.0.2
## [31] labeling_0.4.3 fastmap_1.2.0 grid_4.4.1
## [34] colorspace_2.1-1 cli_3.6.3 magrittr_2.0.3
## [37] utf8_1.2.4 withr_3.0.1 scales_1.3.0
## [40] bit64_4.0.5 rmarkdown_2.28 XVector_0.44.0
## [43] bit_4.0.5 evaluate_1.0.0 knitr_1.48
## [46] rlang_1.1.4 Rcpp_1.0.13 glue_1.8.0
## [49] jsonlite_1.8.9 R6_2.5.1 Rhdf5lib_1.26.0
## [52] plyr_1.8.9 zlibbioc_1.50.0