1 Introduction

ASOC generated this report includes stats and diagnostic plots of the four Xenium In Situ data with the 5k panel from 10X Genomics.

2 Prerequisite

2.1 Environment

  • OS: macOS Sequoia 15.0.1
  • Platform: aarch64-apple-darwin20 (64-bit)
  • Software: R (v4.4.1)
  • Pakcages: outliers (v0.15), ggplot2 (v3.5.1), ggridges (v0.5.6), stringr (v1.5.1), dplyr (v1.1.4), Matrix (v1.7-0), yaml (v2.3.10)

2.2 Xenium In Situ data

WORKING_DIRECTORY
└── Xenium_Prime_SAMPLE_outs
    ├── cell_feature_matrix
    │   └── features.tsv.gz
    ├── cell_feature_matrix.h5
    ├── cells.parquet
    ├── metrics_summary.csv
    └── transcripts.parquet

3 Preprocessing/QC

3.2 Preparation

  • Set working directory to load files
# Do not run
baseDir <- "WORKING_DIRECTORY" # where the output-* folders are located
setwd(baseDir)
outDir <- file.path(baseDir, "Results")
library(arrow)
library(HDF5Array)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggridges)
library(outliers)

3.3 Load Xenium data (Step 1)

outputFolders <- list.files(path = baseDir, pattern = "outs")
outputFolders
## [1] "Xenium_Prime__Breast_Cancer_FFPE__outs"  
## [2] "Xenium_Prime__Cervical_Cancer_FFPE__outs"
## [3] "Xenium_Prime__Ovarian_Cancer_FFPE__outs"
# Summary
summaryL <- lapply(seq_along(outputFolders), function(idx) {
        outputFolder <- outputFolders[idx]
        summary <- read.csv(file.path(baseDir, outputFolder, "metrics_summary.csv"))
        return(summary)
})
summary <- Reduce(rbind, summaryL)

# Cells
cellsL <- lapply(seq_along(outputFolders), function(idx) {
        outputFolder <- outputFolders[idx]
        cells <- read_parquet(file.path(baseDir, outputFolder, "cells.parquet"), as_data_frame = TRUE)
        cells$region <- sapply(str_split(outputFolder, "__"), "[[", 2)
        return(cells)
})
cells <- Reduce(rbind, cellsL)
cells$region <- factor(cells$region, levels = regionNames)

# Transcripts
txL <- lapply(seq_along(outputFolders), function(idx) {
        outputFolder <- outputFolders[idx]
        tx <- read_parquet(file.path(baseDir, outputFolder, "transcripts.parquet"), as_data_frame = TRUE)
        return(tx)
})
names(txL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
txL <- txL[regionNames]

# Expression profile
exprL <- lapply(seq_along(outputFolders), function(idx) {
        outputFolder <- outputFolders[idx]
        mat <- TENxMatrix(file.path(baseDir, outputFolder, "cell_feature_matrix.h5"), "matrix")
        return(mat)
})
names(exprL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
exprL <- exprL[regionNames]

# Feature annotation
featuresAnnot <- read.delim(file.path(baseDir, outputFolders[1], "cell_feature_matrix", "features.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)

3.4 Stats at a glance

Region Region_area Cell_area Cells_detected High_qual_Tx Tx_per_100um2 FOVs
Cervical_Cancer_FFPE 155,159,781 45,536,939 699,110 80,158,637 157.17 261
Ovarian_Cancer_FFPE 113,854,997 53,651,502 840,387 89,640,525 144.93 199
Breast_Cancer_FFPE 86,812,984 27,984,815 407,124 120,455,566 380.50 356

3.5 Exclude unassigned Tx (Step 2)

  • The following capture is an example case to show Tx that are unassigned
txAssignedL <- lapply(seq_along(txL), function(idx) {
        tx <- txL[[idx]]
        return(tx[which(tx$cell_id != "UNASSIGNED"), ])
})
names(txAssignedL) <- regionNames
Model Total AssignedTx UnAssignedTx Proportion
Cervical_Cancer_FFPE 113,697,508 99,798,199 13,899,309 0.122
Ovarian_Cancer_FFPE 147,706,750 131,542,476 16,164,274 0.109
Breast_Cancer_FFPE 109,411,890 96,572,865 12,839,025 0.117
Total 370,816,148 327,913,540 42,902,608 0.116

3.6 QC (Step 3)

3.6.1 Cell area and detected Tx

  • The following violin plots show the distribution of the cell area or detected Tx across regions
    • X-axis: sample
    • Y-axis: area in micrometer square or total detected Tx
    • Dot: cell
ggplot(data = cells, aes(x = region, y = log10(cell_area + 1), fill = region)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill = "white") +
        scale_fill_manual(values = regionCols) +
        labs(
                x = "",
                y = "Cell area, log10"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none",
                text = element_text(size = 12)
        )

ggplot(data = cells, aes(x = region, y = log10(transcript_counts + 1), fill = region)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill = "white") +
        scale_fill_manual(values = regionCols) +
        labs(
                x = "",
                y = "Tx counts, log10"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none",
                text = element_text(size = 12)
        )

3.6.2 Detect ouliters in terms of area

  • Detect under segmentated (when more than one cells segmented as one) cells using Grubb’s test
  • A given area of the cell is an outlier when Grubb’s test p-value is less than 0.05
  • Note that a centroid diameter, or cell diameter, is preset in Xenium for a specific tissue type
Model Total_cells Excluded Remainders
Cervical_Cancer_FFPE 840,387 2,955 837,432
Ovarian_Cancer_FFPE 407,124 1,036 406,088
Breast_Cancer_FFPE 699,110 2,401 696,709
Total 1,946,621 6,392 1,940,229
for (region in regionNames) {
        subCells <- cells[which(cells$region == region), ]
        subCells$cell_area <- round(subCells$cell_area, 3)
        testRes <- grubbsTestRec(subCells$cell_area)
        areaThreshold <- max(testRes$Area[!testRes$Outlier])
        
        cat("#### ", region, "\n")
        cat("\n")
        plot(density(testRes$Area), sub="Cell area, µm2", main="")
        abline(v = areaThreshold, col="red", lty=2)
        cat("\n\n")
}

3.6.2.1 Cervical_Cancer_FFPE

3.6.2.2 Ovarian_Cancer_FFPE

3.6.2.3 Breast_Cancer_FFPE

3.6.3 Detected Tx across cells

  • Number of detected Tx across cells at the FOV level
  • In other words, some Tx detected only in one cell, while others detected in many cells
  • The following ridge plot shows the distribution of the detected Tx in each FOV
    • X-axis: number of cells that captures a given Tx
    • Y-axis: density
txDfL <- lapply(seq_along(txAssignedL), function(idx) {
        txName <- names(txAssignedL)[idx]
        txAssigned <- txAssignedL[[idx]]
        txDf <- as.data.frame(txAssigned %>% group_by(feature_name, fov_name) %>% dplyr::count() %>% dplyr::rename(cells = n))
        txDf$model <- txName
        return(txDf)
})
txDf <- Reduce(rbind, txDfL)
head(txDf)
##   feature_name fov_name cells                model
## 1        A2ML1      B10     3 Cervical_Cancer_FFPE
## 2        A2ML1      B11     2 Cervical_Cancer_FFPE
## 3        A2ML1      B12     1 Cervical_Cancer_FFPE
## 4        A2ML1      B13     5 Cervical_Cancer_FFPE
## 5        A2ML1      B14     1 Cervical_Cancer_FFPE
## 6        A2ML1      B16     2 Cervical_Cancer_FFPE
for (region in regionNames) {
        cat("#### ", region, "\n")
        cat("\n")
        regDf <- txDf[which(txDf$model == region), ]
        rp <- ridgePlot(df = regDf, x = "cells", y = "fov_name", title = region, xLbl = "nCells", yLbl = "FOV", scaleTrans = "log10")
        print(rp)
        cat("\n\n")
}

3.6.3.1 Cervical_Cancer_FFPE

3.6.3.2 Ovarian_Cancer_FFPE

3.6.3.3 Breast_Cancer_FFPE

3.6.4 Count and Features per cell

  • Exclude cells with few (<20) Tx detected
  • The following scatter plots show the relationship between nCount and nFeature in linear (L) or log-scale (R)
    • X-axis: nCount (total number of detected Tx within a cell, depth)
    • Y-axis: nFeature (number of detected Tx in each cell, coverage)
    • Dot: cell
Model Total_cells Large_cell_area Low_nCount Remainders
Cervical_Cancer_FFPE 840,387 2,955 131,499 705,933
Ovarian_Cancer_FFPE 407,124 1,036 22,855 383,233
Breast_Cancer_FFPE 699,110 2,401 194,607 502,102
Total 1,946,621 6,392 348,961 1,591,268
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]
        panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
        expr <- expr[panelGenesIdx, ]

        cellDf <- data.frame(
                nCount = apply(as.matrix(expr), 2, sum),
                nFeature = apply(as.matrix(expr), 2, function(x) length(which(x > 0)))
        )
        cellDf <- densityColors(df = cellDf, cols = heatCols)

        cellDf$criteria <- "Include"
        cellDf$criteria[which(cellDf$nCount < nCountThre)] <- "Exclude"

        cellDfInc <- cellDf[which(cellDf$criteria == "Include"), ]
        cellDfExc <- cellDf[which(cellDf$criteria == "Exclude"), ]

        cat("#### ", exprName, "\n")
        cat("\n")
        par(mfrow = c(2, 2))
        plot(cellDf$nCount, cellDf$nFeature, col = cellDf$Col, xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, nFeatureMax))
        plot(log10(cellDf$nCount + 1), log10(cellDf$nFeature + 1), col = cellDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax)), ylim = c(0, log10(nFeatureMax)))
        plot(cellDfInc$nCount, cellDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, nFeatureMax))
        points(cellDfExc$nCount, cellDfExc$nFeature, col = "red", pch = 20)
        legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
        plot(log10(cellDfInc$nCount + 1), log10(cellDfInc$nFeature + 1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax + 1)), ylim = c(0, log10(nFeatureMax)))
        points(log10(cellDfExc$nCount + 1), log10(cellDfExc$nFeature + 1), col = "red", pch = 20)
        legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
        cat("\n\n")
}

3.6.4.1 Cervical_Cancer_FFPE

3.6.4.2 Ovarian_Cancer_FFPE

3.6.4.3 Breast_Cancer_FFPE

3.6.5 Controls

3.6.5.1 Background signals

  • Exclude a cell when a background signal (proportion) is greater than 0.05
    • Proportion = Number of negative probe detected / Number of (Tx + Neg probe) detected
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]

        panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
        negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
        # negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))

        signalDf <- data.frame(
                CellID = colnames(expr),
                Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
                negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum)
                # negCodewords = apply(expr[negCodewordsIdx,], 2, sum)
        )
        signalDf <- signalDf[which(signalDf$negProbes != 0), ]
        signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)

        buff <- data.frame(
                Model = exprName,
                Cells = signalDf$CellID[which(signalDf$Prop > negProbePropThre)],
                Criteria = "High_background"
        )
        excludeCells <- rbind(excludeCells, buff)
}
excludeCells$Criteria <- factor(excludeCells$Criteria, levels=c("Large_cell_area", "Low_nCount", "High_background"))

buff1 <- excludeCells %>% group_by(Model, Criteria) %>% dplyr::count() %>% dplyr::rename(ExcludeCells = n) %>% dplyr::summarise(across(where(is.numeric), sum))
buff2 <- reshape2::dcast(buff1, formula = Model ~ Criteria)
excDf <- data.frame(
        Model = buff2$Model,
        Total_cells = as.numeric(summary[,"num_cells_detected"]),
        Large_cell_area = as.numeric(buff2$Large_cell_area),
        Low_nCount = as.numeric(buff2$Low_nCount),
        High_background = as.numeric(buff2$High_background),
        Remainders = as.numeric(summary[,"num_cells_detected"] - as.numeric(apply(buff2[,c(2:4)], 1, sum)))
)
rownames(excDf) <- excDf$Model
excDf <- excDf[regionNames,]
rownames(excDf) <- NULL
total <- as.numeric(apply(excDf[,c(2:6)], 2, sum))
totalDf <- data.frame(Model = "Total", Total_cells = total[1], Large_cell_area = total[2], Low_nCount = total[3], High_background = total[4], Remainders = total[5])
excDf <- rbind(excDf, totalDf)
knitr::kable(excDf)
Model Total_cells Large_cell_area Low_nCount High_background Remainders
Cervical_Cancer_FFPE 840,387 2,955 131,499 34 705,899
Ovarian_Cancer_FFPE 407,124 1,036 22,855 11 383,222
Breast_Cancer_FFPE 699,110 2,401 194,607 63 502,039
Total 1,946,621 6,392 348,961 108 1,591,160
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]

        panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
        negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))        

        signalDf <- data.frame(
                CellID = colnames(expr),
                Genes = apply(as.matrix(expr[panelGenesIdx, ]), 2, sum),
                negProbes = apply(as.matrix(expr[negProbesIdx, ]), 2, sum)
        )
        signalDf <- signalDf[which(signalDf$negProbes != 0), ]
        signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)

        cat("##### ", exprName, "\n")
        cat("\n")
        hist(signalDf$Prop, breaks = seq(0, max(signalDf$Prop) + 0.01, 0.01), xlab = "# Neg probes / Total detected molecules", sub = paste0(nrow(signalDf), " cells that capture at least one Neg Probe"), main = exprName)
        abline(v = negProbePropThre, col = "red", lty = 2)
        cat("\n\n")
}
3.6.5.1.1 Cervical_Cancer_FFPE

3.6.5.1.2 Ovarian_Cancer_FFPE

3.6.5.1.3 Breast_Cancer_FFPE

3.6.5.2 Positive probes

  • Does not apply any filters but check the distribution of the expression levels of the positive controls
  • Very few cells detected positive controls
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]

        panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
        negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
        posProbesIdx <- which(str_detect(rownames(expr), "^Intergenic_Region"))
        # negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))

        signalDf <- data.frame(
                CellID = colnames(expr),
                # Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
                negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum),
                posProbes = apply(as.matrix(expr)[posProbesIdx,], 2, sum)
        )
        
        cat("##### ", exprName, "\n")
        cat("\n")
        hist(signalDf$posProbes, xlab="Number of the positive controls detected", ylab="Number of cells", sub = paste0(length(which(signalDf$posProbes > 0)), " cells that capture at least one Pos Probe"), breaks = seq(0, max(signalDf$posProbes), 0.2), main = exprName)
        cat("\n\n")       
}
3.6.5.2.1 Cervical_Cancer_FFPE

3.6.5.2.2 Ovarian_Cancer_FFPE

3.6.5.2.3 Breast_Cancer_FFPE

3.6.6 Phred-like quality score distribution

  • 10X Genomics recommend to include Tx quality that is greater than 20 and the threshold in this study is 20
    • 20 (99%, an error rate of 1 in 100)
    • 30 (99.9%, an error rate of 1 in 1000)
    • 40 (99.99%, an error rate of 1 in 10000)
  • Low qual Tx are likely the negative controls
qvDf <- c()
for (idx in seq_along(regionOrder)) {
        txName <- names(txAssignedL)[idx]
        txAssigned <- txAssignedL[[idx]]

        buff <- data.frame(cut(txAssigned$qv, breaks = c(0, 20, 30, 40)) %>% table)
        buff$Sample <- txName

        qvDf <- rbind(qvDf, buff)
}
colnames(qvDf) <- c("Quality_score", "Freq", "Sample")
knitr::kable(reshape2::dcast(qvDf, Sample ~ Quality_score, value.var = c("Freq")))
Sample (0,20] (20,30] (30,40]
Breast_Cancer_FFPE 13,563,007 7,239,803 75,770,055
Cervical_Cancer_FFPE 11,476,814 3,222,037 85,099,348
Ovarian_Cancer_FFPE 15,916,628 3,520,096 112,105,752

3.6.7 QC Summary

  • Number of cells after QC for the downstream analysis
  • Analysis ready expression profiles (before and after QC)
exprL <- lapply(seq_along(exprL), function(idx) {
        expr <- exprL[[idx]]
        panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
        expr <- expr[panelGenesIdx, ]
        return(expr)
})
names(exprL) <- regionOrder

geneAnnot <- featuresAnnot$V2 # gene symbol
names(geneAnnot) <- featuresAnnot$V1 # ensembl 

arExprL <- lapply(seq_along(exprL), function(idx) { # analysis-ready expression profiles
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]
        exclude <- excludeCells[which(excludeCells$Sample == exprName),]

        if (any(colnames(expr) %in% exclude$Cells)) {
                expr <- expr[, -which(colnames(expr) %in% exclude$Cells)]
        }        
        rownames(expr) <- geneAnnot[rownames(expr)]
        return(expr)
})
names(arExprL) <- regionOrder
saveRDS(arExprL, file.path(outDir, "01_analysisReady_expression.RDS"))
Sample Total_cells Excluded_cells Proportion_exc Remainders
Cervical_Cancer_FFPE 840,387 134,488 0.1600 705,899
Ovarian_Cancer_FFPE 407,124 23,902 0.0587 383,222
Breast_Cancer_FFPE 699,110 197,071 0.2819 502,039
  • QC parameters used in this report
Filter name Threshold Description
nCount 20 Minimum number of detected Tx within a cell
negProbe proportion 0.05 Maximum proportion of the background signal
quality score 20 Minimum Tx Phred-like quality score
  • R environment
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
## 
## time zone: America/Edmonton
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] outliers_0.15         ggridges_0.5.6        ggplot2_3.5.1        
##  [4] stringr_1.5.1         dplyr_1.1.4           HDF5Array_1.32.1     
##  [7] rhdf5_2.48.0          DelayedArray_0.30.1   SparseArray_1.4.8    
## [10] S4Arrays_1.4.1        abind_1.4-8           IRanges_2.38.1       
## [13] S4Vectors_0.42.1      MatrixGenerics_1.16.0 matrixStats_1.4.1    
## [16] BiocGenerics_0.50.0   Matrix_1.7-0          arrow_17.0.0.1       
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5        xfun_0.47           bslib_0.8.0        
##  [4] lattice_0.22-6      rhdf5filters_1.16.0 vctrs_0.6.5        
##  [7] tools_4.4.1         generics_0.1.3      tibble_3.2.1       
## [10] fansi_1.0.6         highr_0.11          pkgconfig_2.0.3    
## [13] KernSmooth_2.23-24  assertthat_0.2.1    lifecycle_1.0.4    
## [16] compiler_4.4.1      farver_2.1.2        munsell_0.5.1      
## [19] htmltools_0.5.8.1   sass_0.4.9          yaml_2.3.10        
## [22] pillar_1.9.0        crayon_1.5.3        jquerylib_0.1.4    
## [25] cachem_1.1.0        tidyselect_1.2.1    digest_0.6.37      
## [28] stringi_1.8.4       reshape2_1.4.4      purrr_1.0.2        
## [31] labeling_0.4.3      fastmap_1.2.0       grid_4.4.1         
## [34] colorspace_2.1-1    cli_3.6.3           magrittr_2.0.3     
## [37] utf8_1.2.4          withr_3.0.1         scales_1.3.0       
## [40] bit64_4.0.5         rmarkdown_2.28      XVector_0.44.0     
## [43] bit_4.0.5           evaluate_1.0.0      knitr_1.48         
## [46] rlang_1.1.4         Rcpp_1.0.13         glue_1.8.0         
## [49] jsonlite_1.8.9      R6_2.5.1            Rhdf5lib_1.26.0    
## [52] plyr_1.8.9          zlibbioc_1.50.0