1 Introduction

ASOC generated this report includes stats and diagnostic plots of the four Xenium In Situ data with the 5k panel from 10X Genomics.

2 Prerequisite

2.1 Environment

  • OS: Rocky Linux 8.8
  • Platform: x86_64-conda-linux-gnu (64-bit)
  • Software: R (v4.4.1)
  • Packages:
    • outliers (v0.15)
    • ggplot2 (v3.5.1)
    • ggridges (v0.5.6)
    • stringr (v1.5.1)
    • dplyr (v1.1.4)
    • Matrix (v1.7-0)
    • yaml (v2.3.10)

2.2 Xenium In Situ data

  • FOVs (field of views): 455
  • Fresh Frozen Human Ovarian Adenocarcinoma with 5K Human Pan Tissue and Pathways Panel
    • Xenium Prime 5K In Situ Gene Expression with Cell Segmentation Staining data for human ovarian adenocarcinoma tissue (Fresh Frozen) using the Xenium Prime 5K Human Pan Tissue and Pathways Panel.
  • Slide ID: N/A
  • Preparation method: FF
  • Number of panel genes: 5,001 (Xenium Human 5K Pan Tissue & Pathways Panel, hAtlas_v1.1)
  • Files: feature.tsv.gz, cell_feature_matrix.h5, cells.parquet, metrics_summary.csv, and transcripts.parquet (see below)
WORKING_DIRECTORY
└── Xenium_Prime_SAMPLE_outs
    ├── cell_feature_matrix
    │   └── features.tsv.gz
    ├── cell_feature_matrix.h5
    ├── cells.parquet
    ├── metrics_summary.csv
    └── transcripts.parquet

3 Preprocessing/QC

3.1 Xenium Ranger output

3.2 Preparation

  • Set working directory to load files
# Do not run
baseDir <- "WORKING_DIRECTORY" # where the output-* folders are located
setwd(baseDir)
outDir <- file.path(baseDir, "Results")
library(arrow)
library(HDF5Array)
library(dplyr)
library(stringr)
library(ggplot2)
library(ggridges)
library(outliers)

3.3 Load Xenium data (Step 1)

outputFolders <- list.files(path = baseDir, pattern = "outs")
outputFolders
## [1] "Xenium_Prime__Human_Ovary_FF__outs"
# Summary
if (file.exists(file.path(outDir, "01_summary.RDS"))) {
        summary <- readRDS(file.path(outDir, "01_summary.RDS"))
} else {
        summaryL <- lapply(seq_along(outputFolders), function(idx) {
                outputFolder <- outputFolders[idx]
                summary <- read.csv(file.path(baseDir, outputFolder, "metrics_summary.csv"))
                return(summary)
        })
        summary <- Reduce(rbind, summaryL)
        saveRDS(summary, file.path(outDir, "01_summary.RDS"))
}

# Cells
if (file.exists(file.path(outDir, "02_cells.RDS"))) {
        cells <- readRDS(file.path(outDir, "02_cells.RDS"))
} else {
        cellsL <- lapply(seq_along(outputFolders), function(idx) {
                outputFolder <- outputFolders[idx]
                cells <- read_parquet(file.path(baseDir, outputFolder, "cells.parquet"), as_data_frame = TRUE)
                cells$region <- sapply(str_split(outputFolder, "__"), "[[", 2)
                return(cells)
        })
        cells <- Reduce(rbind, cellsL)
        cells$region <- factor(cells$region, levels = regionNames)
        saveRDS(cells, file.path(outDir, "02_cells.RDS"))
}

# Transcripts
if (file.exists(file.path(outDir, "03_txL.RDS"))) {
        txL <- readRDS(file.path(outDir, "03_txL.RDS"))
} else {
        txL <- lapply(seq_along(outputFolders), function(idx) {
                outputFolder <- outputFolders[idx]
                tx <- read_parquet(file.path(baseDir, outputFolder, "transcripts.parquet"), as_data_frame = TRUE)
                return(tx)
        })
        names(txL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
        txL <- txL[regionNames]
        saveRDS(txL, file.path(outDir, "03_txL.RDS"))
}

# Expression profile
if (file.exists(file.path(outDir, "04_exprL.RDS"))) {
        exprL <- readRDS(file.path(outDir, "04_exprL.RDS"))
} else {
        exprL <- lapply(seq_along(outputFolders), function(idx) {
                outputFolder <- outputFolders[idx]
                mat <- TENxMatrix(file.path(baseDir, outputFolder, "cell_feature_matrix.h5"), "matrix")
                return(mat)
        })
        names(exprL) <- sapply(str_split(outputFolders, "__"), "[[", 2)
        exprL <- exprL[regionNames]
        saveRDS(exprL, file.path(outDir, "04_exprL.RDS"))
}

# Feature annotation
if (file.exists(file.path(outDir, "05_featuresAnnot.RDS"))) {
        featuresAnnot <- readRDS(file.path(outDir, "05_featuresAnnot.RDS"))
} else {
        featuresAnnot <- read.delim(file.path(baseDir, outputFolders[1], "cell_feature_matrix", "features.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)
        saveRDS(featuresAnnot, file.path(outDir, "05_featuresAnnot.RDS"))
}

3.4 Stats at a glance

Region Region_area Cell_area Cells_detected High_qual_Tx Tx_per_100um2 FOVs
Human_Ovary_FF 198,118,228 112,246,727 1,157,659 2,163,771,250 1720.60 455

3.5 Exclude unassigned Tx (Step 2)

  • The following capture is an example case to show Tx that are unassigned
if (file.exists(file.path(outDir, "06_txAssignedL.RDS"))) {
        txAssignedL <- readRDS(file.path(outDir, "06_txAssignedL.RDS"))
} else {
        txAssignedL <- lapply(seq_along(outputFolders), function(idx) {
                outputFolder <- outputFolders[idx]
                tx <- read_parquet(file.path(baseDir, outputFolder, "out.parquet"), as_data_frame = TRUE)
                return(tx)
        })
        # txAssignedL <- lapply(seq_along(txL), function(idx) {
        #         tx <- txL[[idx]]
        #         return(tx[which(tx$cell_id != "UNASSIGNED"), ])
        # })
        names(txAssignedL) <- regionNames
        saveRDS(txAssignedL, file.path(outDir, "06_txAssignedL.RDS"))
}
Model Total AssignedTx UnAssignedTx Proportion
Human_Ovary_FF 2,698,510,033 2,419,097,382 279,412,651 0.1035

3.6 QC (Step 3)

3.6.1 Cell area and detected Tx

  • The following violin plots show the distribution of the cell area or detected Tx across regions
    • X-axis: sample
    • Y-axis: area in micrometer square or total detected Tx
    • Dot: cell
ggplot(data = cells, aes(x = region, y = log10(cell_area + 1), fill = region)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill = "white") +
        scale_fill_manual(values = regionCols) +
        labs(
                x = "",
                y = "Cell area, log10"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none",
                text = element_text(size = 12)
        )

ggplot(data = cells, aes(x = region, y = log10(transcript_counts + 1), fill = region)) +
        geom_violin(position = dodge, size = 0) +
        geom_boxplot(width = 0.1, position = dodge, fill = "white") +
        scale_fill_manual(values = regionCols) +
        labs(
                x = "",
                y = "Tx counts, log10"
        ) +
        theme_bw() +
        theme(
                axis.line = element_line(colour = "black"),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                panel.border = element_blank(),
                panel.background = element_blank(),
                axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
                legend.position = "none",
                text = element_text(size = 12)
        )

3.6.2 Detect ouliters in terms of area

  • Detect under segmentated (when more than one cells segmented as one) cells using Grubb’s test
  • A given area of the cell is an outlier when Grubb’s test p-value is less than 0.05
  • Note that a centroid diameter, or cell diameter, is preset in Xenium for a specific tissue type
Model Total_cells Excluded Remainders
Human_Ovary_FF 1,157,659 1,014 1,156,645
for (region in regionNames) {
        subCells <- cells[which(cells$region == region), ]
        subCells$cell_area <- round(subCells$cell_area, 3)
        testRes <- grubbsTestRec(subCells$cell_area)
        areaThreshold <- max(testRes$Area[!testRes$Outlier])
        
        cat("#### ", region, "\n")
        cat("\n")
        plot(density(testRes$Area), sub="Cell area, µm2", main="")
        abline(v = areaThreshold, col="red", lty=2)
        cat("\n\n")
}

3.6.2.1 Human_Ovary_FF

3.6.3 Count and Features per cell

  • Exclude cells with few (<20) Tx detected
  • The following scatter plots show the relationship between nCount and nFeature in linear (L) or log-scale (R)
    • X-axis: nCount (total number of detected Tx within a cell, depth)
    • Y-axis: nFeature (number of detected Tx in each cell, coverage)
    • Dot: cell
Model Total_cells Large_cell_area Low_nCount Remainders
Human_Ovary_FF 1,157,659 1,014 2,918 1,153,727
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]
        panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
        expr <- expr[panelGenesIdx, ]

        cellDf <- data.frame(
                nCount = apply(as.matrix(expr), 2, sum),
                nFeature = apply(as.matrix(expr), 2, function(x) length(which(x > 0)))
        )
        cellDf <- densityColors(df = cellDf, cols = heatCols)

        cellDf$criteria <- "Include"
        cellDf$criteria[which(cellDf$nCount < nCountThre)] <- "Exclude"

        cellDfInc <- cellDf[which(cellDf$criteria == "Include"), ]
        cellDfExc <- cellDf[which(cellDf$criteria == "Exclude"), ]

        cat("#### ", exprName, "\n")
        cat("\n")
        par(mfrow = c(2, 2))
        plot(cellDf$nCount, cellDf$nFeature, col = cellDf$Col, xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, nFeatureMax))
        plot(log10(cellDf$nCount + 1), log10(cellDf$nFeature + 1), col = cellDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax)), ylim = c(0, log10(nFeatureMax)))
        plot(cellDfInc$nCount, cellDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch = 20, xlim = c(0, nCountMax), ylim = c(0, max(cellDfInc$nFeature)))
        points(cellDfExc$nCount, cellDfExc$nFeature, col = "red", pch = 20)
        legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
        plot(log10(cellDfInc$nCount + 1), log10(cellDfInc$nFeature + 1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch = 20, xlim = c(0, log10(nCountMax + 1)), ylim = c(0, log10(nFeatureMax)))
        points(log10(cellDfExc$nCount + 1), log10(cellDfExc$nFeature + 1), col = "red", pch = 20)
        legend("topleft", legend = c("Include", "Exclude"), fill = c("grey60", "red"), bty = "n")
        cat("\n\n")
}

3.6.3.1 Human_Ovary_FF

3.6.4 Controls

3.6.4.1 Background signals

  • Exclude a cell when a background signal (proportion) is greater than 0.05
    • Proportion = Number of negative probe detected / Number of (Tx + Neg probe) detected
if (file.exists(file.path(outDir, "14_excludeCells.RDS"))) {
        excludeCells <- readRDS(file.path(outDir, "14_excludeCells.RDS"))
} else {
                for (idx in seq_along(regionNames)) {
                exprName <- names(exprL)[idx]
                expr <- exprL[[idx]]

                panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
                negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
                # negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))

                signalDf <- data.frame(
                        CellID = colnames(expr),
                        Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
                        negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum)
                        # negCodewords = apply(expr[negCodewordsIdx,], 2, sum)
                )
                signalDf <- signalDf[which(signalDf$negProbes != 0), ]
                signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)

                buff <- data.frame(
                        Model = exprName,
                        Cells = signalDf$CellID[which(signalDf$Prop > negProbePropThre)],
                        Criteria = "High_background"
                )
                excludeCells <- rbind(excludeCells, buff)
        }
        excludeCells$Criteria <- factor(excludeCells$Criteria, levels=c("Large_cell_area", "Low_nCount", "High_background"))
        saveRDS(excludeCells, file.path(outDir, "14_excludeCells.RDS"))
}

if (file.exists(file.path(outDir, "15_excDf.RDS"))) {
        excDf <- readRDS(file.path(outDir, "15_excDf.RDS"))
} else { 
        buff1 <- excludeCells %>% group_by(Model, Criteria) %>% dplyr::count() %>% dplyr::rename(ExcludeCells = n) %>% dplyr::summarise(across(where(is.numeric), sum))
        buff2 <- reshape2::dcast(buff1, formula = Model ~ Criteria)
        excDf <- data.frame(
                Model = buff2$Model,
                Total_cells = as.numeric(summary[,"num_cells_detected"]),
                Large_cell_area = as.numeric(buff2$Large_cell_area),
                Low_nCount = as.numeric(buff2$Low_nCount),
                High_background = as.numeric(buff2$High_background),
                Remainders = as.numeric(summary[,"num_cells_detected"] - as.numeric(apply(buff2[,c(2:4)], 1, sum)))
        )
        rownames(excDf) <- excDf$Model
        excDf <- excDf[regionNames,]
        rownames(excDf) <- NULL
        total <- as.numeric(apply(excDf[,c(2:6)], 2, sum))
        totalDf <- data.frame(Model = "Total", Total_cells = total[1], Large_cell_area = total[2], Low_nCount = total[3], High_background = total[4], Remainders = total[5])
        excDf <- rbind(excDf, totalDf)
        saveRDS(excDf, file.path(outDir, "15_excDf.RDS"))
}
knitr::kable(excDf)
Model Total_cells Large_cell_area Low_nCount High_background Remainders
Human_Ovary_FF 1,157,659 1,014 2,918 2 1,153,725
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]

        panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
        negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))        

        signalDf <- data.frame(
                CellID = colnames(expr),
                Genes = apply(as.matrix(expr[panelGenesIdx, ]), 2, sum),
                negProbes = apply(as.matrix(expr[negProbesIdx, ]), 2, sum)
        )
        signalDf <- signalDf[which(signalDf$negProbes != 0), ]
        signalDf$Prop <- signalDf$negProbes / (signalDf$Genes + signalDf$negProbes)

        cat("##### ", exprName, "\n")
        cat("\n")
        hist(signalDf$Prop, breaks = seq(0, max(signalDf$Prop) + 0.01, 0.01), xlab = "# Neg probes / Total detected molecules", sub = paste0(nrow(signalDf), " cells that capture at least one Neg Probe"), main = exprName)
        abline(v = negProbePropThre, col = "red", lty = 2)
        cat("\n\n")
}
3.6.4.1.1 Human_Ovary_FF

3.6.4.2 Positive probes

  • Does not apply any filters but check the distribution of the expression levels of the positive controls
  • Very few cells detected positive controls
for (idx in seq_along(regionNames)) {
        exprName <- names(exprL)[idx]
        expr <- exprL[[idx]]

        panelGenesIdx <- which(str_detect(rownames(expr), "^ENS"))
        negProbesIdx <- which(str_detect(rownames(expr), "^NegControlProbe"))
        posProbesIdx <- which(str_detect(rownames(expr), "^Intergenic_Region"))
        # negCodewordsIdx <- which(str_detect(rownames(expr), "^NegControlCodeword"))

        signalDf <- data.frame(
                CellID = colnames(expr),
                # Genes = apply(as.matrix(expr)[panelGenesIdx, ], 2, sum),
                negProbes = apply(as.matrix(expr)[negProbesIdx, ], 2, sum),
                posProbes = apply(as.matrix(expr)[posProbesIdx,], 2, sum)
        )
        
        cat("##### ", exprName, "\n")
        cat("\n")
        hist(signalDf$posProbes, xlab="Number of the positive controls detected", ylab="Number of cells", sub = paste0(length(which(signalDf$posProbes > 0)), " cells that capture at least one Pos Probe"), breaks = seq(0, max(signalDf$posProbes), 0.2), main = exprName)
        cat("\n\n")       
}
3.6.4.2.1 Human_Ovary_FF

3.6.5 Phred-like quality score distribution

  • 10X Genomics recommend to include Tx quality that is greater than 20 and the threshold in this study is 20
    • 20 (99%, an error rate of 1 in 100)
    • 30 (99.9%, an error rate of 1 in 1000)
    • 40 (99.99%, an error rate of 1 in 10000)
  • Low qual Tx are likely the negative controls
if (file.exists(file.path(outDir, "16_qvDf.RDS"))) {
        qvDf <- readRDS(file.path(outDir, "16_qvDf.RDS"))
} else {
        qvDf <- c()
        for (idx in seq_along(regionOrder)) {
                txName <- names(txAssignedL)[idx]
                txAssigned <- txAssignedL[[idx]]

                buff <- data.frame(cut(txAssigned$qv, breaks = c(0, 20, 30, 40)) %>% table)
                buff$Sample <- txName

                qvDf <- rbind(qvDf, buff)
        }
        colnames(qvDf) <- c("Quality_score", "Freq", "Sample")
        saveRDS(qvDf, file.path(outDir, "16_qvDf.RDS"))
}
knitr::kable(reshape2::dcast(qvDf, Sample ~ Quality_score, value.var = c("Freq")))
Sample (0,20] (20,30] (30,40]
Human_Ovary_FF 490,810,495 788,861,443 1,139,425,444

3.6.6 QC Summary

  • Number of cells after QC for the downstream analysis
  • Analysis ready expression profiles (before and after QC)
if (file.exists(file.path(outDir, "17_exprL.RDS"))) {
        exprL <- readRDS(file.path(outDir, "17_exprL.RDS"))
} else {
        exprL <- lapply(seq_along(exprL), function(idx) {
                expr <- exprL[[idx]]
                panelGenesIdx <- which(str_detect(rownames(expr), "ENS"))
                expr <- expr[panelGenesIdx, ]
                return(expr)
        })
        names(exprL) <- regionOrder
        saveRDS(exprL, file.path(outDir, "17_exprL.RDS"))
}

if (file.exists(file.path(outDir, "18_analysisReady_expression.RDS"))){
        arExprL <- readRDS(file.path(outDir, "18_analysisReady_expression.RDS"))
} else {
        geneAnnot <- featuresAnnot$V2 # gene symbol
        names(geneAnnot) <- featuresAnnot$V1 # ensembl 

        arExprL <- lapply(seq_along(exprL), function(idx) { # analysis-ready expression profiles
                exprName <- names(exprL)[idx]
                expr <- exprL[[idx]]
                exclude <- excludeCells[which(excludeCells$Sample == exprName),]

                if (any(colnames(expr) %in% exclude$Cells)) {
                        expr <- expr[, -which(colnames(expr) %in% exclude$Cells)]
                }        
                rownames(expr) <- geneAnnot[rownames(expr)]
                return(expr)
        })
        names(arExprL) <- regionOrder
        saveRDS(arExprL, file.path(outDir, "18_analysisReady_expression.RDS"))
}
Sample Total_cells Excluded_cells Proportion_exc Remainders
Human_Ovary_FF 1,157,659 3,934 0.0034 1,153,725
  • QC parameters used in this report
Filter name Threshold Description
nCount 20 Minimum number of detected Tx within a cell
negProbe proportion 0.05 Maximum proportion of the background signal
quality score 20 Minimum Tx Phred-like quality score
  • R environment
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-conda-linux-gnu
## Running under: Rocky Linux 8.8 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /home/heewon.seo/.local/miniconda3/envs/xenium/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Edmonton
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] outliers_0.15         ggridges_0.5.6        ggplot2_3.5.1        
##  [4] stringr_1.5.1         dplyr_1.1.4           HDF5Array_1.32.1     
##  [7] rhdf5_2.48.0          DelayedArray_0.30.1   SparseArray_1.4.8    
## [10] S4Arrays_1.4.1        abind_1.4-8           IRanges_2.38.1       
## [13] S4Vectors_0.42.1      MatrixGenerics_1.16.0 matrixStats_1.4.1    
## [16] BiocGenerics_0.50.0   Matrix_1.6-5          arrow_17.0.0         
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5        xfun_0.48           bslib_0.8.0        
##  [4] lattice_0.22-6      rhdf5filters_1.16.0 vctrs_0.6.5        
##  [7] tools_4.4.1         generics_0.1.3      tibble_3.2.1       
## [10] fansi_1.0.6         highr_0.11          pkgconfig_2.0.3    
## [13] KernSmooth_2.23-24  assertthat_0.2.1    lifecycle_1.0.4    
## [16] compiler_4.4.1      farver_2.1.2        munsell_0.5.1      
## [19] htmltools_0.5.8.1   sass_0.4.9          yaml_2.3.10        
## [22] pillar_1.9.0        crayon_1.5.3        jquerylib_0.1.4    
## [25] cachem_1.1.0        tidyselect_1.2.1    digest_0.6.37      
## [28] stringi_1.8.4       reshape2_1.4.4      purrr_1.0.2        
## [31] labeling_0.4.3      fastmap_1.2.0       grid_4.4.1         
## [34] colorspace_2.1-1    cli_3.6.3           magrittr_2.0.3     
## [37] utf8_1.2.4          withr_3.0.1         scales_1.3.0       
## [40] bit64_4.5.2         rmarkdown_2.28      XVector_0.44.0     
## [43] bit_4.5.0           evaluate_1.0.1      knitr_1.48         
## [46] rlang_1.1.4         Rcpp_1.0.13         glue_1.8.0         
## [49] jsonlite_1.8.9      R6_2.5.1            Rhdf5lib_1.26.0    
## [52] plyr_1.8.9          zlibbioc_1.50.0