1 Introduction

This report includes stats and diagnostic plots of the Nanostring CosMx Spatial Molecular Imager (SMI) data. It was generated in the CosMx training session with two Tonsil samples. The QC metrics and workflows were adopted from the paper (https://doi.org/10.1038/s41467-023-43458-x) where the Authors analyzed 10X Genomics Xenium In Situ data.

2 Prerequisite

2.1 Environment

  • OS: macOS Sonoma 14.0
  • Platform: aarch64-apple-darwin20 (64-bit)
  • Software: R (v4.3.1)
  • Pakcages: Seurat (v5.0.1), ggplot2 (v3.5.0), ggridges (v0.5.4), ggpubr (v0.6.0), stringr (v1.5.1), reshape2 (v1.4.4)

2.2 CosMx SMI Data: Tonsil

  • Slide: Tonsil_1 â—¼ and Tonsil_2 â—¼

3 Preprocessing/QC

3.1 Preparation

  • Set working directory to load files
# Do not run
baseDir <- "WORKING_DIRECTORY" # where the CosMx folders/files are located
setwd(baseDir)
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(RColorBrewer))

3.2 Load files (Step 1)

if (file.exists(file.path(outDir, "01_rawFlats.CosMx.RDS"))) {
        rawFlatL <- readRDS(file.path(outDir, "01_rawFlats.CosMx.RDS"))
} else {
        rawFlatL <- lapply(seq_along(slideOrder), function(idx) {
                slidePath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx])
                # LoadNanostring function was imported from the Seurat package
                slideObj <- LoadNanostring(data.dir = slidePath, fov = slideOrder[idx])
                return(slideObj)
        })
        names(rawFlatL) <- slideOrder
        saveRDS(rawFlatL, file.path(outDir, "01_rawFlats.CosMx.RDS"))
}
rawFlatL
## $Tonsil_1
## An object of class Seurat 
## 1207 features across 215312 samples within 1 assay 
## Active assay: Nanostring (1207 features, 0 variable features)
##  1 layer present: counts
##  1 spatial field of view present: Tonsil_1
## 
## $Tonsil_2
## An object of class Seurat 
## 1207 features across 218421 samples within 1 assay 
## Active assay: Nanostring (1207 features, 0 variable features)
##  1 layer present: counts
##  1 spatial field of view present: Tonsil_2
if (file.exists(file.path(outDir, "02_metadata.CosMx.RDS"))) {
        metadataL <- readRDS(file.path(outDir, "02_metadata.CosMx.RDS"))
} else {
        metadataL <- lapply(seq_along(slideOrder), function(idx) {
                metadataPath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx], paste0(slideOrder[idx], "_metadata_file.csv.gz"))                
                metadata <- read.csv(metadataPath, header=T)
                return(metadata)
        })
        names(metadataL) <- slideOrder
        saveRDS(metadataL, file.path(outDir, "02_metadata.CosMx.RDS"))
}

3.3 Raw data quality check (Step 2)

metadataDf <- do.call("rbind", metadataL)
metadataDf <- metadataDf[order(metadataDf$fov, metadataDf$cell_id, metadataDf$slide_ID),]
metadataDf$fov <- factor(metadataDf$fov, levels=unique(metadataDf$fov))
metadataDf$Run_Tissue_name <- factor(metadataDf$Run_Tissue_name, levels=names(slideCols))
mSubDf <- metadataDf[,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]

fovstatsL <- lapply(seq_along(metadataL), function(idx) {
        metadata <- metadataL[[idx]][,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]
        fovstats <- c()
        for (fov in unique(metadata$fov)) {
                buff <- metadata[which(metadata$fov == fov),]
                ncells <- nrow(buff)
                ncount <- sum(buff$nCount_RNA) / ncells
                nfeature <- sum(buff$nFeature_RNA) / ncells
                area <- sum(buff$Area) / ncells
                fovstats <- rbind(fovstats, c(idx, fov, ncells, ncount, nfeature, area))
        }
        return(fovstats)
})
fovstatsDf <- do.call("rbind", fovstatsL)
fovstatsDf <- apply(fovstatsDf, 2, as.numeric)
fovstatsDf <- as.data.frame(fovstatsDf)
colnames(fovstatsDf) <- c("Slide", "FOV", "nCells", "nCount/cell", "nFeature/cell", "Area/cell")
fovstatsDf$Slide <- factor(fovstatsDf$Slide, levels=seq_along(metadataL), labels=names(slideCols))

3.3.1 Number of Counts/Features/Area acorss FOVs

3.3.1.1 nCount

ridgePlot(df = mSubDf, x = "nCount_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.2 nFeature

ridgePlot(df = mSubDf, x = "nFeature_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.3 Area

ridgePlot(df = mSubDf, x = "Area", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.2 Number of Counts/Features/Area per cell (per Cell)

3.3.2.1 nCount

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCount/cell", label = "nCount / cell", cols = slideCols)

3.3.2.2 nFeature

perFovPlot(df = fovstatsDf, x = "Slide", y = "nFeature/cell", label = "nFeature / cell", cols = slideCols)

3.3.2.3 Area

perFovPlot(df = fovstatsDf, x = "Slide", y = "Area/cell", label = "Area / cell", cols = slideCols)

3.3.2.4 No. cells per slide

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCells", label = "Number of cells", cols = slideCols)

3.3.3 Calculate residual standard deviation (RSD) in each FOV

rsdL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        slide <- rawFlatL[[idx]]

        mat <- t(as.matrix(slide@assays$Nanostring$counts)) # TO-DOs: optimization
        columnids <- data.frame(
                fov = sapply(str_split(rownames(mat), "_"), "[[", 2),
                cellid = sapply(str_split(rownames(mat), "_"), "[[", 1)
        )
        fovMat <- c()
        for (fov in sort(unique(columnids$fov))) {
                idx <- which(columnids$fov == fov)
                sumMat <- apply(mat[idx, ], 2, sum)
                fovMat <- rbind(fovMat, sumMat)
        }
        rownames(fovMat) <- sort(unique(columnids$fov))

        negIdx <- which(str_detect(colnames(fovMat), "^Neg"))
        if (length(negIdx) > 0) {
                fovMat <- fovMat[,-negIdx]
        }
        sysIdx <- which(str_detect(colnames(fovMat), "^Sys"))
        if (length(sysIdx) > 0) {
                fovMat <- fovMat[,-sysIdx]
        }
        chisq <- chisq.test(fovMat)
        residual <- chisq$residuals
        resSD <- apply(residual, 1, sd)

        return(data.frame(Slide = slidename, FOV =  names(resSD), RSD = as.numeric(resSD)))
})
names(rsdL) <- names(slideCols)
residualSD <- do.call("rbind", rsdL)
residualSD$Slide <- factor(residualSD$Slide, levels=names(slideCols))
  • RSD distribution per slide (each dot is a FOV)
perFovPlot(df = residualSD, x = "Slide", y = "RSD", label = "Residual SD", cols = slideCols)

3.3.4 Cumulative distributions of the Counts/Features/Area

  • A FOV with a smaller RSD is likely a problematic one due to (partial) out of focus blur with under segmentation (more than one cells segmented as one)
  • Note that RSD is not validated so reviewing the flagged FOV(s) is strongly recommended
  • The bottom 10% FOVs based on RSD marked in red
lowRsdFovs <- lapply(seq_along(rsdL), function(idx) {
        slidename <- names(rsdL)[idx]
        rsd <- rsdL[[idx]]
        return(rsd$FOV[which(rsd$RSD < quantile(rsd$RSD, rsdPerc))])
})

3.3.4.1 Count

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggCount <- ecdfPlot(df = tmpDf, x = "nCount_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggCount)
})
print(ecdfL)

3.3.4.2 Feature

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggFeature <- ecdfPlot(df = tmpDf, x = "nFeature_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggFeature)
})
print(ecdfL)

3.3.4.3 Area

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggArea <- ecdfPlot(df = tmpDf, x = "Area", y = "fov", cols = fovCols, label = slide)

        return(ggArea)
})
print(ecdfL)

3.4 AtoMx quality control (Step 3)

  • Load the Seurat Object with QC Flags
objFileName <- list.files(file.path(baseDir, paste0("SeuratObjectAfterQC_", projectTag)), pattern = ".RDS", full.names = T)
seuratObjfile <- readRDS(objFileName)
sMetadata <- seuratObjfile@meta.data
seuratObjfile
## An object of class Seurat 
## 2207 features across 433733 samples within 4 assays 
## Active assay: RNA (1000 features, 0 variable features)
##  2 layers present: counts, data
##  3 other assays present: falsecode, negprobes, RNA_normalized_086d7349.c603.4808.9ae6.52c504d0ac3b_1
##  2 dimensional reductions calculated: approximatepca_94c48860.6aee.4a27.8b15.53e99c25f2cd_1, approximateumap_f77f0d93.45a0.4469.ba27.1f6939e6c1cd_1
qcFlagsFOV <- sMetadata[which(sMetadata$qcFlagsFOV == "Fail"),]
qcFlagsCellCounts <- sMetadata[which(sMetadata$qcFlagsCellCounts == "Fail"),]
qcFlagsCellPropNeg <- sMetadata[which(sMetadata$qcFlagsCellPropNeg == "Fail"),]
qcFlagsCellArea <- sMetadata[which(sMetadata$qcFlagsCellArea == "Fail"),]
# qcFlagsRNACounts <- sMetadata[which(sMetadata$qcFlagsRNACounts == "Fail"),]
# qcFlagsCellComplex <- sMetadata[which(sMetadata$qcFlagsCellComplex == "Fail"),]

failMetadata <- sMetadata[Reduce(union, list(
        which(sMetadata$qcFlagsRNACounts == "Fail"), 
        which(sMetadata$qcFlagsCellCounts == "Fail"), 
        which(sMetadata$qcFlagsCellPropNeg == "Fail"), 
        which(sMetadata$qcFlagsCellComplex == "Fail"), 
        which(sMetadata$qcFlagsCellArea == "Fail"), 
        which(sMetadata$qcFlagsFOV == "Fail"))),
]

3.4.1 QC results on AtoMx

  • Criteria
    • FOV QC: FOVs with low expression. The MEAN method involves filtering out FOVs where the total count per cell averages below a threshold (default: 100)
    • Cell QC 1: Minimum counts per cell (default: 20)
    • Cell QC 2: Flags cells with negative counts greater than this proportion of total counts (default: 0.1)
    • Cell QC 3: Grubb's test p-value to flag outliers cells based on cell area (default: 0.01)
  • Results
    • Total: 9824 / 433733 cell(s) excluded from 2 slide(s)
    • FOV QC: Tonsil_1$40 excluded
    • Cell QC 1: 5566 cell(s) excluded
    • Cell QC 2: 24 cell(s) excluded
    • Cell QC 3: 0 cell(s) excluded
  • Notes on QC (as of March 11th, 2024)
    • There is no single guideline on QC threshold values so default values for the 1000-plex were used on AtoMx
    • Four more parameters were used in the QC step, however, no relevant columns were exported from AtoMx
      • Negative Prove QC
      • Cell count distribution
      • Neg Control Probe Quantile Cufoff (Target QC)
      • Detection (Target QC)
    • User Manual is incomplete and two columns in the Seurat object below don’t seem to directly match with any of the criteria above
      • qcFlagsCellComplex
      • qcFlagsRNACounts
qcDfL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        metadata <- metadataL[[idx]]
        
        metadata$Filter <- "Included"
        metadata$Filter[which(metadata$cell %in% rownames(failMetadata))] <- "Excluded"

        qcDf <- data.frame(
                id = metadata$cell,
                Slide = slidename,
                FOV = as.numeric(metadata$fov),
                CellId = as.numeric(metadata$cell_ID),
                nCount = as.numeric(metadata$nCount_RNA),
                nFeature = as.numeric(metadata$nFeature_RNA),
                Area = as.numeric(metadata$Area),
                Group = metadata$Filter
        )
        return(qcDf)
})

qcDf <- do.call("rbind", qcDfL)
qcDf <- qcDf[order(qcDf$FOV, qcDf$CellId, qcDf$Slide),]
qcDf$FOV <- factor(qcDf$FOV, levels=sort(unique(qcDf$FOV)))
qcDf$Group <- factor(qcDf$Group, levels=c("Included", "Excluded"))

3.4.1.1 Count and Feature

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.1.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$nFeature, col = subQcDf$Col, xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main=slide)
        plot(subDfInc$nCount, subDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main="")
        points(subDfExc$nCount, subDfExc$nFeature, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.1.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$nFeature+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$nFeature+1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$nFeature+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2 Count and Area

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.2.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$Area, col = subQcDf$Col, xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nCount, subDfInc$Area, col = "grey60", xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nCount, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.3 Feature and Area

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.3.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(subQcDf$nFeature, subQcDf$Area, col = subQcDf$Col, xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nFeature, subDfInc$Area, col = "grey60", xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nFeature, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}

3.4.1.3.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nFeature+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nFeature+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nFeature+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}