1 Introduction

This report includes stats and diagnostic plots of the Nanostring CosMx Spatial Molecular Imager (SMI) data.

Nanostring generously shared a Pancreas data which was generated using CosMx SMI with the CosMx Human Whole Transcriptome panel (18,946 plex, pre-commercial version) and I downloaded the data from the Nanostring Web to generate the following QC report.

2 Prerequisite

2.1 Environment

  • OS: macOS Sonoma 14.0
  • Platform: aarch64-apple-darwin20 (64-bit)
  • Software: R (v4.3.1)
  • Pakcages: Seurat (v5.0.1), ggplot2 (v3.5.0), ggridges (v0.5.4), ggpubr (v0.6.0), stringr (v1.5.1), reshape2 (v1.4.4)

2.2 CosMx SMI Data: Pancreas

  • Slide: Pancreas_1 â—¼
  • #FOVs: 18
  • #Cells: 48,944
  • #Probes: 18,946 probes, 50 negative probes, and 2,735 system probes

3 Preprocessing/QC

3.1 Preparation

  • Set working directory to load files
# Do not run
baseDir <- "WORKING_DIRECTORY" # where the CosMx folders/files are located
setwd(baseDir)
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(RColorBrewer))

3.2 Load files (Step 1)

if (file.exists(file.path(outDir, "01_rawFlats.CosMx.RDS"))) {
        rawFlatL <- readRDS(file.path(outDir, "01_rawFlats.CosMx.RDS"))
} else {
        rawFlatL <- lapply(seq_along(slideOrder), function(idx) {
                slidePath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx])
                # LoadNanostring function was imported from the Seurat package
                slideObj <- LoadNanostring(data.dir = slidePath, fov = slideOrder[idx])
                return(slideObj)
        })
        names(rawFlatL) <- slideOrder
        saveRDS(rawFlatL, file.path(outDir, "01_rawFlats.CosMx.RDS"))
}
rawFlatL
## $Pancreas_1
## An object of class Seurat 
## 21731 features across 48944 samples within 1 assay 
## Active assay: Nanostring (21731 features, 0 variable features)
##  1 layer present: counts
##  1 spatial field of view present: Pancreas_1
if (file.exists(file.path(outDir, "02_metadata.CosMx.RDS"))) {
        metadataL <- readRDS(file.path(outDir, "02_metadata.CosMx.RDS"))
} else {
        metadataL <- lapply(seq_along(slideOrder), function(idx) {
                metadataPath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx], paste0(slideOrder[idx], "_metadata_file.csv.gz"))                
                metadata <- read.csv(metadataPath, header=T)
                return(metadata)
        })
        names(metadataL) <- slideOrder
        saveRDS(metadataL, file.path(outDir, "02_metadata.CosMx.RDS"))
}

3.3 Raw data quality check (Step 2)

metadataDf <- do.call("rbind", metadataL)
metadataDf <- metadataDf[order(metadataDf$fov, metadataDf$cell_id, metadataDf$slide_ID),]
metadataDf$fov <- factor(metadataDf$fov, levels=unique(metadataDf$fov))
mSubDf <- metadataDf[,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]

fovstatsL <- lapply(seq_along(metadataL), function(idx) {
        metadata <- metadataL[[idx]][,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]
        fovstats <- c()
        for (fov in unique(metadata$fov)) {
                buff <- metadata[which(metadata$fov == fov),]
                ncells <- nrow(buff)
                ncount <- sum(buff$nCount_RNA) / ncells
                nfeature <- sum(buff$nFeature_RNA) / ncells
                area <- sum(buff$Area) / ncells
                fovstats <- rbind(fovstats, c(idx, fov, ncells, ncount, nfeature, area))
        }
        return(fovstats)
})
fovstatsDf <- do.call("rbind", fovstatsL)
fovstatsDf <- apply(fovstatsDf, 2, as.numeric)
fovstatsDf <- as.data.frame(fovstatsDf)
colnames(fovstatsDf) <- c("Slide", "FOV", "nCells", "nCount/cell", "nFeature/cell", "Area/cell")
fovstatsDf$Slide <- factor(fovstatsDf$Slide, levels=seq_along(metadataL), labels=names(slideCols))

3.3.1 Number of Counts/Features/Area acorss FOVs

3.3.1.1 nCount

ridgePlot(df = mSubDf, x = "nCount_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.2 nFeature

ridgePlot(df = mSubDf, x = "nFeature_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.3 Area

ridgePlot(df = mSubDf, x = "Area", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.2 Number of Counts/Features/Area per cell (per Cell)

3.3.2.1 nCount

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCount/cell", label = "nCount / cell", cols = slideCols)

3.3.2.2 nFeature

perFovPlot(df = fovstatsDf, x = "Slide", y = "nFeature/cell", label = "nFeature / cell", cols = slideCols)

3.3.2.3 Area

perFovPlot(df = fovstatsDf, x = "Slide", y = "Area/cell", label = "Area / cell", cols = slideCols)

3.3.2.4 No. cells per slide

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCells", label = "Number of cells", cols = slideCols)

3.3.3 Calculate residual standard deviation (RSD) in each FOV

rsdL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        slide <- rawFlatL[[idx]]

        mat <- t(as.matrix(slide@assays$Nanostring$counts)) # TO-DOs: optimization
        columnids <- data.frame(
                fov = sapply(str_split(rownames(mat), "_"), "[[", 2),
                cellid = sapply(str_split(rownames(mat), "_"), "[[", 1)
        )
        fovMat <- c()
        for (fov in sort(unique(columnids$fov))) {
                idx <- which(columnids$fov == fov)
                sumMat <- apply(mat[idx, ], 2, sum)
                fovMat <- rbind(fovMat, sumMat)
        }
        rownames(fovMat) <- sort(unique(columnids$fov))

        negIdx <- which(str_detect(colnames(fovMat), "^Neg"))
        if (length(negIdx) > 0) {
                fovMat <- fovMat[,-negIdx]
        }
        sysIdx <- which(str_detect(colnames(fovMat), "^Sys"))
        if (length(sysIdx) > 0) {
                fovMat <- fovMat[,-sysIdx]
        }
        chisq <- chisq.test(fovMat)
        residual <- chisq$residuals
        resSD <- apply(residual, 1, sd)

        return(data.frame(Slide = slidename, FOV =  names(resSD), RSD = as.numeric(resSD)))
})
names(rsdL) <- names(slideCols)
residualSD <- do.call("rbind", rsdL)
residualSD$Slide <- factor(residualSD$Slide, levels=names(slideCols))
  • RSD distribution per slide (each dot is a FOV)
perFovPlot(df = residualSD, x = "Slide", y = "RSD", label = "Residual SD", cols = slideCols)

3.3.4 Cumulative distributions of the Counts/Features/Area

  • A FOV with a smaller RSD is likely a problematic one due to (partial) out of focus blur with under segmentation (more than one cells segmented as one)
  • Note that RSD is not validated so reviewing the flagged FOV(s) is strongly recommended
  • The bottom 10% FOVs based on RSD marked in red
lowRsdFovs <- lapply(seq_along(rsdL), function(idx) {
        slidename <- names(rsdL)[idx]
        rsd <- rsdL[[idx]]
        return(rsd$FOV[which(rsd$RSD < quantile(rsd$RSD, rsdPerc))])
})

3.3.4.1 Count

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggCount <- ecdfPlot(df = tmpDf, x = "nCount_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggCount)
})
print(ecdfL)

3.3.4.2 Feature

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggFeature <- ecdfPlot(df = tmpDf, x = "nFeature_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggFeature)
})
print(ecdfL)

3.3.4.3 Area

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggArea <- ecdfPlot(df = tmpDf, x = "Area", y = "fov", cols = fovCols, label = slide)

        return(ggArea)
})
print(ecdfL)

3.4 AtoMx quality control (Step 3)

  • Load the Seurat Object with QC Flags
objFileName <- list.files(file.path(baseDir, paste0("SeuratObjectAfterQC_", projectTag)), pattern = ".RDS", full.names = T)
seuratObjfile <- readRDS(objFileName)
sMetadata <- seuratObjfile@meta.data
seuratObjfile
## An object of class Seurat 
## 21731 features across 48944 samples within 3 assays 
## Active assay: negprobes (50 features, 0 variable features)
##  2 layers present: counts, data
##  2 other assays present: falsecode, RNA
##  1 spatial field of view present: Pancreas
qcFlagsFOV <- sMetadata[which(sMetadata$qcFlagsFOV == "Fail"),]
qcFlagsCellCounts <- sMetadata[which(sMetadata$qcFlagsCellCounts == "Fail"),]
qcFlagsCellPropNeg <- sMetadata[which(sMetadata$qcFlagsCellPropNeg == "Fail"),]
qcFlagsCellArea <- sMetadata[which(sMetadata$qcFlagsCellArea == "Fail"),]
# qcFlagsRNACounts <- sMetadata[which(sMetadata$qcFlagsRNACounts == "Fail"),]
# qcFlagsCellComplex <- sMetadata[which(sMetadata$qcFlagsCellComplex == "Fail"),]

failMetadata <- sMetadata[Reduce(union, list(
        which(sMetadata$qcFlagsRNACounts == "Fail"), 
        which(sMetadata$qcFlagsCellCounts == "Fail"), 
        which(sMetadata$qcFlagsCellPropNeg == "Fail"), 
        which(sMetadata$qcFlagsCellComplex == "Fail"), 
        which(sMetadata$qcFlagsCellArea == "Fail"), 
        which(sMetadata$qcFlagsFOV == "Fail"))),
]

3.4.1 QC results on AtoMx

  • Criteria
    • FOV QC: FOVs with low expression. The MEAN method involves filtering out FOVs where the total count per cell averages below a threshold (default: 100)
    • Cell QC 1: Minimum counts per cell (default: 20)
    • Cell QC 2: Flags cells with negative counts greater than this proportion of total counts (default: 0.1)
    • Cell QC 3: Grubb's test p-value to flag outliers cells based on cell area (default: 0.01)
  • Results
    • Total: 0 / 48944 cell(s) excluded from 1 slide(s)
    • FOV QC: 0 excluded
    • Cell QC 1: 0 cell(s) excluded
    • Cell QC 2: 0 cell(s) excluded
    • Cell QC 3: 0 cell(s) excluded
  • Notes on QC (as of March 13th, 2024)
    • No cells were excluded - it’s either:
    • Nanostring does not have recommended QC threshold values for the 6k-plex and did not run QC
    • Nanostring excluded low qual data from both flat files and Seurat object before make them publicly available
qcDfL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        metadata <- metadataL[[idx]]
        
        metadata$Filter <- "Included"
        metadata$Filter[which(metadata$cell %in% rownames(failMetadata))] <- "Excluded"

        qcDf <- data.frame(
                id = metadata$cell,
                Slide = slidename,
                FOV = as.numeric(metadata$fov),
                CellId = as.numeric(metadata$cell_ID),
                nCount = as.numeric(metadata$nCount_RNA),
                nFeature = as.numeric(metadata$nFeature_RNA),
                Area = as.numeric(metadata$Area),
                Group = metadata$Filter
        )
        return(qcDf)
})

qcDf <- do.call("rbind", qcDfL)
qcDf <- qcDf[order(qcDf$FOV, qcDf$CellId, qcDf$Slide),]
qcDf$FOV <- factor(qcDf$FOV, levels=sort(unique(qcDf$FOV)))
qcDf$Group <- factor(qcDf$Group, levels=c("Included", "Excluded"))

3.4.1.1 Count and Feature

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.1.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$nFeature, col = subQcDf$Col, xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main=slide)
        plot(subDfInc$nCount, subDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main="")
        points(subDfExc$nCount, subDfExc$nFeature, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.1.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$nFeature+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$nFeature+1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$nFeature+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2 Count and Area

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.2.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$Area, col = subQcDf$Col, xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nCount, subDfInc$Area, col = "grey60", xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nCount, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.3 Feature and Area

  • Check out a typical plot and another one with log10(value + 1) transformation
3.4.1.3.1 Scatter plot
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(subQcDf$nFeature, subQcDf$Area, col = subQcDf$Col, xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nFeature, subDfInc$Area, col = "grey60", xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nFeature, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}

3.4.1.3.2 Log-scale
for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nFeature+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nFeature+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nFeature+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}