1 Introduction

This report includes stats and diagnostic plots of the Nanostring CosMx Spatial Molecular Imager (SMI) data.

Nanostring generously shared a Pancreas data which was generated using CosMx SMI with the CosMx Human Whole Transcriptome panel (18,946 plex, pre-commercial version) and I downloaded the data from the Nanostring Web to generate the following QC report.

2 Prerequisite

2.1 Environment

OS: macOS Sonoma 14.0
Platform: aarch64-apple-darwin20 (64-bit)
Software: R (v4.3.1)
Pakcages: Seurat (v5.0.1), ggplot2 (v3.5.0), ggridges (v0.5.4), ggpubr (v0.6.0), stringr (v1.5.1), reshape2 (v1.4.4)

2.2 CosMx SMI Data: Pancreas

Slide: Pancreas_1 ◼
#FOVs: 18
#Cells: 48,944
#Probes: 18,946 probes, 50 negative probes, and 2,735 system probes

3 Preprocessing/QC

3.1 Preparation

Set working directory to load files

# Do not run
baseDir <- "WORKING_DIRECTORY" # where the CosMx folders/files are located
setwd(baseDir)

suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(RColorBrewer))

3.2 Load files (Step 1)

if (file.exists(file.path(outDir, "01_rawFlats.CosMx.RDS"))) {
        rawFlatL <- readRDS(file.path(outDir, "01_rawFlats.CosMx.RDS"))
} else {
        rawFlatL <- lapply(seq_along(slideOrder), function(idx) {
                slidePath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx])
                # LoadNanostring function was imported from the Seurat package
                slideObj <- LoadNanostring(data.dir = slidePath, fov = slideOrder[idx])
                return(slideObj)
        })
        names(rawFlatL) <- slideOrder
        saveRDS(rawFlatL, file.path(outDir, "01_rawFlats.CosMx.RDS"))
}
rawFlatL

## $Pancreas_1
## An object of class Seurat 
## 21731 features across 48944 samples within 1 assay 
## Active assay: Nanostring (21731 features, 0 variable features)
##  1 layer present: counts
##  1 spatial field of view present: Pancreas_1

if (file.exists(file.path(outDir, "02_metadata.CosMx.RDS"))) {
        metadataL <- readRDS(file.path(outDir, "02_metadata.CosMx.RDS"))
} else {
        metadataL <- lapply(seq_along(slideOrder), function(idx) {
                metadataPath <- file.path(baseDir, paste0("FlatFiles_", projectTag), slideOrder[idx], paste0(slideOrder[idx], "_metadata_file.csv.gz"))                
                metadata <- read.csv(metadataPath, header=T)
                return(metadata)
        })
        names(metadataL) <- slideOrder
        saveRDS(metadataL, file.path(outDir, "02_metadata.CosMx.RDS"))
}

3.3 Raw data quality check (Step 2)

metadataDf <- do.call("rbind", metadataL)
metadataDf <- metadataDf[order(metadataDf$fov, metadataDf$cell_id, metadataDf$slide_ID),]
metadataDf$fov <- factor(metadataDf$fov, levels=unique(metadataDf$fov))
mSubDf <- metadataDf[,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]

fovstatsL <- lapply(seq_along(metadataL), function(idx) {
        metadata <- metadataL[[idx]][,c("nCount_RNA", "nFeature_RNA", "Area", "fov", "Run_Tissue_name")]
        fovstats <- c()
        for (fov in unique(metadata$fov)) {
                buff <- metadata[which(metadata$fov == fov),]
                ncells <- nrow(buff)
                ncount <- sum(buff$nCount_RNA) / ncells
                nfeature <- sum(buff$nFeature_RNA) / ncells
                area <- sum(buff$Area) / ncells
                fovstats <- rbind(fovstats, c(idx, fov, ncells, ncount, nfeature, area))
        }
        return(fovstats)
})
fovstatsDf <- do.call("rbind", fovstatsL)
fovstatsDf <- apply(fovstatsDf, 2, as.numeric)
fovstatsDf <- as.data.frame(fovstatsDf)
colnames(fovstatsDf) <- c("Slide", "FOV", "nCells", "nCount/cell", "nFeature/cell", "Area/cell")
fovstatsDf$Slide <- factor(fovstatsDf$Slide, levels=seq_along(metadataL), labels=names(slideCols))

3.3.1 Number of Counts/Features/Area acorss FOVs

3.3.1.1 nCount

ridgePlot(df = mSubDf, x = "nCount_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.2 nFeature

ridgePlot(df = mSubDf, x = "nFeature_RNA", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.1.3 Area

ridgePlot(df = mSubDf, x = "Area", y = "fov", by = "Run_Tissue_name", cols = slideCols)

3.3.2 Number of Counts/Features/Area per cell (per Cell)

3.3.2.1 nCount

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCount/cell", label = "nCount / cell", cols = slideCols)

3.3.2.2 nFeature

perFovPlot(df = fovstatsDf, x = "Slide", y = "nFeature/cell", label = "nFeature / cell", cols = slideCols)

3.3.2.3 Area

perFovPlot(df = fovstatsDf, x = "Slide", y = "Area/cell", label = "Area / cell", cols = slideCols)

3.3.2.4 No. cells per slide

perFovPlot(df = fovstatsDf, x = "Slide", y = "nCells", label = "Number of cells", cols = slideCols)

3.3.3 Calculate residual standard deviation (RSD) in each FOV

rsdL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        slide <- rawFlatL[[idx]]

        mat <- t(as.matrix(slide@assays$Nanostring$counts)) # TO-DOs: optimization
        columnids <- data.frame(
                fov = sapply(str_split(rownames(mat), "_"), "[[", 2),
                cellid = sapply(str_split(rownames(mat), "_"), "[[", 1)
        )
        fovMat <- c()
        for (fov in sort(unique(columnids$fov))) {
                idx <- which(columnids$fov == fov)
                sumMat <- apply(mat[idx, ], 2, sum)
                fovMat <- rbind(fovMat, sumMat)
        }
        rownames(fovMat) <- sort(unique(columnids$fov))

        negIdx <- which(str_detect(colnames(fovMat), "^Neg"))
        if (length(negIdx) > 0) {
                fovMat <- fovMat[,-negIdx]
        }
        sysIdx <- which(str_detect(colnames(fovMat), "^Sys"))
        if (length(sysIdx) > 0) {
                fovMat <- fovMat[,-sysIdx]
        }
        chisq <- chisq.test(fovMat)
        residual <- chisq$residuals
        resSD <- apply(residual, 1, sd)

        return(data.frame(Slide = slidename, FOV =  names(resSD), RSD = as.numeric(resSD)))
})
names(rsdL) <- names(slideCols)
residualSD <- do.call("rbind", rsdL)
residualSD$Slide <- factor(residualSD$Slide, levels=names(slideCols))

RSD distribution per slide (each dot is a FOV)

perFovPlot(df = residualSD, x = "Slide", y = "RSD", label = "Residual SD", cols = slideCols)

3.3.4 Cumulative distributions of the Counts/Features/Area

A FOV with a smaller RSD is likely a problematic one due to (partial) out of focus blur with under segmentation (more than one cells segmented as one)
Note that RSD is not validated so reviewing the flagged FOV(s) is strongly recommended
The bottom 10% FOVs based on RSD marked in red

lowRsdFovs <- lapply(seq_along(rsdL), function(idx) {
        slidename <- names(rsdL)[idx]
        rsd <- rsdL[[idx]]
        return(rsd$FOV[which(rsd$RSD < quantile(rsd$RSD, rsdPerc))])
})

3.3.4.1 Count

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggCount <- ecdfPlot(df = tmpDf, x = "nCount_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggCount)
})
print(ecdfL)

3.3.4.2 Feature

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggFeature <- ecdfPlot(df = tmpDf, x = "nFeature_RNA", y = "fov", cols = fovCols, label = slide)

        return(ggFeature)
})
print(ecdfL)

3.3.4.3 Area

ecdfL <- lapply(seq_along(rsdL), function(idx) {
        slide <- names(rsdL)[idx]
        tmpDf <- mSubDf[which(mSubDf$Run_Tissue_name == slide),]
        fovCols <- rep("grey60", length(sort(unique(tmpDf$fov))))
        names(fovCols) <- sort(unique(tmpDf$fov))
        fovCols[which(names(fovCols) %in% lowRsdFovs[[idx]])] <- "red"

        ggArea <- ecdfPlot(df = tmpDf, x = "Area", y = "fov", cols = fovCols, label = slide)

        return(ggArea)
})
print(ecdfL)

3.4 AtoMx quality control (Step 3)

Load the Seurat Object with QC Flags

objFileName <- list.files(file.path(baseDir, paste0("SeuratObjectAfterQC_", projectTag)), pattern = ".RDS", full.names = T)
seuratObjfile <- readRDS(objFileName)
sMetadata <- seuratObjfile@meta.data
seuratObjfile

## An object of class Seurat 
## 21731 features across 48944 samples within 3 assays 
## Active assay: negprobes (50 features, 0 variable features)
##  2 layers present: counts, data
##  2 other assays present: falsecode, RNA
##  1 spatial field of view present: Pancreas

qcFlagsFOV <- sMetadata[which(sMetadata$qcFlagsFOV == "Fail"),]
qcFlagsCellCounts <- sMetadata[which(sMetadata$qcFlagsCellCounts == "Fail"),]
qcFlagsCellPropNeg <- sMetadata[which(sMetadata$qcFlagsCellPropNeg == "Fail"),]
qcFlagsCellArea <- sMetadata[which(sMetadata$qcFlagsCellArea == "Fail"),]
# qcFlagsRNACounts <- sMetadata[which(sMetadata$qcFlagsRNACounts == "Fail"),]
# qcFlagsCellComplex <- sMetadata[which(sMetadata$qcFlagsCellComplex == "Fail"),]

failMetadata <- sMetadata[Reduce(union, list(
        which(sMetadata$qcFlagsRNACounts == "Fail"), 
        which(sMetadata$qcFlagsCellCounts == "Fail"), 
        which(sMetadata$qcFlagsCellPropNeg == "Fail"), 
        which(sMetadata$qcFlagsCellComplex == "Fail"), 
        which(sMetadata$qcFlagsCellArea == "Fail"), 
        which(sMetadata$qcFlagsFOV == "Fail"))),
]

3.4.1 QC results on AtoMx

Criteria
- FOV QC: FOVs with low expression. The MEAN method involves filtering out FOVs where the total count per cell averages below a threshold (default: 100)
- Cell QC 1: Minimum counts per cell (default: 20)
- Cell QC 2: Flags cells with negative counts greater than this proportion of total counts (default: 0.1)
- Cell QC 3: Grubb's test p-value to flag outliers cells based on cell area (default: 0.01)
Results
- Total: 0 / 48944 cell(s) excluded from 1 slide(s)
- FOV QC: 0 excluded
- Cell QC 1: 0 cell(s) excluded
- Cell QC 2: 0 cell(s) excluded
- Cell QC 3: 0 cell(s) excluded
Notes on QC (as of March 13th, 2024)
- No cells were excluded - it’s either:
- Nanostring does not have recommended QC threshold values for the 6k-plex and did not run QC
- Nanostring excluded low qual data from both flat files and Seurat object before make them publicly available

qcDfL <- lapply(seq_along(rawFlatL), function(idx) {
        slidename <- names(rawFlatL)[idx]
        metadata <- metadataL[[idx]]
        
        metadata$Filter <- "Included"
        metadata$Filter[which(metadata$cell %in% rownames(failMetadata))] <- "Excluded"

        qcDf <- data.frame(
                id = metadata$cell,
                Slide = slidename,
                FOV = as.numeric(metadata$fov),
                CellId = as.numeric(metadata$cell_ID),
                nCount = as.numeric(metadata$nCount_RNA),
                nFeature = as.numeric(metadata$nFeature_RNA),
                Area = as.numeric(metadata$Area),
                Group = metadata$Filter
        )
        return(qcDf)
})

qcDf <- do.call("rbind", qcDfL)
qcDf <- qcDf[order(qcDf$FOV, qcDf$CellId, qcDf$Slide),]
qcDf$FOV <- factor(qcDf$FOV, levels=sort(unique(qcDf$FOV)))
qcDf$Group <- factor(qcDf$Group, levels=c("Included", "Excluded"))

3.4.1.1 Count and Feature

Check out a typical plot and another one with log10(value + 1) transformation

3.4.1.1.1 Scatter plot

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$nFeature, col = subQcDf$Col, xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main=slide)
        plot(subDfInc$nCount, subDfInc$nFeature, col = "grey60", xlab = "nCount", ylab = "nFeature", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(0, max(qcDf$nFeature)), main="")
        points(subDfExc$nCount, subDfExc$nFeature, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.1.2 Log-scale

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 1)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$nFeature+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$nFeature+1), col = "grey60", xlab = "nCount, log10", ylab = "nFeature, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(0, max(log10(qcDf$nFeature+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$nFeature+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2 Count and Area

Check out a typical plot and another one with log10(value + 1) transformation

3.4.1.2.1 Scatter plot

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(subQcDf$nCount, subQcDf$Area, col = subQcDf$Col, xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nCount, subDfInc$Area, col = "grey60", xlab = "nCount", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nCount)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nCount, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.2.2 Log-scale

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 2)

        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nCount+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nCount+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nCount, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nCount+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nCount+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")
}

3.4.1.3 Feature and Area

Check out a typical plot and another one with log10(value + 1) transformation

3.4.1.3.1 Scatter plot

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(subQcDf$nFeature, subQcDf$Area, col = subQcDf$Col, xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main=slide)
        plot(subDfInc$nFeature, subDfInc$Area, col = "grey60", xlab = "nFeature", ylab = "Area", pch=20, xlim = c(0, max(qcDf$nFeature)), ylim = c(min(qcDf$Area, na.rm=F), max(qcDf$Area)), main="")
        points(subDfExc$nFeature, subDfExc$Area, col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}

3.4.1.3.2 Log-scale

for (slide in names(slideCols)) {
        subQcDf <- qcDf[which(qcDf$Slide == slide),]
        subDfInc <- subQcDf[which(subQcDf$Group == "Included"),]
        subDfExc <- subQcDf[which(subQcDf$Group == "Excluded"),]
        subQcDf <- densityColors(df = subQcDf, cols = heatCols, option = 3)
        
        par(mfrow=c(1, 2))
        plot(log10(subQcDf$nFeature+1), log10(subQcDf$Area+1), col = subQcDf$Col, xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main=slide)
        plot(log10(subDfInc$nFeature+1), log10(subDfInc$Area+1), col = "grey60", xlab = "nFeature, log10", ylab = "Area, log10", pch=20, xlim = c(0, max(log10(qcDf$nFeature+1))), ylim = c(min(log10(qcDf$Area+1), na.rm=F), max(log10(qcDf$Area+1))), main="")
        points(log10(subDfExc$nFeature+1), log10(subDfExc$Area+1), col = "red", pch=20)
        legend("topleft", legend=c("Included", "Excluded"), fill=c("grey60", "red"), bty="n")        
}

Pancreas

Heewon Seo

March 13, 2024