# read PBMC data in 10x (sparse matrix) format barcodes <- read.csv("PBMC-10x/barcodes.tsv", sep="\t", stringsAsFactors=F, header=F)[[1]] genes <- read.csv("PBMC-10x/genes.tsv", sep="\t", stringsAsFactors=F, header=F) matr <- read.csv("PBMC-10x/matrix.mtx", sep=" ", stringsAsFactors=F, skip=3, header=F) counts <- matrix(0, nrow=nrow(genes), ncol=length(barcodes), dimnames=list(genes[[1]], barcodes)) for (i in 1:nrow(matr)) counts[matr[i,1],matr[i,2]] <- matr[i,3] ensembl2name <- setNames(genes[[2]], genes[[1]]) # enables us to map Ensembl IDs to HUGO symbols # sum the UMI counts in each cell and plot umi.tot <- colSums(counts) hist(umi.tot, breaks=100) plot(sort(umi.tot, decreasing=T)) # extremely high UMI counts might indicate doublets abline(h=7500, col="red") abline(v=sum(umi.tot >= 7500), col="red") bad.high <- umi.tot > 7500 # do the same but at the gene level assuming that UMI count > 0 implies the gene is expressed gene.tot <- colSums(counts > 0) hist(gene.tot, breaks=100) plot(sort(gene.tot, decreasing=T)) # we can be a bit more stringent and require UMI count > 1 for a gene to be considered expressed gene.tot <- colSums(counts > 1) hist(gene.tot, breaks=100) plot(sort(gene.tot, decreasing=T)) # low complexity (low gene count) indicates poorly sequenced cells abline(h=200, col="red") abline(v=sum(gene.tot >= 200), col="red") bad.low <- gene.tot < 200 # high mitochondrial gene % is likely to indicate dead cells library(AnnotationDbi) library(org.Hs.eg.db) library(EnsDb.Hsapiens.v86) ensdb.genes <- genes(EnsDb.Hsapiens.v86) MT.names <- ensdb.genes[seqnames(ensdb.genes) == "MT"]$gene_id mito.pc <- colSums(counts[rownames(counts) %in% MT.names,]) / umi.tot*100 hist(mito.pc, breaks=50) # we eliminate high UMI counts, low complexity, and high MT gene % bad <- mito.pc > 7 | bad.low | bad.high plot(x=umi.tot, y=mito.pc, col="royalblue1", pch=20, main="UMI vesus MT%") lines(x=umi.tot[bad], y=mito.pc[bad], col="orange", pch=20, type="p") plot(x=gene.tot, y=mito.pc, col="royalblue1", pch=20, main="Complexity vesus MT%") lines(x=gene.tot[bad], y=mito.pc[bad], col="orange", pch=20, type="p") counts <- counts[,!bad] # we eliminate rows in the count matrix that contain not or barely expressed genes good <- rowSums(counts > 1) >= 0.01*ncol(counts) # UMI count > 1 in at least 1% of the cells is required sum(good) counts <- counts[good,] dim(counts) # elementary normalization ncounts <- log2(1 + sweep(counts, 2, colSums(counts), "/")*1e5) ncounts[1:5, 1:10] # PCA to reduce dimensionality pca <- prcomp(t(ncounts), scale.=T) plot(y=(pca$sdev[1:20])**2, x=1:20, pch=19, main="Variances", xlab="Principal component", ylab="Variance") plot(x=pca$x[,1], y=pca$x[,2], pch=20) # t-SNE from PCA-reduced data library(Rtsne) rt <- Rtsne(pca$x[,1:10]) plot(x=rt$Y[,1], y=rt$Y[,2], pch=20) # clustering to discover cell populations n.clust <- 8 km <- kmeans(pca$x[,1:10], n.clust) cluster.cols <- rainbow(n.clust) plot(x=rt$Y[,1], y=rt$Y[,2], pch=20, col=cluster.cols[km$cluster]) legend(x="bottomright", legend=1:n.clust, pch=20, col=cluster.cols, bty="n") # more advanced normalization library(sctransform) vst.out <- vst(counts, return_cell_attr=T) ncounts.vst <- correct(vst.out) pca.vst <- prcomp(t(ncounts.vst), scale.=T) km.vst <- kmeans(pca.vst$x[,1:10], n.clust) rt.vst <- Rtsne(pca.vst$x[,1:10]) plot(x=rt.vst$Y[,1], y=rt.vst$Y[,2], pch=20, col=cluster.cols[km.vst$cluster]) legend(x="bottomright", legend=1:n.clust, pch=20, col=cluster.cols, bty="n")