# read PBMC data in 10x (sparse matrix) format
barcodes <- read.csv("PBMC-10x/barcodes.tsv", sep="\t", stringsAsFactors=F, header=F)[[1]]
genes <- read.csv("PBMC-10x/genes.tsv", sep="\t", stringsAsFactors=F, header=F)
matr <- read.csv("PBMC-10x/matrix.mtx", sep=" ", stringsAsFactors=F, skip=3, header=F)
counts <- matrix(0, nrow=nrow(genes), ncol=length(barcodes), dimnames=list(genes[[1]], barcodes))
for (i in 1:nrow(matr))
  counts[matr[i,1],matr[i,2]] <- matr[i,3]
ensembl2name <- setNames(genes[[2]], genes[[1]]) # enables us to map Ensembl IDs to HUGO symbols

# sum the UMI counts in each cell and plot
umi.tot <- colSums(counts)
hist(umi.tot, breaks=100)
plot(sort(umi.tot, decreasing=T))

# extremely high UMI counts might indicate doublets
abline(h=7500, col="red")
abline(v=sum(umi.tot >= 7500), col="red")
bad.high <- umi.tot > 7500

# do the same but at the gene level assuming that UMI count > 0 implies the gene is expressed
gene.tot <- colSums(counts > 0)
hist(gene.tot, breaks=100)
plot(sort(gene.tot, decreasing=T))

# we can be a bit more stringent and require UMI count > 1 for a gene to be considered expressed
gene.tot <- colSums(counts > 1)
hist(gene.tot, breaks=100)
plot(sort(gene.tot, decreasing=T))

# low complexity (low gene count) indicates poorly sequenced cells
abline(h=200, col="red")
abline(v=sum(gene.tot >= 200), col="red")
bad.low <- gene.tot < 200

# high mitochondrial gene % is likely to indicate dead cells
library(AnnotationDbi)
library(org.Hs.eg.db)
library(EnsDb.Hsapiens.v86)
ensdb.genes <- genes(EnsDb.Hsapiens.v86)
MT.names <- ensdb.genes[seqnames(ensdb.genes) == "MT"]$gene_id
mito.pc <- colSums(counts[rownames(counts) %in% MT.names,]) / umi.tot*100
hist(mito.pc, breaks=50)

# we eliminate high UMI counts, low complexity, and high MT gene % 
bad <- mito.pc > 7 | bad.low | bad.high
plot(x=umi.tot, y=mito.pc, col="royalblue1", pch=20, main="UMI vesus MT%")
lines(x=umi.tot[bad], y=mito.pc[bad], col="orange", pch=20, type="p")
plot(x=gene.tot, y=mito.pc, col="royalblue1", pch=20, main="Complexity vesus MT%")
lines(x=gene.tot[bad], y=mito.pc[bad], col="orange", pch=20, type="p")
counts <- counts[,!bad]

# we eliminate rows in the count matrix that contain not or barely expressed genes
good <- rowSums(counts > 1) >= 0.01*ncol(counts) # UMI count > 1 in at least 1% of the cells is required
sum(good)
counts <- counts[good,]
dim(counts)

# elementary normalization
ncounts <- log2(1 + sweep(counts, 2, colSums(counts), "/")*1e5)
ncounts[1:5, 1:10]

# PCA to reduce dimensionality
pca <- prcomp(t(ncounts), scale.=T)
plot(y=(pca$sdev[1:20])**2, x=1:20, pch=19, main="Variances", xlab="Principal component", ylab="Variance")
plot(x=pca$x[,1], y=pca$x[,2], pch=20)

# t-SNE from PCA-reduced data
library(Rtsne)
rt <- Rtsne(pca$x[,1:10])
plot(x=rt$Y[,1], y=rt$Y[,2], pch=20)

# clustering to discover cell populations
n.clust <- 8
km <- kmeans(pca$x[,1:10], n.clust)
cluster.cols <- rainbow(n.clust)
plot(x=rt$Y[,1], y=rt$Y[,2], pch=20, col=cluster.cols[km$cluster])
legend(x="bottomright", legend=1:n.clust, pch=20, col=cluster.cols, bty="n")

# more advanced normalization
library(sctransform)
vst.out <- vst(counts, return_cell_attr=T)
ncounts.vst <- correct(vst.out)
pca.vst <- prcomp(t(ncounts.vst), scale.=T)
km.vst <- kmeans(pca.vst$x[,1:10], n.clust)
rt.vst <- Rtsne(pca.vst$x[,1:10])
plot(x=rt.vst$Y[,1], y=rt.vst$Y[,2], pch=20, col=cluster.cols[km.vst$cluster])
legend(x="bottomright", legend=1:n.clust, pch=20, col=cluster.cols, bty="n")