# Load libraries
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()
library(Seurat)
## Attaching SeuratObject
## Attaching sp
## 
## Attaching package: 'Seurat'
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays
library(RColorBrewer)
library(scDblFinder)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract
library(pheatmap)

# Define color palettes
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
  1. Creating the Seurat Object
# Data Wrangling Processes
VTcounts2 <- read.delim("raw_data_10x.txt")
VTcounts2[1:10, 1:10]

# We notice that the gene names are too lengthy
VTcounts2[,1] <- sub("_.*", "", VTcounts2[,1])

# check for duplicated gene names before setting row names
gene_dups <- VTcounts2[,1] %>% 
  duplicated()

# check the dimensions of the count matrix
dim(VTcounts2)

# filter out the duplicated genes
VTcounts2 <- VTcounts2[!gene_dups,]
dim(VTcounts2)

rownames(VTcounts2) <- NULL

# set the row names
VTcounts2 %<>% 
  column_to_rownames("Gene")

head(VTcounts2)

# Create the Seurat object
vtSeurat2 <- CreateSeuratObject(counts = VTcounts2)

# Let's save the Seurat object as an .rds file to facilitate analysis.
saveRDS(vtSeurat2, "vtSeurat2.rds")

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

Since we have the metadata for this dataset, let’s add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process.

vtSeurat2 <- readRDS("vtSeurat2.rds")
# View(vtSeurat2@meta.data)

# Check the dimensions of this object
dim(vtSeurat2)

# Load the cell annotation data
annotations <- read_delim(file = "meta_10x.txt")

# Check if the number of samples (cells) in vtSeurat2 is identical to the number of rows of this annotation file.

dim(annotations)[1] == dim(vtSeurat2)[2]

# make the annotations object compatible for data analysis.
anno_revised <- annotations %>% 
  separate(annotation, 
           into = c("Cluster", "Annotation"),
           sep = "\t") %>% 
  mutate(Location = final_cluster,
         Sample = location) %>% 
  select(-location, -final_cluster)

# Extract the metadata
meta_data.copy <- vtSeurat2@meta.data

# make new column that copies the rownames
meta_data.copy %<>% 
  mutate(Fetus = rownames(meta_data.copy))

# add the annotations!!!
meta_data.copy <- 
  inner_join(meta_data.copy, anno_revised, by = 'Fetus')

# Reassign the metadata
vtSeurat2@meta.data$Cluster <- meta_data.copy$Cluster
vtSeurat2@meta.data$Annotation <- meta_data.copy$Annotation
vtSeurat2@meta.data$Location <- meta_data.copy$Location
vtSeurat2@meta.data$Sample <- meta_data.copy$Sample

# Set the identity (classifier) of the cells based on the location

# Set the column "Location" as characters
vtSeurat2@meta.data$Location <- as.character(vtSeurat2@meta.data$Location)

Idents(object = vtSeurat2) <- "Location"
levels(Idents(vtSeurat2))


vtSeurat2@meta.data %>% 
  filter(Location == "Placenta")
# Subset the vtSeurat2 object to a smaller one.
vtSeurat2 <- subset(vtSeurat2, idents = "Placenta")

# The Seurat object became smaller.
dim(vtSeurat2)

# Let's save this object as another RDS file
saveRDS(vtSeurat2, "vtSeurat2_placenta.rds")
vtSeurat2 <- readRDS("vtSeurat2_placenta.rds")

# Add in the Mitochondrial PCT% information
vtSeurat2$percent.mt <- PercentageFeatureSet(vtSeurat2, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(vtSeurat2$nCount_RNA)

summary(vtSeurat2@meta.data$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2894    9097   15033   19055   24238  125759
# nFeature_RNA is the number of different genes that had any reads
hist(vtSeurat2$nFeature_RNA)

summary(vtSeurat2@meta.data$nFeature_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1086    2700    3584    3655    4500    8477
# percent.mt is the percent mitochondrial reads
summary(vtSeurat2$percent.mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
vtSeurat2[["percent.rb"]] <- PercentageFeatureSet(vtSeurat2, pattern = "^RP[SL]")
summary(vtSeurat2$percent.rb)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.671  22.224  27.515  27.348  32.371  59.986
dim(vtSeurat2)
## [1] 31743 18547

We now have a nice vtSeurat object that contains only cells from the placenta. Luckily, we still have 18547 cells, which would be sufficient for downstream analysis. Another important part for QC is removing doublets.

# Find for doublets
vtSeurat2 <- as.SingleCellExperiment(vtSeurat2)
vtSeurat2 <- scDblFinder(vtSeurat2)
## Creating ~14838 artificial doublets...
## Dimensional reduction
## Evaluating kNN...
## Training model...
## iter=0, 1629 cells excluded from training.
## iter=1, 2196 cells excluded from training.
## iter=2, 2256 cells excluded from training.
## Threshold found:0.35
## 1741 (9.4%) doublets called
# Convert back to Seurat object.
vtSeurat2 <- as.Seurat(vtSeurat2)

# Notice that our identifiers have changed.
levels(Idents(object = vtSeurat2))
## [1] "SingleCellExperiment"
# We want to specify the identifier whether the cell is a singlet or doublet.
table(vtSeurat2@meta.data$scDblFinder.class)
## 
## singlet doublet 
##   16806    1741
Idents(object = vtSeurat2) <- "scDblFinder.class"
vtSeurat2@meta.data$scDblFinder.class <- as.character(vtSeurat2@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
vtSeurat2 <- subset(vtSeurat2, idents = "singlet")
levels(as.factor(vtSeurat2@meta.data$scDblFinder.class))
## [1] "singlet"
# Check the dimensions of the dataset again.
dim(vtSeurat2)
## [1] 31743 16806

Now, note that we have 16779 cells that are derived from the placenta.

colnames(vtSeurat2@meta.data)
##  [1] "orig.ident"             "nCount_RNA"             "nFeature_RNA"          
##  [4] "Cluster"                "Annotation"             "Location"              
##  [7] "Sample"                 "percent.mt"             "percent.rb"            
## [10] "ident"                  "scDblFinder.class"      "scDblFinder.score"     
## [13] "scDblFinder.weighted"   "scDblFinder.cxds_score"
# convert column names to make the protocol coherent
colnames(vtSeurat2@meta.data)[2] <- "nCount_RNA"
colnames(vtSeurat2@meta.data)[3] <- "nFeature_RNA"

# Make a violin plot of the QC columns
plt <- VlnPlot(vtSeurat2, 
               features = c("nFeature_RNA", "nCount_RNA" ,"percent.rb"),
               ncol = 3,
               pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plt

ggsave(filename = "QC.png", plot = plt, width = 7, height = 3.5)

Looking at the violin plot, we do see some extreme values that should be filtered out. However, let’s put the subsetting part for later and proceed to next steps and come back later.

# Make scatter plots to compare
plot1 <- FeatureScatter(vtSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

# Looking at the scatterplot, we should subset to remove some outliers
# This is because further data analysis steps requires the assumption that the data has relatively moderate, homogeneous technical variants.
dim(vtSeurat2)
## [1] 31743 16806
# Let's perform some filtering!
vtSeurat2 <- subset(vtSeurat2, subset = percent.rb > 5)

Next, we need to normalize the data.

# Next, we normalize the data
vtSeurat2 <- NormalizeData(vtSeurat2)

# Explore the most variable features, let's use the default version first.
vtSeurat2 <- FindVariableFeatures(vtSeurat2)
# vtSeurat2 <- FindVariableFeatures(vtSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(vtSeurat2), 10) 
top10
##  [1] "PLA2G2A" "PSG3"    "PRG2"    "AREG"    "MMP12"   "CSH1"    "PSG5"   
##  [8] "PAPPA2"  "PSG2"    "CHI3L1"
# plot variable features with and without labels
plot2 <- VariableFeaturePlot(vtSeurat2)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6532 rows containing missing values (geom_point).

# Convert normalized gene expression to Z-score
all.genes <- rownames(vtSeurat2)
vtSeurat2 <- ScaleData(vtSeurat2, features = all.genes)
## Centering and scaling data matrix

Now, we are ready to run PCA.

# Run PCA analysis
vtSeurat2 <- RunPCA(vtSeurat2, features = VariableFeatures(object = vtSeurat2))
## PC_ 1 
## Positive:  KRT8, KRT7, GCSH, KRT19, CDKN1C, LY6E, HMGB3, CLIC3, MORF4L2, BZW2 
##     FHL2, ISM2, EFNA1, S100P, PRSS8, GPX3, PTN, MFAP5, CCNE1, IGF2 
##     GJA5, CDK7, LIMA1, PGF, GATA2, CYP11A1, PTPRF, FLT1, OPN3, BNIP3 
## Negative:  TMSB4X, FTL, FCGRT, TYROBP, FCER1G, LAPTM5, HLA-A, AIF1, CD68, ITGB2 
##     SH3BGRL, CTSS, LY96, GMFG, CD36, TREM2, SPI1, CD53, HCST, LST1 
##     GPSM3, HLA-B, LGALS9, C1QC, UCP2, SPP1, C1QA, VIM, RGS10, CD37 
## PC_ 2 
## Positive:  SAT1, HMGA1, NPL, CTSB, TYROBP, FTH1, S100A9, FCER1G, ARHGDIB, LAPTM5 
##     AIF1, KRT7, CREG1, CTSS, ITGB2, CD53, ENO1, TREM2, HCST, CD68 
##     SPI1, LCP1, PKM, LST1, PYCARD, CORO1A, C1QC, SPP1, ADAP2, LGALS9 
## Negative:  EGFL6, COL6A2, MEG3, DCN, COL6A1, DLK1, LUM, PCOLCE, HAPLN1, COL1A1 
##     COL3A1, PITX2, PLAC9, CXCL14, FRZB, CDH11, CTHRC1, COL1A2, WNT2, GPC3 
##     HGF, OLFML3, VCAN, SPARC, CCND2, SRPX, PLAGL1, MGP, ENPP1, HSD17B2 
## PC_ 3 
## Positive:  QSOX1, TGFB1, FSTL3, ITGA5, HAPLN3, C12orf75, ASCL2, DIO2, GLRX, C1QTNF6 
##     FXYD5, SLC16A3, MYCNUT, AIF1L, MT1X, PLAC8, GLIPR1, TPM1, ABCA7, COTL1 
##     HN1, HTRA4, MT1F, HLA-C, NOTUM, SLCO4A1, HLA-G, MT1E, AES, CA2 
## Negative:  SLC40A1, KRT23, BTG3, SLC26A2, SLC13A4, SLC52A1, GSTA3, INSL4, HSD17B1, HSPB8 
##     TMEM40, SOD3, CCR7, ARG2, CSTA, RP11-303E16.6, TBX3, SECISBP2L, SEPP1, SQSTM1 
##     TACSTD2, TIMP3, ST3GAL4, GADD45G, MUC15, HMGA1, SLC1A2, PRKCZ, NPL, CLDN3 
## PC_ 4 
## Positive:  S100P, PVRL4, GDF15, CLIC3, FLT1, CYP19A1, EPS8L1, TMEM40, PLIN2, INSL4 
##     SQSTM1, LYPD3, PRSS8, SAT1, MUC15, CYP11A1, PRKCZ, INHA, CEBPB, NUCB2 
##     EBI3, SERPINE1, GADD45G, SGK1, DHRS9, CGA, HERPUD1, DNAJB9, TBX3, MAFF 
## Negative:  TOP2A, NUSAP1, BIRC5, CCNA2, CDK1, AURKB, TYMS, HMGB2, TUBA1B, STMN1 
##     PRC1, UBE2C, CENPF, MKI67, CDC20, HMMR, PTTG1, PBK, CDCA3, GTSE1 
##     SMC4, HMGB1, KIAA0101, CDKN3, SPC25, TPX2, TK1, MAD2L1, PLK1, H2AFZ 
## PC_ 5 
## Positive:  LYVE1, F13A1, FOLR2, HPGDS, CD5L, EGFL7, LRRC39, IGFBP4, DAB2, IGF1 
##     SCN9A, MAF, MRC1, TMIGD3, C2, GAPLINC, FAM20A, SLC28A1, VSIG4, LILRB5 
##     C10orf128, METTL7B, CD14, GPR34, GAS6, ITM2B, FAM212A, SIGLEC1, ABCG2, FEZ1 
## Negative:  HLA-DRA, HLA-DPA1, HLA-DRB1, HLA-DPB1, LSP1, HLA-DQB1, HLA-DRB5, HLA-DQA1, CD52, CD74 
##     HLA-DMA, LILRB4, SDS, APOC1, FBP1, MSR1, LINC01272, CXCR4, LYZ, RGS1 
##     C1orf162, IL4I1, IFI30, HLA-DQA2, FPR3, SERPINA1, CD44, LPL, CD48, HAMP
# Plot the PCA
DimPlot(vtSeurat2, reduction = "pca")

# Plot the loadings
VizDimLoadings(vtSeurat2, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(vtSeurat2)

# FeaturePlot of PC1 top features
FeaturePlot(vtSeurat2, features = c("TMSB4X", 
                                   "FTL"))

# FeaturePlot of PC2 top features
FeaturePlot(vtSeurat2, features = c("EGFL6", 
                                   "COL6A2"))

Next, we move on to derive clusters.

# Now perform clustering
vtSeurat2 <- FindNeighbors(vtSeurat2, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
vtSeurat2 <- FindClusters(vtSeurat2, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 16804
## Number of edges: 534426
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9232
## Number of communities: 16
## Elapsed time: 2 seconds
# For visualization purposes, we need to generate UMAP reduced dimensionality representation
vtSeurat2 <- RunUMAP(vtSeurat2, dims = 1:10, verbose = F)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# Plot the UMAP with clustering
DimPlot(vtSeurat2, reduction = "umap")

# Get the TSNE embedding
vtSeurat2 <- RunTSNE(vtSeurat2, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(vtSeurat2, reduction = "tsne")

# Let's look at cluster sizes
table(vtSeurat2@meta.data$seurat_clusters)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2618 2044 1804 1793 1580 1225 1173 1020  896  797  555  456  370  223  142  108
# Let's plot those clusters!
DimPlot(vtSeurat2,label.size = 4,repel = T,label = T)

# Check expression patterns for some genes in top10
FeaturePlot(vtSeurat2, features = c("PLA2G2A",
                                   "MMP12",
                                   "AREG",
                                   "AOC1"))

Let’s calculate the cell cycle scores at this point.

vtSeurat2 <- SetIdent(vtSeurat2, value = "Annotation")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
vtSeurat2 <- CellCycleScoring(vtSeurat2,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)
## Warning: The following features are not present in the object: PIMREG, JPT1,
## attempting to find updated synonyms
## Warning: No updated symbols found
## Warning: The following features are still not present in the object: PIMREG,
## JPT1
table(vtSeurat2[[]]$Phase)
## 
##    G1   G2M     S 
## 10552  2725  3527

Around 1/5 of the cells are in the S phase.

# Check if ribosomal genes show dependency on cluster
FeaturePlot(vtSeurat2,features = "percent.rb",
            label.size = 4,
            repel = T,
            label = F) & 
  theme(plot.title = element_text(size=10))

# Check using the violin plot
VlnPlot(vtSeurat2,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

# Check # of genes and # of transcripts per cluster
VlnPlot(vtSeurat2,features = "nCount_RNA") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

VlnPlot(vtSeurat2,features = "nFeature_RNA") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(vtSeurat2,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(vtSeurat2, features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

In this case, it looks like some cells do show some cell cycle scores, which should be examined more carefully.

# Plot in UMAP
DimPlot(vtSeurat2, 
        group.by = "Phase")

# Plot with clusters
compTable.int <- table(vtSeurat2$Annotation, vtSeurat2$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase.

# Proceed to SCTransform
vtSeurat2 <- SCTransform(vtSeurat2, 
                       assay = "RNA", 
                       method = "glmGamPoi", 
                       ncells = 4367, 
                    vars.to.regress = c("percent.mt","S.Score","G2M.Score"),
                    verbose = F)

# View the object
vtSeurat2
## An object of class Seurat 
## 53977 features across 16804 samples within 2 assays 
## Active assay: SCT (22234 features, 3000 variable features)
##  1 other assay present: RNA
##  3 dimensional reductions calculated: pca, umap, tsne
GetAssay(vtSeurat2)
## SCTAssay data with 22234 features for 16804 cells, and 1 SCTModel(s) 
## Top 10 variable features:
##  SPP1, FTL, TMSB4X, CXCL14, CD74, DLK1, CGA, HLA-DRA, APOE, LUM
# Standard PCA, UMAP, and clustering
vtSeurat2 <- RunPCA(vtSeurat2, verbose = F)
vtSeurat2 <- RunUMAP(vtSeurat2, dims = 1:30, verbose = F)
vtSeurat2 <- FindNeighbors(vtSeurat2, dims = 1:30, verbose = F)
vtSeurat2 <- FindClusters(vtSeurat2, verbose = F)
table(vtSeurat2[[]]$seurat_clusters)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2158 2056 1635 1215 1203 1088  928  920  916  898  816  798  747  586  245  170 
##   16   17   18   19 
##  142  130  116   37
vtSeurat2 <- SetIdent(vtSeurat2, value = "Annotation")
DimPlot(vtSeurat2, label = T) + NoLegend()

# Check whether the SCTransform clustering and default clustering are identical
summary(vtSeurat2$SCT_snn_res.0.8 == vtSeurat2$seurat_clusters)
##    Mode    TRUE 
## logical   16804
FeaturePlot(vtSeurat2,"SPP1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

FeaturePlot(vtSeurat2,"CSH1") & 
  scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

# Designate the markers
all.markers <- FindAllMarkers(vtSeurat2,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)
## Calculating cluster dS1
## Calculating cluster VCT
## Calculating cluster dNK2
## Calculating cluster dNK1
## Calculating cluster dM2
## Calculating cluster dM1
## Calculating cluster Tcells
## Calculating cluster fFB1
## Calculating cluster EVT
## Calculating cluster NK CD16+
## Calculating cluster HB
## Calculating cluster SCT
## Calculating cluster dM3
## Calculating cluster MO
## Calculating cluster dNK p
## Calculating cluster Plasma
## Calculating cluster DC1
## Calculating cluster DC2
## Calculating cluster Granulocytes
## Calculating cluster ILC3
## Calculating cluster fFB2
## Calculating cluster Endo (m)
## Calculating cluster Endo L
## Calculating cluster Endo (f)
## Calculating cluster Epi1
## Calculating cluster Epi2
dim(all.markers)
## [1] 4227    7
table(all.markers$cluster)
## 
##          dS1          VCT         dNK2         dNK1          dM2          dM1 
##            0          220            0          244          228          188 
##       Tcells         fFB1          EVT     NK CD16+           HB          SCT 
##          175          225          265            0          313           84 
##          dM3           MO        dNK p       Plasma          DC1          DC2 
##          305          203            0          102            0          203 
## Granulocytes         ILC3         fFB2     Endo (m)       Endo L     Endo (f) 
##          140            0          264          268          269          243 
##         Epi1         Epi2 
##            0          288
top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
# We can check the dataframe briefly 
head(top3_markers)
##           p_val avg_log2FC pct.1 pct.2     p_val_adj cluster  gene
## 1  0.000000e+00   1.250532 0.999 0.573  0.000000e+00     VCT PEG10
## 2  0.000000e+00   1.235704 0.995 0.518  0.000000e+00     VCT SMAGP
## 3  0.000000e+00   1.223374 0.999 0.743  0.000000e+00     VCT PAGE4
## 4 1.396813e-297   2.403480 1.000 0.003 3.105673e-293    dNK1  GZMA
## 5  3.710563e-62   2.402064 1.000 0.018  8.250066e-58    dNK1  NKG7
## 6  1.895328e-36   3.051002 1.000 0.032  4.214071e-32    dNK1  GNLY

Clustering based on Cell Type annotation

DimPlot(vtSeurat2, label = T , repel = T, label.size = 3) + NoLegend()
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

DimPlot(vtSeurat2, reduction = "pca")

DimPlot(vtSeurat2, reduction = "tsne")

DimPlot(vtSeurat2, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(vtSeurat2, "vtSeurat2_annotated_SCT.rds")

Still Work in Progress

# Pseudotime
# Convert to monocle3 cell_data_set
vtSeurat2.cds <- as.cell_data_set(vtSeurat2)
class(vtSeurat2.cds)

# Re-cluster (required for trajectories in monocle3)
vtSeurat2.cds <- cluster_cells(vtSeurat2.cds)

# Learn the trajectory
vtSeurat2.cds <- learn_graph(vtSeurat2.cds)

# Plot the trajectory
plot_cells(vtSeurat2.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(vtSeurat2, group.by = "Phase")
vtSeurat2.cds <- order_cells(vtSeurat2.cds)

# Plot pseudotime
plot_cells(vtSeurat2.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
vtSeurat2$pseudotime_score <- vtSeurat2.cds@principal_graph_aux@listData$UMAP$pseudotime
vtSeurat2$pseudotime_score[vtSeurat2$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(vtSeurat2, features = c("pseudotime_score")) 
DimPlot(vtSeurat2, group.by = "Annotation")
FeaturePlot(vtSeurat2, features = c("pseudotime_score"))