# Load libraries
library(SingleCellExperiment)

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()

library(Seurat)

## Attaching SeuratObject
## Attaching sp
## 
## Attaching package: 'Seurat'
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays

library(RColorBrewer)
library(scDblFinder)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

library(Matrix)

## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## The following object is masked from 'package:S4Vectors':
## 
##     expand

library(GEOquery)

## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)

library(pheatmap)

# library(monocle3)
# library(SeuratWrappers)

Creating the Seurat Object

# Data Wrangling Processes
# sCounts <- read.delim("Rockefeller.expression.txt", header = TRUE, sep = "\t", dec = ".")
# head(sCounts)

# Warning: This count matrix only contains the genes related to COVID-19. We should use the loom file that contains all the genes, and filter out the cells that are annotated (consistent with the annotations file)
  
sAnnotations <- read.delim("Rockefeller.meta.txt")
sAnnotations <- sAnnotations[-1,]

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

Since we have the metadata for this dataset, let’s add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process.

# Load the RDS file
sSeurat <- readRDS("seuratObj.rds")

# Check the dimensions of this object
dim(sSeurat)

## [1]  58347 182605

# Please note that the number of cells in this dataset is much more than that reported in the Suryawanshi paper.
# Also, the annotations file only covers the cells relevant to the paper.

# We will make the names identical with our Seurat object
sAnnotations$NAME <- substr(sAnnotations$NAME, 21, 36)

overlaps <- sAnnotations %>% 
  group_by(NAME) %>% 
  count() %>% 
  filter(n > 1) %>% 
  pull(NAME)

# There are overlaps in the UMI in the annotations. We should filter out the overlaps
sAnno_no_overlaps <- sAnnotations[!duplicated(sAnnotations$NAME),]

# Check if the annotated cells are contained within the Seurat object
(sAnno_no_overlaps$NAME %in% colnames(sSeurat)) %>% 
  summary()

##    Mode    TRUE 
## logical   14213

# Extract the metadata and copy it.
meta_data.copy <- sSeurat@meta.data

# Check if metadata rownames are unique.
rownames(sSeurat@meta.data) %>%
  unique() %>% 
  length()

## [1] 182605

# Make a copy of annotations data
annotations.copy <- sAnno_no_overlaps %>% 
  select(NAME, CellType)

meta_data.copy <- meta_data.copy %>% 
  rownames_to_column()

meta_data.copy <- meta_data.copy %>% 
  mutate(NAME = rowname)


# render the metadata to include the annotation data
meta_data.copy <- left_join(meta_data.copy, annotations.copy)

## Joining, by = "NAME"

meta_data.copy <- meta_data.copy %>% 
  as.data.frame()

rownames(meta_data.copy) <- meta_data.copy$rowname
dim(meta_data.copy)

## [1] 182605     48

sSeurat@meta.data <- meta_data.copy

sSeurat2 <- sSeurat[,!is.na(sSeurat@meta.data$CellType)]

# Check the dimension of the new Seurat object
dim(sSeurat2)

## [1] 58347 14213

# check the metadata
colnames(sSeurat2@meta.data)

##  [1] "rowname"                                               
##  [2] "orig.ident"                                            
##  [3] "nCount_RNA"                                            
##  [4] "nFeature_RNA"                                          
##  [5] "antisense_reads"                                       
##  [6] "cell_barcode_fraction_bases_above_30_mean"             
##  [7] "cell_barcode_fraction_bases_above_30_variance"         
##  [8] "cell_names"                                            
##  [9] "duplicate_reads"                                       
## [10] "emptydrops_FDR"                                        
## [11] "emptydrops_IsCell"                                     
## [12] "emptydrops_Limited"                                    
## [13] "emptydrops_LogProb"                                    
## [14] "emptydrops_PValue"                                     
## [15] "emptydrops_Total"                                      
## [16] "fragments_per_molecule"                                
## [17] "fragments_with_single_read_evidence"                   
## [18] "genes_detected_multiple_observations"                  
## [19] "genomic_read_quality_mean"                             
## [20] "genomic_read_quality_variance"                         
## [21] "genomic_reads_fraction_bases_quality_above_30_mean"    
## [22] "genomic_reads_fraction_bases_quality_above_30_variance"
## [23] "input_id"                                              
## [24] "molecule_barcode_fraction_bases_above_30_mean"         
## [25] "molecule_barcode_fraction_bases_above_30_variance"     
## [26] "molecules_with_single_read_evidence"                   
## [27] "n_fragments"                                           
## [28] "n_genes"                                               
## [29] "n_mitochondrial_genes"                                 
## [30] "n_mitochondrial_molecules"                             
## [31] "n_molecules"                                           
## [32] "n_reads"                                               
## [33] "noise_reads"                                           
## [34] "pct_mitochondrial_molecules"                           
## [35] "perfect_cell_barcodes"                                 
## [36] "perfect_molecule_barcodes"                             
## [37] "reads_mapped_exonic"                                   
## [38] "reads_mapped_intergenic"                               
## [39] "reads_mapped_intronic"                                 
## [40] "reads_mapped_multiple"                                 
## [41] "reads_mapped_too_many_loci"                            
## [42] "reads_mapped_uniquely"                                 
## [43] "reads_mapped_utr"                                      
## [44] "reads_per_fragment"                                    
## [45] "reads_unmapped"                                        
## [46] "spliced_reads"                                         
## [47] "NAME"                                                  
## [48] "CellType"

# Add in the Mitochondrial PCT% information
sSeurat2$percent.mt <- PercentageFeatureSet(sSeurat2, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(sSeurat2$nCount_RNA)

summary(sSeurat2@meta.data$nCount_RNA)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      58    1636    4421    9205   12941   86803

# nFeature_RNA is the number of different genes that had any reads
hist(sSeurat2$nFeature_RNA)

summary(sSeurat2@meta.data$nFeature_RNA)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      32     796    1428    1913    2853    6795

# percent.mt is the percent mitochondrial reads
summary(sSeurat2$percent.mt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.183   2.036   2.938   3.520  87.521

# We see that the mitochondrial genes were removed.

# Check for the percentage of ribosomal genes
sSeurat2[["percent.rb"]] <- PercentageFeatureSet(sSeurat2, pattern = "^RP[SL]")
summary(sSeurat2$percent.rb)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8833 20.0000 26.4393 25.4874 31.5022 65.5154

dim(sSeurat2)

## [1] 58347 14213

We now have a sSeurat2 object that has 14213 cells, which would be sufficient for downstream analysis. Another important part for QC is removing doublets.

# Find for doublets
sSeurat2 <- as.SingleCellExperiment(sSeurat2)
sSeurat2 <- scDblFinder(sSeurat2)

## Warning in .checkSCE(sce): Some cells in `sce` have an extremely low read
## counts; note that these could trigger errors and might best be filtered out

## Creating ~11371 artificial doublets...

## Dimensional reduction

## Evaluating kNN...

## Training model...

## iter=0, 2344 cells excluded from training.

## iter=1, 2382 cells excluded from training.

## iter=2, 2388 cells excluded from training.

## Threshold found:0.383

## 1405 (9.9%) doublets called

# Convert back to Seurat object.
sSeurat2 <- as.Seurat(sSeurat2)

# Notice that our identifiers have changed.
levels(Idents(object = sSeurat2))

## [1] "SingleCellExperiment"

# We want to specify the identifier whether the cell is a singlet or doublet.
table(sSeurat2@meta.data$scDblFinder.class)

## 
## singlet doublet 
##   12808    1405

Idents(object = sSeurat2) <- "scDblFinder.class"
sSeurat2@meta.data$scDblFinder.class <- as.character(sSeurat2@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
sSeurat2 <- subset(sSeurat2, idents = "singlet")
levels(as.factor(sSeurat2@meta.data$scDblFinder.class))

## [1] "singlet"

# Check the dimensions of the dataset again.
dim(sSeurat2)

## [1] 58347 12808

Now, note that we have 12808 cells

colnames(sSeurat2@meta.data)

##  [1] "orig.ident"                                            
##  [2] "nCount_RNA"                                            
##  [3] "nFeature_RNA"                                          
##  [4] "rowname"                                               
##  [5] "antisense_reads"                                       
##  [6] "cell_barcode_fraction_bases_above_30_mean"             
##  [7] "cell_barcode_fraction_bases_above_30_variance"         
##  [8] "cell_names"                                            
##  [9] "duplicate_reads"                                       
## [10] "emptydrops_FDR"                                        
## [11] "emptydrops_IsCell"                                     
## [12] "emptydrops_Limited"                                    
## [13] "emptydrops_LogProb"                                    
## [14] "emptydrops_PValue"                                     
## [15] "emptydrops_Total"                                      
## [16] "fragments_per_molecule"                                
## [17] "fragments_with_single_read_evidence"                   
## [18] "genes_detected_multiple_observations"                  
## [19] "genomic_read_quality_mean"                             
## [20] "genomic_read_quality_variance"                         
## [21] "genomic_reads_fraction_bases_quality_above_30_mean"    
## [22] "genomic_reads_fraction_bases_quality_above_30_variance"
## [23] "input_id"                                              
## [24] "molecule_barcode_fraction_bases_above_30_mean"         
## [25] "molecule_barcode_fraction_bases_above_30_variance"     
## [26] "molecules_with_single_read_evidence"                   
## [27] "n_fragments"                                           
## [28] "n_genes"                                               
## [29] "n_mitochondrial_genes"                                 
## [30] "n_mitochondrial_molecules"                             
## [31] "n_molecules"                                           
## [32] "n_reads"                                               
## [33] "noise_reads"                                           
## [34] "pct_mitochondrial_molecules"                           
## [35] "perfect_cell_barcodes"                                 
## [36] "perfect_molecule_barcodes"                             
## [37] "reads_mapped_exonic"                                   
## [38] "reads_mapped_intergenic"                               
## [39] "reads_mapped_intronic"                                 
## [40] "reads_mapped_multiple"                                 
## [41] "reads_mapped_too_many_loci"                            
## [42] "reads_mapped_uniquely"                                 
## [43] "reads_mapped_utr"                                      
## [44] "reads_per_fragment"                                    
## [45] "reads_unmapped"                                        
## [46] "spliced_reads"                                         
## [47] "NAME"                                                  
## [48] "CellType"                                              
## [49] "percent.mt"                                            
## [50] "percent.rb"                                            
## [51] "ident"                                                 
## [52] "scDblFinder.class"                                     
## [53] "scDblFinder.score"                                     
## [54] "scDblFinder.weighted"                                  
## [55] "scDblFinder.cxds_score"

# convert column names to make the protocol coherent
colnames(sSeurat2@meta.data)[2] <- "nCount_RNA"
colnames(sSeurat2@meta.data)[3] <- "nFeature_RNA"

# Make a violin plot of the QC columns
plt <- VlnPlot(sSeurat2, 
               features = c("nFeature_RNA", "nCount_RNA"),
               ncol = 2,
               pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

plt

# We can say that most of the QC was already performed.
ggsave(filename = "QC.png", plot = plt, width = 7, height = 3.5)

Looking at the violin plot, we do see some extreme values that should be filtered out. However, let’s put the subsetting part for later and proceed to next steps and come back later.

# Make scatter plots to compare
plot1 <- FeatureScatter(sSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1

# remove the cells with high mitochondrial gene percentage
sSeurat2 <- subset(sSeurat2, subset = percent.mt < 5) 
dim(sSeurat2)

## [1] 58347 10864

Since we want to compare the difference between subsetted data and the control (no subsetting), we make a copy for the sSeurat2.

subset_sSeurat2 <- sSeurat2
subset_sSeurat2 <- subset(subset_sSeurat2, subset = nFeature_RNA > 3000)
dim(subset_sSeurat2)

Note that this number of cells is approximately identical to that of the publication (they used the same threshold)

Next, we need to normalize the data.

# Next, we normalize the data
sSeurat2 <- NormalizeData(sSeurat2)

# Explore the most variable features, let's use the default version first.
sSeurat2 <- FindVariableFeatures(sSeurat2)
# sSeurat2 <- FindVariableFeatures(sSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(sSeurat2), 10) 
top10

##  [1] "PAEP"   "CCL21"  "SLPI"   "GPX3"   "CGA"    "IGFBP1" "DEFB1"  "PRL"   
##  [9] "PCCA"   "TSPAN1"

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(sSeurat2)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

plot2

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 22964 rows containing missing values (geom_point).

# Convert normalized gene expression to Z-score
all.genes <- rownames(sSeurat2)
sSeurat2 <- ScaleData(sSeurat2, features = all.genes)

## Centering and scaling data matrix

Before moving on, let’s compare the above with the subsetted Seurat object.

# Next, we normalize the data
subset_sSeurat2 <- NormalizeData(subset_sSeurat2)

# Explore the most variable features, let's use the default version first.
subset_sSeurat2 <- FindVariableFeatures(subset_sSeurat2)
# subset_sSeurat2 <- FindVariableFeatures(subset_sSeurat2, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10_subset <- head(VariableFeatures(subset_sSeurat2), 10) 
top10_subset

# plot variable features with and without labels
plot2_subset <- VariableFeaturePlot(subset_sSeurat2)
plot2_subset <- LabelPoints(plot = plot2, points = top10, repel = TRUE)
plot2_subset

# Convert normalized gene expression to Z-score
all.genes_subset <- rownames(subset_sSeurat2)
subset_sSeurat2 <- ScaleData(subset_sSeurat2, features = all.genes)

# Check if the top 10 hvg are the same
top10_subset %in% top10
# There are two genes that are different after subsetting!

Now, we are ready to run PCA.

# Run PCA analysis
sSeurat2 <- RunPCA(sSeurat2, features = VariableFeatures(object = sSeurat2))

## PC_ 1 
## Positive:  PAGE4, PHLDA2, VGLL1, XAGE3, HMGA1, KRT7, PEG10, SPINT2, SMAGP, DUSP9 
##     KRT18, SPINT1, CD24, IFI6, PERP, XAGE2, FXYD3, SLC22A11, ISYNA1, PKIB 
##     KRT8, PNP, S100P, PMAIP1, CLDN4, TSC22D1, CKS2, MBNL3, DDIT3, HMGB3 
## Negative:  DKK1, RAMP1, C1R, C1S, HAND2-AS1, TMEM45A, SPARCL1, APOD, IGFBP6, ISLR 
##     MFGE8, SPOCK1, SCARA5, CFD, PLPP1, SERPING1, PLAC9P1, MGST1, IL15, OMD 
##     MAOB, PRSS23, CPXM1, HSPB6, TIMP1, FBLN5, RORB, PCOLCE, SPON2, SELENOM 
## PC_ 2 
## Positive:  KRT18, KRT8, PHLDA2, PAGE4, TINAGL1, PEG10, XAGE3, VGLL1, KRT7, EFEMP1 
##     DUSP9, PERP, SPINT1, ISYNA1, SMAGP, CLDN4, IFI6, XAGE2, PKIB, KRT23 
##     FXYD3, SLC22A11, KRT19, CDKN1C, S100P, CDO1, MBNL3, TSC22D1, GGH, CD24 
## Negative:  TYROBP, FCER1G, LAPTM5, SRGN, CCL4, CD53, CD14, AIF1, CCL3, FOLR2 
##     C1QA, CTSS, C1QC, VSIG4, F13A1, C1QB, CCL4L2, HCST, FCGR2A, PLEK 
##     CSF1R, LY96, GPR34, MS4A4A, HPGDS, CD163, CYBB, LST1, RNASE1, NR4A2 
## PC_ 3 
## Positive:  CTHRC1, EGFL6, DLK1, GPC3, FN1, CD9, FRZB, DES, PITX2, COL1A1 
##     COL1A2, COL9A3, ACTG2, ACTA2, LTBP1, OGN, HAPLN1, FILIP1, IGFBP3, FIBIN 
##     HGF, TAGLN, LOX, PRSS35, SLC18A3, SERPINE2, SBSPON, HSD17B2, AQP1, TNNT2 
## Negative:  SAT1, RGS2, PLIN2, KRT23, EFEMP1, CTSL, XAGE3, VGLL1, DAB2, PMAIP1 
##     KRT7, CLDN4, DDIT3, SPINT1, PHLDA2, FXYD3, PNP, AKR1B1, DUSP9, SLC22A11 
##     PAGE4, SLC43A2, NPC2, SMAGP, PEG10, SPINT2, AVPI1, S100P, HMGA1, XAGE2 
## PC_ 4 
## Positive:  NKG7, GNLY, CTSW, GZMA, KLRC1, TRDC, XCL2, CST7, HOPX, GZMB 
##     LSP1, CD52, PRF1, XCL1, KLRD1, IL2RG, CD3E, CD247, TNFRSF18, IL2RB 
##     RAC2, CD7, KRT81, IL32, CMC1, TRBC2, APOBEC3G, TMIGD2, KRT86, KIR2DL4 
## Negative:  GPC3, DLK1, COL1A2, COL1A1, EGFL6, CTHRC1, PLTP, DES, FN1, IER3 
##     FRZB, HAPLN1, OGN, PITX2, PLAUR, LTBP1, ACTG2, CRABP2, COL9A3, RBP7 
##     IGFBP3, ACTA2, COL14A1, TAGLN, SPP1, LOX, CD36, FILIP1, SERPINE2, F13A1 
## PC_ 5 
## Positive:  CRABP2, DES, OGN, SERPINE2, COL9A3, GPC3, ACTG2, CD9, CSRP2, PITX2 
##     COL1A2, COL1A1, LTBP1, HAPLN1, COL14A1, LOX, COTL1, RBP7, FSTL3, SLC18A3 
##     EGFL6, C12orf75, FIBIN, HSPB7, TAGLN, CXCL14, CD7, CAPG, CNN1, HSD17B1 
## Negative:  FAM213A, GYPA, GYPB, ALAS2, AHSP, HBM, MT1G, HEMGN, HBZP1, HBQ1 
##     HBG1, HBA1, SLC25A37, MT1H, MT1F, HBZ, SLC4A1, KLF1, RHAG, EPB42 
##     HBG2, HBA2, GNG11, FAM210B, MT1X, AC073349.1, SLC39A8, TMOD1, HMBS, SMIM5

# Plot the PCA
DimPlot(sSeurat2, reduction = "pca")

# Plot the loadings
VizDimLoadings(sSeurat2, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(sSeurat2)

# FeaturePlot of PC1 top features
FeaturePlot(sSeurat2, features = c("PAPPA", 
                                   "PCOLCE"))

# FeaturePlot of PC2 top features
FeaturePlot(sSeurat2, features = c("TYROBP", 
                                   "FCER1G"))

Next, we move on to derive clusters.

# Now perform clustering
sSeurat2 <- FindNeighbors(sSeurat2, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

sSeurat2 <- FindClusters(sSeurat2, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10864
## Number of edges: 355164
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9515
## Number of communities: 21
## Elapsed time: 0 seconds

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
sSeurat2 <- RunUMAP(sSeurat2, dims = 1:10, verbose = F)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

# Plot the UMAP with clustering
DimPlot(sSeurat2, reduction = "umap")

# Get the TSNE embedding
sSeurat2 <- RunTSNE(sSeurat2, dims = 1:10, check_duplicates = FALSE)

# Plot the t-SNE with clustering
DimPlot(sSeurat2, reduction = "tsne")

# Let's look at cluster sizes
table(sSeurat2@meta.data$seurat_clusters)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1300 1023  961  828  730  721  692  618  617  591  470  460  450  299  249  209 
##   16   17   18   19   20 
##  177  170  137  105   57

# Let's plot those clusters!
DimPlot(sSeurat2,label.size = 4,repel = T,label = T)

# Check expression patterns for some genes in top10
FeaturePlot(sSeurat2, features = c("PAPPA",
                                   "TYROBP",
                                   "IGF2",
                                   "RGS5"))

Let’s calculate the cell cycle scores at this point.

sSeurat2 <- SetIdent(sSeurat2, value = "CellType")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

sSeurat2 <- CellCycleScoring(sSeurat2,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)

table(sSeurat2[[]]$Phase)

## 
##   G1  G2M    S 
## 7598 1632 1634

# Check if ribosomal genes show dependency on cluster
FeaturePlot(sSeurat2,features = "percent.rb",
            label.size = 4,
            repel = T,
            label = F) & 
  theme(plot.title = element_text(size=10))

# Check using the violin plot
VlnPlot(sSeurat2,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes per cluster
VlnPlot(sSeurat2,features = "nFeature_RNA") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(sSeurat2,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(sSeurat2, features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

# Plot in UMAP
DimPlot(sSeurat2, 
        group.by = "Phase")

# Plot with clusters
compTable.int <- table(sSeurat2$CellType, sSeurat2$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(sSeurat2, 
            features = c("S.Score", "G2M.Score"))

Note that most of the cells are in the G1 phase.

DefaultAssay(sSeurat2) <- "RNA"

# Perform log-normalization
sSeurat2 <- NormalizeData(sSeurat2)

# Find variable features
sSeurat2 <- FindVariableFeatures(sSeurat2, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(sSeurat2)
sSeurat2 <- ScaleData(sSeurat2, features = all.genes)

## Centering and scaling data matrix

# Designate the markers
all.markers <- FindAllMarkers(sSeurat2,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)

## Calculating cluster Cytotrophoblast

## Calculating cluster Syncytiotrophoblast

## Calculating cluster Stromal (Endothelial)

## Calculating cluster Stromal (Fibroblast)

## Calculating cluster Immune (Macrophage)

## Calculating cluster Hematopoietic (Erythroblast)

## Calculating cluster Immune (Lymphocyte)

## Calculating cluster Extravillous trophoblast

## Calculating cluster Immune (Myeloid)

## Calculating cluster Epithelial (Neuroendocrine)

dim(all.markers)

## [1] 2297    7

table(all.markers$cluster)

## 
##              Cytotrophoblast          Syncytiotrophoblast 
##                          195                          459 
##        Stromal (Endothelial)         Stromal (Fibroblast) 
##                          124                          239 
##          Immune (Macrophage) Hematopoietic (Erythroblast) 
##                          333                           89 
##          Immune (Lymphocyte)     Extravillous trophoblast 
##                          126                          263 
##             Immune (Myeloid)  Epithelial (Neuroendocrine) 
##                          196                          273

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
head(top3_markers)

##           p_val avg_log2FC pct.1 pct.2     p_val_adj             cluster   gene
## 1  0.000000e+00   3.337094 0.971 0.367  0.000000e+00     Cytotrophoblast  PAGE4
## 2  0.000000e+00   3.011525 0.940 0.496  0.000000e+00     Cytotrophoblast   IFI6
## 3  0.000000e+00   2.594826 0.933 0.290  0.000000e+00     Cytotrophoblast PHLDA2
## 4  0.000000e+00   2.751932 0.869 0.196  0.000000e+00 Syncytiotrophoblast  XAGE3
## 5  0.000000e+00   2.747259 0.871 0.344  0.000000e+00 Syncytiotrophoblast PHLDA2
## 6 6.295277e-211   3.983948 0.699 0.225 3.673105e-206 Syncytiotrophoblast    CGA

dim(sSeurat2)

## [1] 58347 10864

Clustering based on Cell Type annotation

p1 <- DimPlot(sSeurat2, label = T , repel = T, label.size = 3) + NoLegend()
p2 <- DimPlot(sSeurat2, reduction = "pca")
p3 <- DimPlot(sSeurat2, reduction = "tsne")
p4 <- DimPlot(sSeurat2, reduction = "umap")

ggsave(filename = "UMAP.png", plot = p1, width = 7, height = 3.5)
ggsave(filename = "tsne.png", plot = p3, width = 7, height = 3.5)
ggsave(filename = "umap_legend.png", plot = p4, width = 6, height = 3.5)

p1

p2

p3

p4

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
# saveRDS(sSeurat2, "sSeurat2_annotated.rds")

Still Work in Progress

# Pseudotime
# Convert to monocle3 cell_data_set
sSeurat2.cds <- as.cell_data_set(sSeurat2)
class(sSeurat2.cds)

# Re-cluster (required for trajectories in monocle3)
sSeurat2.cds <- cluster_cells(sSeurat2.cds)

# Learn the trajectory
sSeurat2.cds <- learn_graph(sSeurat2.cds)

# Plot the trajectory
plot_cells(sSeurat2.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(sSeurat2, group.by = "Phase")
sSeurat2.cds <- order_cells(sSeurat2.cds)

# Plot pseudotime
plot_cells(sSeurat2.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
sSeurat2$pseudotime_score <- sSeurat2.cds@principal_graph_aux@listData$UMAP$pseudotime
sSeurat2$pseudotime_score[sSeurat2$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(sSeurat2, features = c("pseudotime_score")) 
DimPlot(sSeurat2, group.by = "Annotation")
FeaturePlot(sSeurat2, features = c("pseudotime_score"))

Suryawanshi et al. (2018)

Young-June Kim

2022-11-09

Still Work in Progress