Cao et al. (2022) Analysis

# Load libraries
library(SingleCellExperiment)

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: GenomicRanges

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomeInfoDb

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks IRanges::slice()

library(Seurat)

## Attaching SeuratObject
## Attaching sp
## 
## Attaching package: 'Seurat'
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     Assays

library(RColorBrewer)
library(scDblFinder)
library(magrittr)

## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

library(pheatmap)
library(GEOquery)

## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)

library(SeuratDisk)

## Registered S3 method overwritten by 'SeuratDisk':
##   method            from  
##   as.sparse.H5Group Seurat

library(loomR)

## Loading required package: R6
## Loading required package: hdf5r
## 
## Attaching package: 'hdf5r'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten_df
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     values
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     values
## 
## The following object is masked from 'package:S4Vectors':
## 
##     values
## 
## 
## Attaching package: 'loomR'
## 
## The following object is masked from 'package:SeuratDisk':
## 
##     loom
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:Biobase':
## 
##     combine
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     combine

library(R.utils)

## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.25.0 (2022-06-12 02:20:02 UTC) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## 
## The following object is masked from 'package:magrittr':
## 
##     equals
## 
## The following object is masked from 'package:SummarizedExperiment':
## 
##     trim
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     trim
## 
## The following object is masked from 'package:IRanges':
## 
##     trim
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## 
## R.utils v2.12.2 (2022-11-11 22:00:03 UTC) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:GEOquery':
## 
##     gunzip
## 
## The following object is masked from 'package:magrittr':
## 
##     extract
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, isOpen, nullfile, parse, warnings

# Define color palettes
WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100)

Creating the Seurat Object

# Data Wrangling Processes
metadata.genes <- read.csv("GSE156793_S2_Metadata_genes.txt")
metadata.placentacells <- readRDS("Placenta cells.rds")
tissue <- read.csv("GSE156793_S4_gene_expression_tissue.txt")
fraction_tissue <- read.csv("GSE156793_S5_gene_fraction_tissue.txt")
expression_celltype <- read.csv("GSE156793_S6_gene_expression_celltype.txt")
fraction_celltype <- read.csv("GSE156793_S7_gene_fraction_celltype.txt")

loom_obj <- connect(filename = "GSE156793_S3_gene_count.loom", mode = 'r+')

metadata.placentacells %<>% 
  filter(Development_day == 89 | Development_day == 90 | Development_day == 94)

length(metadata.placentacells$sample)

# Let's call for the loom file
# gunzip("GSE156793_S3_gene_count.loom.gz", remove=FALSE)

# loom_obj$add.row.attribute(list(Gene = loom_obj$row.attrs$gene_id[]))
# loom_obj$add.col.attribute(list(CellID = loom_obj$col.attrs$sample[]))
loom_obj$add.col.attribute(list(tokeep_FT = loom_obj$col.attrs$sample[] %in% metadata.placentacells$sample))

filtered <- loom_obj[["matrix"]][which(loom_obj$col.attrs$tokeep_FT[]),]
dim(filtered)
counts <- t(filtered)

# loom_obj[["col_attrs"]][,]

coldf <- loom_obj$get.attribute.df(MARGIN = 2)
coldf <- coldf %>% 
  filter(tokeep_FT)
colnames(coldf)

rowdf <- loom_obj$get.attribute.df(MARGIN = 1)

# loom_obj$row.attrs$Gene = loom_obj$row.attrs$gene_id
# loom_obj[["row_attrs"]]$Gene = loom_obj[["row_attrs"]]$gene_id
# 
# loom_obj$col.attrs$sample[] %>%
#   unique() %>% 
#   length()

# loom_obj$close_all()

colnames(counts) <- coldf$CellID
rownames(counts) <- rowdf$gene_short_name

# the whole thing is too big to convert to Seurat - filter the loom first!
# seurat_loom <- as.Seurat(loom_obj)

cSeurat <- CreateSeuratObject(
  counts,
  project = "Placenta_Seurat",
  assay = "RNA",
  names.field = 1,
  names.delim = "_",
  meta.data = coldf,
)

# Check if the metadata looks okay
cSeurat@meta.data

# loom_obj$version()
# Even running the following lines of code yields the same type of error.
# loom_obj$version <- function(){
#   return("0.2.1.9000")
# }
# detach("package:SeuratDisk", unload = TRUE)
# detach("package:Seurat", unload = TRUE)
# 
# library(Seurat)

saveRDS(cSeurat, "cSeurat.rds")

For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory.

# Start running from this line of code
cSeurat <- readRDS("cSeurat.rds")
dim(cSeurat)

## [1] 63561 12267

# Add in the Mitochondrial PCT% information
cSeurat$percent.mt <- PercentageFeatureSet(cSeurat, pattern = "^MT-")

# nCount_RNA is the number of UMI counts in a cell
hist(cSeurat$nCount_RNA)

summary(cSeurat@meta.data$nCount_RNA)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   251.0   301.0   380.0   484.5   528.5  8156.0

# nFeature_RNA is the number of different genes that had any reads
hist(cSeurat$nFeature_RNA)

summary(cSeurat@meta.data$nFeature_RNA)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   120.0   249.0   307.0   368.8   413.0  3392.0

# percent.mt is the percent mitochondrial reads
summary(cSeurat$percent.mt)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0

# Later, we will subset the Seurat object based on a percent.mt cutoff.

# Check for the percentage of ribosomal genes
cSeurat[["percent.rb"]] <- PercentageFeatureSet(cSeurat, pattern = "^RP[SL]")
summary(cSeurat$percent.rb)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.3584  0.7937  1.2704  1.7391 14.1479

Another important part for QC is removing doublets.

# Find for doublets
cSeurat <- as.SingleCellExperiment(cSeurat)
cSeurat <- scDblFinder(cSeurat)

## Creating ~9814 artificial doublets...

## Dimensional reduction

## Evaluating kNN...

## Training model...

## iter=0, 2127 cells excluded from training.

## iter=1, 2217 cells excluded from training.

## iter=2, 2274 cells excluded from training.

## Threshold found:0.499

## 1178 (9.6%) doublets called

# Convert back to Seurat object.
cSeurat <- as.Seurat(cSeurat)

# Notice that our identifiers have changed.
levels(Idents(object = cSeurat))

## [1] "SingleCellExperiment"

# We want to specify the identifier whether the cell is a singlet or doublet.
table(cSeurat@meta.data$scDblFinder.class)

## 
## singlet doublet 
##   11089    1178

Idents(object = cSeurat) <- "scDblFinder.class"
cSeurat@meta.data$scDblFinder.class <- as.character(cSeurat@meta.data$scDblFinder.class)

# Subset the Seurat object to only include singlets.
cSeurat <- subset(cSeurat, idents = "singlet")
levels(as.factor(cSeurat@meta.data$scDblFinder.class))

## [1] "singlet"

# convert column names to make the protocol coherent
colnames(cSeurat@meta.data)[2] <- "nCount_RNA"
colnames(cSeurat@meta.data)[3] <- "nFeature_RNA"

# We draw a violin plot to explore the values in meta data
VlnPlot(cSeurat, 
        features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),
        ncol = 4,
        pt.size = 0.1) & 
  theme(plot.title = element_text(size=10))

## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.

# The following four lines of code are for scatterplots addressing the correlations of every 2 pairs
FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt")

## Warning in cor(x = data[, 1], y = data[, 2]): the standard deviation is zero

FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb")

FeatureScatter(cSeurat, feature1 = "percent.rb", feature2 = "percent.mt")

## Warning in cor(x = data[, 1], y = data[, 2]): the standard deviation is zero

# We also check if there is correlation between number of features and the doublet score
FeatureScatter(cSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score")

# Since the typical cutoff for percent.mt is 5%, we will try out < 5. 
# Last week we used 10%, and couldn't see much difference.
# Also, let't filter out cells that have less than 5% ribosomal content.
# (11/29 Update) Removed the ribosomal filter due to insufficient evidence
# cSeurat <- subset(cSeurat, subset = percent.mt < 5)

dim(cSeurat)

## [1] 63561 11089

Now, we have a reduced version of the Seurat object, where the cells contain no genes.

# Next, we normalize the data
cSeurat <- NormalizeData(cSeurat)

# Explore the most variable features, let's use the default version first.
cSeurat <- FindVariableFeatures(cSeurat)
# cSeurat <- FindVariableFeatures(cSeurat, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(cSeurat), 10) 
top10

##  [1] "ALB"      "PAEP"     "IGFBP1"   "CSH1"     "AFP"      "CSH2"    
##  [7] "TFPI2"    "IGFBP5"   "KISS1"    "SERPINA1"

# plot variable features with and without labels
plot2 <- VariableFeaturePlot(cSeurat)
plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

plot2

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Removed 34795 rows containing missing values (geom_point).

# Convert normalized gene expression to Z-score
all.genes <- rownames(cSeurat)
cSeurat <- ScaleData(cSeurat, features = all.genes)

## Centering and scaling data matrix

# Run PCA analysis
cSeurat <- RunPCA(cSeurat, features = VariableFeatures(object = cSeurat))

## PC_ 1 
## Positive:  TACC2, CTD-3006G17.2, LDB2, RP11-548L20.1, ADAMTS6, MIR181A1HG, EXOC6, RP11-40F8.2, RP11-179A16.1, LINC00669 
##     GULP1, PAGE4, RALGAPA2, PLCB1, RIMKLB, CYP19A1, SLC26A2, RP11-305B6.3, ANK2, MEOX2 
##     RAPGEF5, RP11-608O21.1, EGFL7, COL1A1, RP11-454C18.2, CASS4, AC079610.1, EXPH5, RP11-770E5.1, RP11-510J16.5 
## Negative:  PAPPA, NOTUM, PRG2, QSOX1, TIMP2, TIMP3, IGFBP1, FSTL3, PAPPA2, SCARA5 
##     ITGA5, HSPG2, EBI3, SYN3, HTRA4, AQPEP, HLA-G, ITM2B, AOC1, SERPINE1 
##     HPGD, FN1, IGFBP4, SERPING1, KRT19, CHRDL1, RORB, DKK1, APOD, MMP2 
## PC_ 2 
## Positive:  SCARA5, IGFBP1, IGFBP4, DKK1, CHRDL1, APOD, RORB, RBP1, LUM, CD248 
##     HSPB6, GAS1, F2R, ISLR, DCN, C3, CD82, IGFBP2, IGFBP5, SPOCK1 
##     PRDM1, SPARCL1, IGFBP6, OMD, RP1-78O14.1, THY1, LGALS1, VIM, ADRA2C, WNT5A 
## Negative:  NOTUM, PAPPA2, PRG2, HPGD, AQPEP, EBI3, HTRA4, QSOX1, ISM2, GPR78 
##     FSTL3, PTPRF, HLA-G, KRT19, AF127936.7, AOC1, ADAM19, HSPG2, MFAP5, FLT1 
##     AC083884.8, KRT8, MGAT5, CSH1, ASCL2, TNFSF10, CPM, DIO2, FN1, ITGA5 
## PC_ 3 
## Positive:  PAEP, AC073218.2, CATSPERB, PAX8, PTPRR, CP, C1orf186, MECOM, AC012593.1, MAP3K5 
##     ATP2C2, TC2N, FAM155A, SLC1A1, RP1-60O19.1, MACC1, RXFP1, RIMKLB, SLC7A2, GPX3 
##     UBE2D2, SLC6A6, PKHD1, IRX3, BICD1, DNER, DEPTOR, ZBTB20, NNMT, PRUNE2 
## Negative:  COL1A1, DCN, COL1A2, TIMP3, SERPINE2, LUM, ABI3BP, CDKN1C, ADAMTS1, CD248 
##     MMP2, ADAMTS5, MFAP4, HSPB6, FN1, SCARA5, DKK1, APOD, IGFBP2, RBP1 
##     FSTL1, LGALS1, THY1, S100A6, PDGFRB, GAS1, CFD, MYL9, SYN3, ISLR 
## PC_ 4 
## Positive:  PLAC4, CYP19A1, CSH2, KISS1, PSG1, AFF1, CGA, ANGPT2, HOPX, PSG4 
##     PSG5, RP11-266L9.2, PSG9, CHODL, DEPDC1B, CSH1, TACC2, GDF15, PSG7, PSG2 
##     GULP1, HSD3B1, TFPI2, RP11-390D11.1, SH3TC2, PSG6, DAB2, AC003051.1, AC008394.1, SDC1 
## Negative:  FN1, SERPINE2, COL1A1, LHFP, COL1A2, HIP1, LDB2, NOTUM, CXCL14, HSPG2 
##     BICD1, RASGRF2, PRG2, MECOM, FSTL3, MIR181A1HG, CTD-3006G17.2, MCAM, HLA-G, PLCB1 
##     TPM1, DLGAP1, MEOX2, FLNB, PLCB4, RAPGEF5, KLF6, C10orf11, FLT1, GUCY1A2 
## PC_ 5 
## Positive:  SERPINA1, APOA2, AFP, APOA1, APOB, AHSG, FGB, MT1G, APOC3, AMBP 
##     ALB, FGA, ITIH2, TTR, VTN, MT1H, FABP1, APOH, AGT, APOC1 
##     CYP3A7, ALDOB, RBP4, FGG, TF, SCD, SERPINF2, AL139819.1, CLU, FTL 
## Negative:  LDB2, MEOX2, EXOC6, CTD-3006G17.2, BICD1, EGFL7, DACH1, ZEB1, C10orf11, FCGR2B 
##     RAPGEF5, ZBTB20, MIR181A1HG, RALGAPA2, FLI1, PLCB4, JAM2, DOCK4, MECOM, ZNF521 
##     CACHD1, KDR, CDH13, ERG, PTPRB, FUT9, PLCB1, SPARCL1, LHFP, IGFBP5

# Plot the PCA
DimPlot(cSeurat, reduction = "pca")

# Plot the loadings
VizDimLoadings(cSeurat, dims = 1:2, reduction = "pca") & 
  theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold"))

# Check the elbow plot to see how much variation is covered for each PC
ElbowPlot(cSeurat)

# FeaturePlot of PC1 top features
FeaturePlot(cSeurat, features = c("PAPPA", 
                                   "NOTUM"))

# FeaturePlot of PC2 top features
FeaturePlot(cSeurat, features = c("SCARA5", 
                                   "IGFBP1"))

# Now perform clustering
cSeurat <- FindNeighbors(cSeurat, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

cSeurat <- FindClusters(cSeurat, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 11089
## Number of edges: 336645
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8972
## Number of communities: 12
## Elapsed time: 1 seconds

# For visualization purposes, we need to generate UMAP reduced dimensionality representation
cSeurat <- RunUMAP(cSeurat, dims = 1:10, verbose = F)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

# Plot the UMAP with clustering
DimPlot(cSeurat, reduction = "umap")

# Get the TSNE embedding
cSeurat <- RunTSNE(cSeurat, dims = 1:10)

# Plot the t-SNE with clustering
DimPlot(cSeurat, reduction = "tsne")

# Let's look at cluster sizes
table(cSeurat@meta.data$seurat_clusters)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11 
## 3344 3063 1450  758  599  461  382  334  226  218  182   72

# Let's plot those clusters!
DimPlot(cSeurat,label.size = 4,repel = T,label = T)

FeaturePlot(cSeurat, features = c("PAPPA",
                                   "IGFBP1",
                                   "CSH1",
                                   "ERVW-1"))

Let’s calculate the cell cycle scores at this point.

cSeurat <- SetIdent(cSeurat, value = "Main_cluster_name")

# Calculate the cell cycle scores
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Somehow this code does not work.
cSeurat <- CellCycleScoring(cSeurat,
                            search = TRUE,
                            s.features = s.genes, 
                            g2m.features = g2m.genes)

## Warning: The following features are not present in the object: CENPU, attempting
## to find updated synonyms

## Warning: No updated symbols found

## Warning: The following features are still not present in the object: CENPU

## Warning: The following features are not present in the object: PIMREG, JPT1,
## attempting to find updated synonyms

## Warning: No updated symbols found

## Warning: The following features are still not present in the object: PIMREG,
## JPT1

table(cSeurat[[]]$Phase)

## 
##   G1  G2M    S 
## 5878 2401 2810

Around 1/4 of the cells are in the S phase.

# Check if mitochondrial genes show dependency on cell type
VlnPlot(cSeurat,
        features = "percent.mt") & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.

# Check if ribosomal genes show dependency on cell type
VlnPlot(cSeurat,
        features = "percent.rb") & 
  theme(plot.title = element_text(size=10),
        legend.position = 'none')

# Check # of genes and # of transcripts per cluster
VlnPlot(cSeurat,features = 
          c("nCount_RNA","nFeature_RNA")) & 
  theme(plot.title = element_text(size=10))

# Cell cycle score does not seem to depend on the cell type much
FeaturePlot(cSeurat,
            features = c("S.Score","G2M.Score"),
            label.size = 4,
            repel = T,
            label = T) & 
  theme(plot.title = element_text(size = 10))

VlnPlot(cSeurat,features = c("S.Score","G2M.Score")) & 
  theme(plot.title = element_text(size = 10),
        legend.position = 'none')

In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells.

# Plot in UMAP
DimPlot(cSeurat, 
        group.by = "Phase")

# Most of the clusters contain all three cell cycle states.

# Plot with clusters
compTable.int <- table(cSeurat$Main_cluster_name, cSeurat$Phase)
compTable.int <- (compTable.int / rowSums(compTable.int)) * 100
pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100)

# FeaturePlot for showing mod score
FeaturePlot(cSeurat, 
            features = c("S.Score", "G2M.Score"))

This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase.

# This part is replaced when using the SCTransform
DefaultAssay(cSeurat) <- "RNA"

# Perform log-normalization
cSeurat <- NormalizeData(cSeurat)

# Find variable features
cSeurat <- FindVariableFeatures(cSeurat, 
                                 selection.method = "vst", 
                                 nfeatures = 2000)
all.genes <- rownames(cSeurat)
cSeurat <- ScaleData(cSeurat, features = all.genes)

## Centering and scaling data matrix

# Designate the markers
all.markers <- FindAllMarkers(cSeurat,
                              only.pos = T, 
                              min.pct = 0.5, 
                              logfc.threshold = 0.5)

## Calculating cluster Syncytiotrophoblasts and villous cytotrophoblasts

## Calculating cluster AFP_ALB positive cells

## Calculating cluster Stromal cells

## Calculating cluster Smooth muscle cells

## Calculating cluster Vascular endothelial cells

## Calculating cluster Myeloid cells

## Calculating cluster Extravillous trophoblasts

## Calculating cluster PAEP_MECOM positive cells

## Calculating cluster Trophoblast giant cells

## Calculating cluster Megakaryocytes

## Calculating cluster IGFBP1_DKK1 positive cells

## Calculating cluster Lymphoid cells

dim(all.markers)

## [1] 340   7

table(all.markers$cluster)

## 
## Syncytiotrophoblasts and villous cytotrophoblasts 
##                                                 5 
##                            AFP_ALB positive cells 
##                                                31 
##                                     Stromal cells 
##                                                17 
##                               Smooth muscle cells 
##                                                25 
##                        Vascular endothelial cells 
##                                                15 
##                                     Myeloid cells 
##                                                 8 
##                         Extravillous trophoblasts 
##                                                38 
##                         PAEP_MECOM positive cells 
##                                                28 
##                           Trophoblast giant cells 
##                                                32 
##                                    Megakaryocytes 
##                                                78 
##                        IGFBP1_DKK1 positive cells 
##                                                49 
##                                    Lymphoid cells 
##                                                14

top3_markers <- as.data.frame(all.markers %>%
                              group_by(cluster) %>% 
                              top_n(n = 3, wt = avg_log2FC))
# We can check the dataframe briefly 
head(top3_markers)

##           p_val avg_log2FC pct.1 pct.2     p_val_adj
## 1  0.000000e+00   4.233187 0.760 0.080  0.000000e+00
## 2  0.000000e+00   3.953669 0.661 0.055  0.000000e+00
## 3  0.000000e+00   3.603971 0.569 0.054  0.000000e+00
## 4  0.000000e+00   7.378025 0.972 0.017  0.000000e+00
## 5  0.000000e+00   7.170272 1.000 0.014  0.000000e+00
## 6 1.650927e-187   8.536737 1.000 0.086 1.049346e-182
##                                             cluster     gene
## 1 Syncytiotrophoblasts and villous cytotrophoblasts    TENM3
## 2 Syncytiotrophoblasts and villous cytotrophoblasts     FBN2
## 3 Syncytiotrophoblasts and villous cytotrophoblasts    KANK1
## 4                            AFP_ALB positive cells      AFP
## 5                            AFP_ALB positive cells SERPINA1
## 6                            AFP_ALB positive cells      ALB

Clustering based on Cell Type annotation

options(ggrepel.max.overlaps = Inf)
DimPlot(cSeurat, label = T , repel = T, label.size = 3) + 
  NoLegend()

# Syncytin-1 is a gene that allows the differentiation from cytotrophoblasts to SCTs.
FeaturePlot(cSeurat, features = "ERVW-1")

DimPlot(cSeurat, reduction = "pca")

DimPlot(cSeurat, reduction = "tsne")

DimPlot(cSeurat, reduction = "umap")

# ggsave(filename = "Check.png", plot = p1, width = 30, height = 10)
saveRDS(cSeurat, "cSeurat_annotated.rds")

Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity.

Still Work in Progress

# Pseudotime
# Convert to monocle3 cell_data_set
hSeurat.cds <- as.cell_data_set(hSeurat)
class(hSeurat.cds)

# Re-cluster (required for trajectories in monocle3)
hSeurat.cds <- cluster_cells(hSeurat.cds)

# Learn the trajectory
hSeurat.cds <- learn_graph(hSeurat.cds)

# Plot the trajectory
plot_cells(hSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE)

# Find the pseudotime order
DimPlot(hSeurat, group.by = "Phase")
hSeurat.cds <- order_cells(hSeurat.cds)

# Plot pseudotime
plot_cells(hSeurat.cds, color_cells_by = "pseudotime")

# Add pseudotime score back into the dataset
hSeurat$pseudotime_score <- hSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime
hSeurat$pseudotime_score[hSeurat$pseudotime_score == Inf] <- NA

# FeaturePlot with Pseudotime
FeaturePlot(hSeurat, features = c("pseudotime_score")) 
DimPlot(hSeurat, group.by = "Annotation")
FeaturePlot(hSeurat, features = c("pseudotime_score"))

Cao et al. (2022) Analysis

Young-June Kim

2023-02-07

Still Work in Progress