--- title: "Cao et al. (2022) Analysis" author: "Young-June Kim" date: "2023-02-07" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(pheatmap) library(GEOquery) library(SeuratDisk) library(loomR) # Define color palettes WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) ``` 1. Creating the Seurat Object ```{r eval = FALSE} # Data Wrangling Processes library(R.utils) metadata.genes <- read.csv("GSE156793_S2_Metadata_genes.txt") metadata.placentacells <- readRDS("Placenta cells.rds") tissue <- read.csv("GSE156793_S4_gene_expression_tissue.txt") fraction_tissue <- read.csv("GSE156793_S5_gene_fraction_tissue.txt") expression_celltype <- read.csv("GSE156793_S6_gene_expression_celltype.txt") fraction_celltype <- read.csv("GSE156793_S7_gene_fraction_celltype.txt") # Let's call for the loom file # gunzip("GSE156793_S3_gene_count.loom.gz", remove=FALSE) loom_obj <- connect(filename = "GSE156793_S3_gene_count.loom", mode = 'r+') # loom_obj$add.row.attribute(list(Gene = loom_obj$row.attrs$gene_id[])) # loom_obj$add.col.attribute(list(CellID = loom_obj$col.attrs$sample[])) # loom_obj$add.col.attribute(list(tokeep = loom_obj$col.attrs$sample[] %in% metadata.placentacells$sample)) filtered <- loom_obj[["matrix"]][which(loom_obj$col.attrs$tokeep[]),] counts <- t(filtered) loom_obj[["col_attrs"]][,] coldf <- loom_obj$get.attribute.df(MARGIN = 2) coldf <- coldf %>% filter(tokeep) rowdf <- loom_obj$get.attribute.df(MARGIN = 1) dim(loom_obj$col.attrs) # loom_obj$row.attrs$Gene = loom_obj$row.attrs$gene_id # loom_obj[["row_attrs"]]$Gene = loom_obj[["row_attrs"]]$gene_id # # loom_obj$col.attrs$sample[] %>% # unique() %>% # length() # loom_obj$close_all() colnames(counts) <- coldf$CellID rownames(counts) <- rowdf$gene_short_name # the whole thing is too big to convert to Seurat - filter the loom first! # seurat_loom <- as.Seurat(loom_obj) cSeurat <- CreateSeuratObject( counts, project = "Placenta_Seurat", assay = "RNA", names.field = 1, names.delim = "_", meta.data = coldf, ) # Check if the metadata looks okay cSeurat@meta.data # loom_obj$version() # Even running the following lines of code yields the same type of error. # loom_obj$version <- function(){ # return("0.2.1.9000") # } # detach("package:SeuratDisk", unload = TRUE) # detach("package:Seurat", unload = TRUE) # # library(Seurat) saveRDS(cSeurat, "cSeurat.rds") ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. ```{r} # Start running from this line of code cSeurat <- readRDS("cSeurat.rds") # Add in the Mitochondrial PCT% information cSeurat$percent.mt <- PercentageFeatureSet(cSeurat, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(cSeurat$nCount_RNA) summary(cSeurat@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(cSeurat$nFeature_RNA) summary(cSeurat@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(cSeurat$percent.mt) # Later, we will subset the Seurat object based on a percent.mt cutoff. # Check for the percentage of ribosomal genes cSeurat[["percent.rb"]] <- PercentageFeatureSet(cSeurat, pattern = "^RP[SL]") summary(cSeurat$percent.rb) ``` Another important part for QC is removing doublets. ```{r removing_doublets} # Find for doublets cSeurat <- as.SingleCellExperiment(cSeurat) cSeurat <- scDblFinder(cSeurat) # Convert back to Seurat object. cSeurat <- as.Seurat(cSeurat) # Notice that our identifiers have changed. levels(Idents(object = cSeurat)) # We want to specify the identifier whether the cell is a singlet or doublet. table(cSeurat@meta.data$scDblFinder.class) Idents(object = cSeurat) <- "scDblFinder.class" cSeurat@meta.data$scDblFinder.class <- as.character(cSeurat@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. cSeurat <- subset(cSeurat, idents = "singlet") levels(as.factor(cSeurat@meta.data$scDblFinder.class)) ``` ```{r} # convert column names to make the protocol coherent colnames(cSeurat@meta.data)[2] <- "nCount_RNA" colnames(cSeurat@meta.data)[3] <- "nFeature_RNA" # We draw a violin plot to explore the values in meta data VlnPlot(cSeurat, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"), ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10)) ``` ```{r} # The following four lines of code are for scatterplots addressing the correlations of every 2 pairs FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt") FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") FeatureScatter(cSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb") FeatureScatter(cSeurat, feature1 = "percent.rb", feature2 = "percent.mt") # We also check if there is correlation between number of features and the doublet score FeatureScatter(cSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score") # Since the typical cutoff for percent.mt is 5%, we will try out < 5. # Last week we used 10%, and couldn't see much difference. # Also, let't filter out cells that have less than 5% ribosomal content. # (11/29 Update) Removed the ribosomal filter due to insufficient evidence # cSeurat <- subset(cSeurat, subset = percent.mt < 5) dim(cSeurat) ``` Now, we have a reduced version of the Seurat object, where the cells contain no genes. ```{r} # Next, we normalize the data cSeurat <- NormalizeData(cSeurat) # Explore the most variable features, let's use the default version first. cSeurat <- FindVariableFeatures(cSeurat) # cSeurat <- FindVariableFeatures(cSeurat, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(cSeurat), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(cSeurat) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(cSeurat) cSeurat <- ScaleData(cSeurat, features = all.genes) ``` ```{r} # Run PCA analysis cSeurat <- RunPCA(cSeurat, features = VariableFeatures(object = cSeurat)) # Plot the PCA DimPlot(cSeurat, reduction = "pca") # Plot the loadings VizDimLoadings(cSeurat, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(cSeurat) # FeaturePlot of PC1 top features FeaturePlot(cSeurat, features = c("EBI3", "KRT7")) # FeaturePlot of PC2 top features FeaturePlot(cSeurat, features = c("B2M", "TYROBP")) ``` ```{r} # Now perform clustering cSeurat <- FindNeighbors(cSeurat, dims = 1:10) cSeurat <- FindClusters(cSeurat, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation cSeurat <- RunUMAP(cSeurat, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(cSeurat, reduction = "umap") # Get the TSNE embedding cSeurat <- RunTSNE(cSeurat, dims = 1:10, check_duplicates = FALSE) # Plot the t-SNE with clustering DimPlot(cSeurat, reduction = "tsne") # Let's look at cluster sizes table(cSeurat@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(cSeurat,label.size = 4,repel = T,label = T) FeaturePlot(cSeurat, features = c("MMP12", "SERPINE2", "CSH1", "ERVW-1")) ``` Let's calculate the cell cycle scores at this point. ```{r fig.height = 4, fig.width = 6.5} cSeurat <- SetIdent(cSeurat, value = "Main_cluster_name") # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes # Somehow this code does not work. cSeurat <- CellCycleScoring(cSeurat, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(cSeurat[[]]$Phase) ``` Around 1/3 of the cells are in the S phase. ```{r} # Check if mitochondrial genes show dependency on cell type VlnPlot(cSeurat, features = "percent.mt") & theme(plot.title = element_text(size = 10), legend.position = 'none') # Check if ribosomal genes show dependency on cell type VlnPlot(cSeurat, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes and # of transcripts per cluster VlnPlot(cSeurat,features = c("nCount_RNA","nFeature_RNA")) & theme(plot.title = element_text(size=10)) # Cell cycle score does not seem to depend on the cell type much FeaturePlot(cSeurat, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(cSeurat,features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') ``` In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells. ```{r} # Plot in UMAP DimPlot(cSeurat, group.by = "Phase") # Most of the clusters contain all three cell cycle states. # Plot with clusters compTable.int <- table(cSeurat$Annotation, cSeurat$Phase) compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) # FeaturePlot for showing mod score FeaturePlot(cSeurat, features = c("S.Score", "G2M.Score")) ``` This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. ```{r} # This part is replaced when using the SCTransform DefaultAssay(cSeurat) <- "RNA" # Perform log-normalization cSeurat <- NormalizeData(cSeurat) # Find variable features cSeurat <- FindVariableFeatures(cSeurat, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(cSeurat) cSeurat <- ScaleData(cSeurat, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(cSeurat, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) # We can check the dataframe briefly head(top3_markers) ``` Clustering based on Cell Type annotation ```{r fig.height = 4, fig.width = 6} options(ggrepel.max.overlaps = Inf) pdf("cSeuratUMAP.pdf") DimPlot(cSeurat, label = T , repel = T, label.size = 3) + NoLegend() dev.off() # Syncytin-1 is a gene that allows the differentiation from cytotrophoblasts to SCTs. FeaturePlot(cSeurat, features = "ERVW-1") DimPlot(cSeurat, reduction = "pca") DimPlot(cSeurat, reduction = "tsne") DimPlot(cSeurat, reduction = "umap") # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(cSeurat, "cSeurat_annotated.rds") ``` Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity. ## Still Work in Progress ```{r eval = FALSE} # Pseudotime # Convert to monocle3 cell_data_set hSeurat.cds <- as.cell_data_set(hSeurat) class(hSeurat.cds) # Re-cluster (required for trajectories in monocle3) hSeurat.cds <- cluster_cells(hSeurat.cds) # Learn the trajectory hSeurat.cds <- learn_graph(hSeurat.cds) # Plot the trajectory plot_cells(hSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE) # Find the pseudotime order DimPlot(hSeurat, group.by = "Phase") hSeurat.cds <- order_cells(hSeurat.cds) # Plot pseudotime plot_cells(hSeurat.cds, color_cells_by = "pseudotime") # Add pseudotime score back into the dataset hSeurat$pseudotime_score <- hSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime hSeurat$pseudotime_score[hSeurat$pseudotime_score == Inf] <- NA # FeaturePlot with Pseudotime FeaturePlot(hSeurat, features = c("pseudotime_score")) DimPlot(hSeurat, group.by = "Annotation") FeaturePlot(hSeurat, features = c("pseudotime_score")) ```