--- title: "Suryawanshi et al. (2018)" author: "Young-June Kim" date: "2022-11-09" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(Matrix) library(GEOquery) library(pheatmap) # library(monocle3) # library(SeuratWrappers) ``` 1. Creating the Seurat Object ```{r} # Data Wrangling Processes # sCounts <- read.delim("Rockefeller.expression.txt", header = TRUE, sep = "\t", dec = ".") # head(sCounts) # Warning: This count matrix only contains the genes related to COVID-19. We should use the loom file that contains all the genes, and filter out the cells that are annotated (consistent with the annotations file) sAnnotations <- read.delim("Rockefeller.meta.txt") sAnnotations <- sAnnotations[-1,] # (2023-05-13) The annotations were not quite correct. We should exclude the decidual samples sAnnotations$Sample_ID %>% factor() %>% summary() ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. Since we have the metadata for this dataset, let's add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process. ```{r} # Load the RDS file sSeurat_raw <- readRDS("seuratObj.rds") # Check the dimensions of this object dim(sSeurat_raw) dim(sSeurat_raw@meta.data) # Please note that the number of cells in this dataset is much more than that reported in the Suryawanshi paper. # Also, the annotations file only covers the cells relevant to the paper. # We will make the names identical with our Seurat object sAnnotations$NAME <- substr(sAnnotations$NAME, 21, 36) overlaps <- sAnnotations %>% group_by(NAME) %>% dplyr::count() %>% filter(n > 1) %>% pull(NAME) # There are overlaps in the UMI in the annotations. We should filter out the overlaps sAnno_no_overlaps <- sAnnotations[!duplicated(sAnnotations$NAME),] # Check if the annotated cells are contained within the Seurat object (sAnno_no_overlaps$NAME %in% colnames(sSeurat_raw)) %>% summary() # Extract the metadata and copy it. meta_data.copy <- sSeurat_raw@meta.data # Check if metadata rownames are unique. rownames(sSeurat_raw@meta.data) %>% unique() %>% length() # Make a copy of annotations data annotations.copy <- sAnno_no_overlaps %>% select(NAME, Sample_ID, CellType) %>% # filter out the sample ids that are from the decidua filter(Sample_ID != "P6D_10X_Placenta_23") meta_data.copy <- meta_data.copy %>% rownames_to_column() meta_data.copy <- meta_data.copy %>% mutate(NAME = rowname) # render the metadata to include the annotation data meta_data.copy <- left_join(meta_data.copy, annotations.copy) meta_data.copy <- meta_data.copy %>% as.data.frame() rownames(meta_data.copy) <- meta_data.copy$rowname dim(meta_data.copy) sSeurat_raw@meta.data <- meta_data.copy sSeurat3 <- sSeurat_raw[,!is.na(sSeurat_raw@meta.data$CellType)] dim(sSeurat3) ``` Since the MetaMarkers analysis is indicating that the cytotrophoblasts from this dataset has a lot of ribosomal genes as top scoring markers, let's remove the ribosomal genes at this point. ```{r} # select for protein-coding genes gene_list <- read.delim("~/Meta-analysis/Protein-coding_Genes.txt") rb_gene_list <- read.csv("~/Meta-analysis/Gene_List.csv") gene_list <- gene_list$Symbol summary(rb_gene_list$Approved.symbol %in% gene_list) summary(!(rownames(sdataset_protein_coding) %in% rb_gene_list$Approved.symbol)) sdataset_protein_coding <- sSeurat3[rownames(sSeurat3) %in% gene_list,] sSeurat3 <- sdataset_protein_coding[!(rownames(sdataset_protein_coding) %in% rb_gene_list$Approved.symbol),] ``` ```{r} # Add in the Mitochondrial PCT% information sSeurat3$percent.mt <- PercentageFeatureSet(sSeurat3, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(sSeurat3$nCount_RNA) summary(sSeurat3@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(sSeurat3$nFeature_RNA) summary(sSeurat3@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(sSeurat3$percent.mt) # We see that the mitochondrial genes were removed. # Check for the percentage of ribosomal genes sSeurat3[["percent.rb"]] <- PercentageFeatureSet(sSeurat3, pattern = "^RP[SL]") hist(sSeurat3$percent.rb) summary(sSeurat3$percent.rb) dim(sSeurat3) ``` We now have a sSeurat3 object that has 10488 cells, which would be sufficient for downstream analysis. Another important part for QC is removing doublets. ```{r} # Find for doublets sSeurat3 <- as.SingleCellExperiment(sSeurat3) sSeurat3 <- scDblFinder(sSeurat3) # Convert back to Seurat object. sSeurat3 <- as.Seurat(sSeurat3) # Notice that our identifiers have changed. levels(Idents(object = sSeurat3)) # We want to specify the identifier whether the cell is a singlet or doublet. table(sSeurat3@meta.data$scDblFinder.class) Idents(object = sSeurat3) <- "scDblFinder.class" sSeurat3@meta.data$scDblFinder.class <- as.character(sSeurat3@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. sSeurat3 <- subset(sSeurat3, idents = "singlet") levels(as.factor(sSeurat3@meta.data$scDblFinder.class)) # Check the dimensions of the dataset again. dim(sSeurat3) ``` Now, note that we have 9471 cells ```{r} colnames(sSeurat3@meta.data) # convert column names to make the protocol coherent colnames(sSeurat3@meta.data)[2] <- "nCount_RNA" colnames(sSeurat3@meta.data)[3] <- "nFeature_RNA" # Make a violin plot of the QC columns plt <- VlnPlot(sSeurat3, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2, pt.size = 0.1) & theme(plot.title = element_text(size=10)) plt # We can say that most of the QC was already performed. ggsave(filename = "QC_(2023-05-13).png", plot = plt, width = 7, height = 3.5) ``` Looking at the violin plot, we do see some extreme values that should be filtered out. However, let's put the subsetting part for later and proceed to next steps and come back later. ```{r} # Make scatter plots to compare plot1 <- FeatureScatter(sSeurat3, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") plot1 # remove the cells with high mitochondrial gene percentage sSeurat3 <- subset(sSeurat3, subset = percent.mt < 5) dim(sSeurat3) ``` Since we want to compare the difference between subsetted data and the control (no subsetting), we make a copy for the sSeurat2. ```{r eval = F} subset_sSeurat3 <- sSeurat3 subset_sSeurat3 <- subset(subset_sSeurat3, subset = nFeature_RNA > 3000) dim(subset_sSeurat3) ``` Note that this number of cells is approximately identical to that of the publication (they used the same threshold) Next, we need to normalize the data. ```{r} # Next, we normalize the data sSeurat3 <- NormalizeData(sSeurat3) # Explore the most variable features, let's use the default version first. sSeurat3 <- FindVariableFeatures(sSeurat3) # sSeurat3 <- FindVariableFeatures(sSeurat3, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(sSeurat3), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(sSeurat3) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(sSeurat3) sSeurat3 <- ScaleData(sSeurat3, features = all.genes) ``` Before moving on, let's compare the above with the subsetted Seurat object. ```{r eval = F} # Next, we normalize the data subset_sSeurat2 <- NormalizeData(subset_sSeurat2) # Explore the most variable features, let's use the default version first. subset_sSeurat2 <- FindVariableFeatures(subset_sSeurat2) # subset_sSeurat2 <- FindVariableFeatures(subset_sSeurat2, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10_subset <- head(VariableFeatures(subset_sSeurat2), 10) top10_subset # plot variable features with and without labels plot2_subset <- VariableFeaturePlot(subset_sSeurat2) plot2_subset <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2_subset # Convert normalized gene expression to Z-score all.genes_subset <- rownames(subset_sSeurat2) subset_sSeurat2 <- ScaleData(subset_sSeurat2, features = all.genes) # Check if the top 10 hvg are the same top10_subset %in% top10 # There are two genes that are different after subsetting! ``` Now, we are ready to run PCA. ```{r} # Run PCA analysis sSeurat3 <- RunPCA(sSeurat3, features = VariableFeatures(object = sSeurat3)) # Plot the PCA DimPlot(sSeurat3, reduction = "pca") # Plot the loadings VizDimLoadings(sSeurat3, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(sSeurat3) # FeaturePlot of PC1 top features FeaturePlot(sSeurat3, features = c("PAPPA", "PCOLCE")) # FeaturePlot of PC2 top features FeaturePlot(sSeurat3, features = c("HLA-G", "VIM")) ``` Next, we move on to derive clusters. ```{r} # Now perform clustering sSeurat3 <- FindNeighbors(sSeurat3, dims = 1:10) sSeurat3 <- FindClusters(sSeurat3, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation sSeurat3 <- RunUMAP(sSeurat3, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(sSeurat3, reduction = "umap") # Get the TSNE embedding sSeurat3 <- RunTSNE(sSeurat3, dims = 1:10, check_duplicates = FALSE) # Plot the t-SNE with clustering DimPlot(sSeurat3, reduction = "tsne") # Let's look at cluster sizes table(sSeurat3@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(sSeurat3,label.size = 4,repel = T,label = T) # Check expression patterns for some genes in top10 FeaturePlot(sSeurat3, features = c("KRT7", "VIM")) ``` Let's calculate the cell cycle scores at this point. ```{r} sSeurat3 <- SetIdent(sSeurat3, value = "CellType") # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes sSeurat3 <- CellCycleScoring(sSeurat3, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(sSeurat3[[]]$Phase) # Check if ribosomal genes show dependency on cluster FeaturePlot(sSeurat3,features = "percent.rb", label.size = 4, repel = T, label = F) & theme(plot.title = element_text(size=10)) # Check using the violin plot VlnPlot(sSeurat3, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes per cluster VlnPlot(sSeurat3,features = "nFeature_RNA") & theme(plot.title = element_text(size=10), legend.position = 'none') # Cell cycle score does not seem to depend on the cell type much FeaturePlot(sSeurat3, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(sSeurat3, features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') # Plot in UMAP DimPlot(sSeurat3, group.by = "Phase") # Plot with clusters # compTable.int <- table(sSeurat3$CellType, sSeurat3$Phase) # compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 # WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) # pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) # FeaturePlot for showing mod score FeaturePlot(sSeurat3, features = c("S.Score", "G2M.Score")) ``` Note that most of the cells are in the G1 phase. ```{r} DefaultAssay(sSeurat3) <- "RNA" # Perform log-normalization sSeurat3 <- NormalizeData(sSeurat3) # Find variable features sSeurat3 <- FindVariableFeatures(sSeurat3, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(sSeurat3) sSeurat3 <- ScaleData(sSeurat3, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(sSeurat3, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) head(top3_markers) dim(sSeurat3) ``` Clustering based on Cell Type annotation ```{r} p1 <- DimPlot(sSeurat3, label = T , repel = T, label.size = 3) + NoLegend() p2 <- DimPlot(sSeurat3, reduction = "pca") p3 <- DimPlot(sSeurat3, reduction = "tsne") p4 <- DimPlot(sSeurat3, reduction = "umap") ggsave(filename = "UMAP(2023-05-13).png", plot = p1, width = 7, height = 3.5) ggsave(filename = "tsne(2023-05-13).png", plot = p3, width = 7, height = 3.5) ggsave(filename = "umap_legend(2023-05-13).png", plot = p4, width = 6, height = 3.5) p1 p2 p3 p4 # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(sSeurat3, "sSeurat_20230514.rds") ``` # Still Work in Progress ```{r eval = F} # Pseudotime # Convert to monocle3 cell_data_set sSeurat2.cds <- as.cell_data_set(sSeurat2) class(sSeurat2.cds) # Re-cluster (required for trajectories in monocle3) sSeurat2.cds <- cluster_cells(sSeurat2.cds) # Learn the trajectory sSeurat2.cds <- learn_graph(sSeurat2.cds) # Plot the trajectory plot_cells(sSeurat2.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE) # Find the pseudotime order DimPlot(sSeurat2, group.by = "Phase") sSeurat2.cds <- order_cells(sSeurat2.cds) # Plot pseudotime plot_cells(sSeurat2.cds, color_cells_by = "pseudotime") # Add pseudotime score back into the dataset sSeurat2$pseudotime_score <- sSeurat2.cds@principal_graph_aux@listData$UMAP$pseudotime sSeurat2$pseudotime_score[sSeurat2$pseudotime_score == Inf] <- NA # FeaturePlot with Pseudotime FeaturePlot(sSeurat2, features = c("pseudotime_score")) DimPlot(sSeurat2, group.by = "Annotation") FeaturePlot(sSeurat2, features = c("pseudotime_score")) ```