--- title: "Han et al." author: "Young-June Kim" date: "2022-10-05" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(pheatmap) # Define color palettes WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) ``` 1. Creating the Seurat Object ```{r eval = FALSE} # Data Wrangling Processes Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt") Hcounts[1:10, 1:10] # set the row names Hcounts %<>% column_to_rownames("GENE") head(Hcounts) # We have our prepared annotations ready as well # Load the cell annotation data annotations <- read_csv(file = "Placenta1_rmbatchAnno.csv") # Notice that the number of samples (cells) in shSeurat is not identical to the number of rows of this annotation file. dim(annotations)[1] == dim(shSeurat)[2] sweep <- (colnames(shSeurat) %in% (annotations$Cell_barcode)) # Filter out the count data Hcounts <- Hcounts[, sweep] # Create the Seurat object shSeurat <- CreateSeuratObject(counts = Hcounts) ``` Let's continue adding the annotations to the metadata. ```{r eval = FALSE} # make the annotations object compatible for data analysis. anno_revised <- annotations %>% select(Cell_barcode, ident, Celltype) # Extract the metadata meta_data.copy <- shSeurat@meta.data # make new column that copies the rownames meta_data.copy %<>% mutate(Cell_barcode = rownames(meta_data.copy)) # add the annotations!!! meta_data.copy <- inner_join(meta_data.copy, anno_revised, by = 'Cell_barcode') # Reassign the metadata shSeurat@meta.data$Cluster <- meta_data.copy$ident shSeurat@meta.data$Annotation <- meta_data.copy$Celltype # The Seurat object became smaller. dim(shSeurat) # Use head() to inspect the metadata head(shSeurat@meta.data) # Let's save this object to overwrite RDS file saveRDS(shSeurat, "shSeurat.rds") ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. ```{r} # Start running from this line of code shSeurat <- readRDS("shSeurat.rds") # Add in the Mitochondrial PCT% information shSeurat$percent.mt <- PercentageFeatureSet(shSeurat, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(shSeurat$nCount_RNA) summary(shSeurat@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(shSeurat$nFeature_RNA) summary(shSeurat@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(shSeurat$percent.mt) # Later, we will subset the Seurat object based on a percent.mt cutoff. # Check for the percentage of ribosomal genes shSeurat[["percent.rb"]] <- PercentageFeatureSet(shSeurat, pattern = "^RP[SL]") summary(shSeurat$percent.rb) ``` Another important part for QC is removing doublets. ```{r removing_doublets} # Find for doublets shSeurat <- as.SingleCellExperiment(shSeurat) shSeurat <- scDblFinder(shSeurat) # Convert back to Seurat object. shSeurat <- as.Seurat(shSeurat) # Notice that our identifiers have changed. levels(Idents(object = shSeurat)) # We want to specify the identifier whether the cell is a singlet or doublet. table(shSeurat@meta.data$scDblFinder.class) Idents(object = shSeurat) <- "scDblFinder.class" shSeurat@meta.data$scDblFinder.class <- as.character(shSeurat@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. shSeurat <- subset(shSeurat, idents = "singlet") levels(as.factor(shSeurat@meta.data$scDblFinder.class)) ``` ```{r} # convert column names to make the protocol coherent colnames(shSeurat@meta.data)[2] <- "nCount_RNA" colnames(shSeurat@meta.data)[3] <- "nFeature_RNA" # We draw a violin plot to explore the values in meta data VlnPlot(shSeurat, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"), ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10)) ``` ```{r} # The following four lines of code are for scatterplots addressing the correlations of every 2 pairs FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt") FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") FeatureScatter(shSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb") FeatureScatter(shSeurat, feature1 = "percent.rb", feature2 = "percent.mt") # We also check if there is correlation between number of features and the doublet score FeatureScatter(shSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score") # Since the typical cutoff for percent.mt is 5%, we will try out < 5. # Last week we used 10%, and couldn't see much difference. # Also, let't filter out cells that have less than 5% ribosomal content. shSeurat <- subset(shSeurat, subset = percent.mt < 5 & percent.rb > 5) dim(shSeurat) ``` I tried to subset the dataset based on nCount_RNA > 500 as written in the publication, but the dataset seems to be already filtered. Now, we have a reduced version of the Seurat object, where the cells contain less than 5% mitochondrial genes. ```{r} # Next, we normalize the data shSeurat <- NormalizeData(shSeurat) # Explore the most variable features, let's use the default version first. shSeurat <- FindVariableFeatures(shSeurat) # shSeurat <- FindVariableFeatures(shSeurat, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(shSeurat), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(shSeurat) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(shSeurat) shSeurat <- ScaleData(shSeurat, features = all.genes) ``` ```{r} # Run PCA analysis shSeurat <- RunPCA(shSeurat, features = VariableFeatures(object = shSeurat)) # Plot the PCA DimPlot(shSeurat, reduction = "pca") # Plot the loadings VizDimLoadings(shSeurat, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(shSeurat) # FeaturePlot of PC1 top features FeaturePlot(shSeurat, features = c("EBI3", "KRT7")) # FeaturePlot of PC2 top features FeaturePlot(shSeurat, features = c("B2M", "TYROBP")) ``` ```{r} # Now perform clustering shSeurat <- FindNeighbors(shSeurat, dims = 1:10) shSeurat <- FindClusters(shSeurat, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation shSeurat <- RunUMAP(shSeurat, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(shSeurat, reduction = "umap") # Get the TSNE embedding shSeurat <- RunTSNE(shSeurat, dims = 1:10) # Plot the t-SNE with clustering DimPlot(shSeurat, reduction = "tsne") # Let's look at cluster sizes table(shSeurat@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(shSeurat,label.size = 4,repel = T,label = T) FeaturePlot(shSeurat, features = c("MMP12", "SERPINE2", "CSH1", "CSH2")) ``` Let's calculate the cell cycle scores at this point. ```{r fig.height = 4, fig.width = 6.5} shSeurat <- SetIdent(shSeurat, value = "Annotation") # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes # Somehow this code does not work. shSeurat <- CellCycleScoring(shSeurat, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(shSeurat[[]]$Phase) ``` Around 1/3 of the cells are in the S phase. ```{r} # Check if mitochondrial genes show dependency on cell type VlnPlot(shSeurat, features = "percent.mt") & theme(plot.title = element_text(size = 10), legend.position = 'none') # Check if ribosomal genes show dependency on cell type VlnPlot(shSeurat, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes and # of transcripts per cluster VlnPlot(shSeurat,features = c("nCount_RNA","nFeature_RNA")) & theme(plot.title = element_text(size=10)) # Cell cycle score does not seem to depend on the cell type much FeaturePlot(shSeurat, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(shSeurat,features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') ``` In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells. ```{r} # Plot in UMAP DimPlot(shSeurat, group.by = "Phase") # Most of the clusters contain all three cell cycle states. # Plot with clusters compTable.int <- table(shSeurat$Annotation, shSeurat$Phase) compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) # FeaturePlot for showing mod score FeaturePlot(shSeurat, features = c("S.Score", "G2M.Score")) ``` This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. ```{r} # This part is replaced when using the SCTransform DefaultAssay(shSeurat) <- "RNA" # Perform log-normalization shSeurat <- NormalizeData(shSeurat) # Find variable features shSeurat <- FindVariableFeatures(shSeurat, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(shSeurat) shSeurat <- ScaleData(shSeurat, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(shSeurat, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) # We can check the dataframe briefly head(top3_markers) ``` Clustering based on Cell Type annotation ```{r fig.height = 4, fig.width = 6} DimPlot(shSeurat, label = T , repel = T, label.size = 3) + NoLegend() DimPlot(shSeurat, reduction = "pca") DimPlot(shSeurat, reduction = "tsne") DimPlot(shSeurat, reduction = "umap") # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(shSeurat, "shSeurat_annotated.rds") ``` Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity. ## Still Work in Progress ```{r eval = FALSE} # Pseudotime # Convert to monocle3 cell_data_set shSeurat.cds <- as.cell_data_set(shSeurat) class(shSeurat.cds) # Re-cluster (required for trajectories in monocle3) shSeurat.cds <- cluster_cells(shSeurat.cds) # Learn the trajectory shSeurat.cds <- learn_graph(shSeurat.cds) # Plot the trajectory plot_cells(shSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE) # Find the pseudotime order DimPlot(shSeurat, group.by = "Phase") shSeurat.cds <- order_cells(shSeurat.cds) # Plot pseudotime plot_cells(shSeurat.cds, color_cells_by = "pseudotime") # Add pseudotime score back into the dataset shSeurat$pseudotime_score <- shSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime shSeurat$pseudotime_score[shSeurat$pseudotime_score == Inf] <- NA # FeaturePlot with Pseudotime FeaturePlot(shSeurat, features = c("pseudotime_score")) DimPlot(shSeurat, group.by = "Annotation") FeaturePlot(shSeurat, features = c("pseudotime_score")) ```