--- title: "Han et al." author: "Young-June Kim" date: "2022-10-05" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(pheatmap) # Define color palettes WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) ``` 1. Creating the Seurat Object ```{r eval = FALSE} # Data Wrangling Processes Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt") Hcounts[1:10, 1:10] # set the row names Hcounts %<>% column_to_rownames("GENE") head(Hcounts) # We have our prepared annotations ready as well # Load the cell annotation data annotations <- read_csv(file = "Placenta1_rmbatchAnno.csv") # Notice that the number of samples (cells) in hSeurat is not identical to the number of rows of this annotation file. dim(annotations)[1] == dim(hSeurat)[2] sweep <- (colnames(hSeurat) %in% (annotations$Cell_barcode)) # Filter out the count data Hcounts <- Hcounts[, sweep] # Create the Seurat object hSeurat <- CreateSeuratObject(counts = Hcounts) ``` Let's continue adding the annotations to the metadata. ```{r eval = FALSE} # make the annotations object compatible for data analysis. anno_revised <- annotations %>% select(Cell_barcode, ident, Celltype) # Extract the metadata meta_data.copy <- hSeurat@meta.data # make new column that copies the rownames meta_data.copy %<>% mutate(Cell_barcode = rownames(meta_data.copy)) # add the annotations!!! meta_data.copy <- inner_join(meta_data.copy, anno_revised, by = 'Cell_barcode') # Reassign the metadata hSeurat@meta.data$Cluster <- meta_data.copy$ident hSeurat@meta.data$Annotation <- meta_data.copy$Celltype # The Seurat object became smaller. dim(hSeurat) # Use head() to inspect the metadata head(hSeurat@meta.data) # Let's save this object to overwrite RDS file saveRDS(hSeurat, "hSeurat.rds") ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. ```{r} # Start running from this line of code hSeurat <- readRDS("hSeurat.rds") # Add in the Mitochondrial PCT% information hSeurat$percent.mt <- PercentageFeatureSet(hSeurat, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(hSeurat$nCount_RNA) summary(hSeurat@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(hSeurat$nFeature_RNA) summary(hSeurat@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(hSeurat$percent.mt) # Later, we will subset the Seurat object based on a percent.mt cutoff. # Check for the percentage of ribosomal genes hSeurat[["percent.rb"]] <- PercentageFeatureSet(hSeurat, pattern = "^RP[SL]") summary(hSeurat$percent.rb) ``` Another important part for QC is removing doublets. ```{r removing_doublets} # Find for doublets hSeurat <- as.SingleCellExperiment(hSeurat) hSeurat <- scDblFinder(hSeurat) # Convert back to Seurat object. hSeurat <- as.Seurat(hSeurat) # Notice that our identifiers have changed. levels(Idents(object = hSeurat)) # We want to specify the identifier whether the cell is a singlet or doublet. table(hSeurat@meta.data$scDblFinder.class) Idents(object = hSeurat) <- "scDblFinder.class" hSeurat@meta.data$scDblFinder.class <- as.character(hSeurat@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. hSeurat <- subset(hSeurat, idents = "singlet") levels(as.factor(hSeurat@meta.data$scDblFinder.class)) ``` ```{r} # convert column names to make the protocol coherent colnames(hSeurat@meta.data)[2] <- "nCount_RNA" colnames(hSeurat@meta.data)[3] <- "nFeature_RNA" # We draw a violin plot to explore the values in meta data VlnPlot(hSeurat, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"), ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10)) ``` ```{r} # The following four lines of code are for scatterplots addressing the correlations of every 2 pairs FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt") FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb") FeatureScatter(hSeurat, feature1 = "percent.rb", feature2 = "percent.mt") # We also check if there is correlation between number of features and the doublet score FeatureScatter(hSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score") # Since the typical cutoff for percent.mt is 5%, we will try out < 5. # Last week we used 10%, and couldn't see much difference. # Also, let't filter out cells that have less than 5% ribosomal content. hSeurat <- subset(hSeurat, subset = percent.mt < 5 & percent.rb > 5) dim(hSeurat) ``` I tried to subset the dataset based on nCount_RNA > 500 as written in the publication, but the dataset seems to be already filtered. Now, we have a reduced version of the Seurat object, where the cells contain less than 5% mitochondrial genes. ```{r} # Next, we normalize the data hSeurat <- NormalizeData(hSeurat) # Explore the most variable features, let's use the default version first. hSeurat <- FindVariableFeatures(hSeurat) # hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(hSeurat), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(hSeurat) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(hSeurat) hSeurat <- ScaleData(hSeurat, features = all.genes) ``` ```{r} # Run PCA analysis hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat)) # Plot the PCA DimPlot(hSeurat, reduction = "pca") # Plot the loadings VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(hSeurat) # FeaturePlot of PC1 top features FeaturePlot(hSeurat, features = c("EBI3", "KRT7")) # FeaturePlot of PC2 top features FeaturePlot(hSeurat, features = c("B2M", "TYROBP")) ``` ```{r} # Now perform clustering hSeurat <- FindNeighbors(hSeurat, dims = 1:10) hSeurat <- FindClusters(hSeurat, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation hSeurat <- RunUMAP(hSeurat, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(hSeurat, reduction = "umap") # Get the TSNE embedding hSeurat <- RunTSNE(hSeurat, dims = 1:10) # Plot the t-SNE with clustering DimPlot(hSeurat, reduction = "tsne") # Let's look at cluster sizes table(hSeurat@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(hSeurat,label.size = 4,repel = T,label = T) FeaturePlot(hSeurat, features = c("MMP12", "SERPINE2", "CSH1", "CSH2")) ``` Let's calculate the cell cycle scores at this point. ```{r fig.height = 4, fig.width = 6.5} hSeurat <- SetIdent(hSeurat, value = "Annotation") # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes # Somehow this code does not work. hSeurat <- CellCycleScoring(hSeurat, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(hSeurat[[]]$Phase) ``` Around 1/3 of the cells are in the S phase. ```{r} # Check if mitochondrial genes show dependency on cell type VlnPlot(hSeurat, features = "percent.mt") & theme(plot.title = element_text(size = 10), legend.position = 'none') # Check if ribosomal genes show dependency on cell type VlnPlot(hSeurat, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes and # of transcripts per cluster VlnPlot(hSeurat,features = c("nCount_RNA","nFeature_RNA")) & theme(plot.title = element_text(size=10)) # Cell cycle score does not seem to depend on the cell type much FeaturePlot(hSeurat, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(hSeurat,features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') ``` In this case, it looks like most of the cell types show low cell cycle scores, except for the proliferating cells. ```{r} # Plot in UMAP DimPlot(hSeurat, group.by = "Phase") # Most of the clusters contain all three cell cycle states. # Plot with clusters compTable.int <- table(hSeurat$Annotation, hSeurat$Phase) compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) # FeaturePlot for showing mod score FeaturePlot(hSeurat, features = c("S.Score", "G2M.Score")) ``` This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. ```{r} # This part is replaced when using the SCTransform DefaultAssay(hSeurat) <- "RNA" # Perform log-normalization hSeurat <- NormalizeData(hSeurat) # Find variable features hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(hSeurat) hSeurat <- ScaleData(hSeurat, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(hSeurat, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) # We can check the dataframe briefly head(top3_markers) ``` Clustering based on Cell Type annotation ```{r fig.height = 4, fig.width = 6} DimPlot(hSeurat, label = T , repel = T, label.size = 3) + NoLegend() DimPlot(hSeurat, reduction = "pca") DimPlot(hSeurat, reduction = "tsne") DimPlot(hSeurat, reduction = "umap") # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(hSeurat, "hSeurat_annotated.rds") hSeurat <- readRDS("/home/youkim/MetaMarkers/4-dataset-test/hSeurat_annotated.rds") pdf("hSeuratUMAP2.pdf") DimPlot(hSeurat, label = T , repel = T, label.size = 5) + NoLegend() dev.off() ``` Luckily, it seems that the cell types are preserved even after changing the mitochondrial gene percentage to 5%. Another difference is that the plots show more clustering compared to the 10% threshold, which could be promising in terms of clarity. ## Still Work in Progress ```{r eval = FALSE} # Pseudotime # Convert to monocle3 cell_data_set hSeurat.cds <- as.cell_data_set(hSeurat) class(hSeurat.cds) # Re-cluster (required for trajectories in monocle3) hSeurat.cds <- cluster_cells(hSeurat.cds) # Learn the trajectory hSeurat.cds <- learn_graph(hSeurat.cds) # Plot the trajectory plot_cells(hSeurat.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE) # Find the pseudotime order DimPlot(hSeurat, group.by = "Phase") hSeurat.cds <- order_cells(hSeurat.cds) # Plot pseudotime plot_cells(hSeurat.cds, color_cells_by = "pseudotime") # Add pseudotime score back into the dataset hSeurat$pseudotime_score <- hSeurat.cds@principal_graph_aux@listData$UMAP$pseudotime hSeurat$pseudotime_score[hSeurat$pseudotime_score == Inf] <- NA # FeaturePlot with Pseudotime FeaturePlot(hSeurat, features = c("pseudotime_score")) DimPlot(hSeurat, group.by = "Annotation") FeaturePlot(hSeurat, features = c("pseudotime_score")) ```