--- title: "Vento-Tormo et al.(2018)" author: "Young-June Kim" date: "2022-10-15" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(pheatmap) # Define color palettes WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) ``` 1. Creating the Seurat Object ```{r eval = FALSE} # Data Wrangling Processes VTcounts2 <- read.delim("raw_data_10x.txt") VTcounts2[1:10, 1:10] # We notice that the gene names are too lengthy VTcounts2[,1] <- sub("_.*", "", VTcounts2[,1]) # check for duplicated gene names before setting row names gene_dups <- VTcounts2[,1] %>% duplicated() # check the dimensions of the count matrix dim(VTcounts2) # filter out the duplicated genes VTcounts2 <- VTcounts2[!gene_dups,] dim(VTcounts2) rownames(VTcounts2) <- NULL # set the row names VTcounts2 %<>% column_to_rownames("Gene") head(VTcounts2) # Create the Seurat object vtSeurat2 <- CreateSeuratObject(counts = VTcounts2) # Let's save the Seurat object as an .rds file to facilitate analysis. saveRDS(vtSeurat2, "vtSeurat2.rds") ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. Since we have the metadata for this dataset, let's add the cell annotations first, and filter the dataset to only include cells from the placenta. This will prevent the Seurat object becoming too large and time-costly to process. ```{r eval = FALSE} vtSeurat2 <- readRDS("vtSeurat2.rds") # View(vtSeurat2@meta.data) # Check the dimensions of this object dim(vtSeurat2) # Load the cell annotation data annotations <- read_delim(file = "meta_10x.txt") # Check if the number of samples (cells) in vtSeurat2 is identical to the number of rows of this annotation file. dim(annotations)[1] == dim(vtSeurat2)[2] # make the annotations object compatible for data analysis. anno_revised <- annotations %>% separate(annotation, into = c("Cluster", "Annotation"), sep = "\t") %>% mutate(Location = final_cluster, Sample = location) %>% select(-location, -final_cluster) # Extract the metadata meta_data.copy <- vtSeurat2@meta.data # make new column that copies the rownames meta_data.copy %<>% mutate(Fetus = rownames(meta_data.copy)) # add the annotations!!! meta_data.copy <- inner_join(meta_data.copy, anno_revised, by = 'Fetus') # Reassign the metadata vtSeurat2@meta.data$Cluster <- meta_data.copy$Cluster vtSeurat2@meta.data$Annotation <- meta_data.copy$Annotation vtSeurat2@meta.data$Location <- meta_data.copy$Location vtSeurat2@meta.data$Sample <- meta_data.copy$Sample # Set the identity (classifier) of the cells based on the location # Set the column "Location" as characters vtSeurat2@meta.data$Location <- as.character(vtSeurat2@meta.data$Location) Idents(object = vtSeurat2) <- "Location" levels(Idents(vtSeurat2)) vtSeurat2@meta.data %>% filter(Location == "Placenta") # Subset the vtSeurat2 object to a smaller one. vtSeurat2 <- subset(vtSeurat2, idents = "Placenta") # The Seurat object became smaller. dim(vtSeurat2) # Let's save this object as another RDS file saveRDS(vtSeurat2, "vtSeurat2_placenta.rds") ``` ```{r} vtSeurat2 <- readRDS("vtSeurat2_placenta.rds") # Add in the Mitochondrial PCT% information vtSeurat2$percent.mt <- PercentageFeatureSet(vtSeurat2, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(vtSeurat2$nCount_RNA) summary(vtSeurat2@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(vtSeurat2$nFeature_RNA) summary(vtSeurat2@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(vtSeurat2$percent.mt) # We see that the mitochondrial genes were removed. # Check for the percentage of ribosomal genes vtSeurat2[["percent.rb"]] <- PercentageFeatureSet(vtSeurat2, pattern = "^RP[SL]") summary(vtSeurat2$percent.rb) dim(vtSeurat2) ``` We now have a nice vtSeurat object that contains only cells from the placenta. Luckily, we still have 18547 cells, which would be sufficient for downstream analysis. Another important part for QC is removing doublets. ```{r} # Find for doublets vtSeurat2 <- as.SingleCellExperiment(vtSeurat2) vtSeurat2 <- scDblFinder(vtSeurat2) # Convert back to Seurat object. vtSeurat2 <- as.Seurat(vtSeurat2) # Notice that our identifiers have changed. levels(Idents(object = vtSeurat2)) # We want to specify the identifier whether the cell is a singlet or doublet. table(vtSeurat2@meta.data$scDblFinder.class) Idents(object = vtSeurat2) <- "scDblFinder.class" vtSeurat2@meta.data$scDblFinder.class <- as.character(vtSeurat2@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. vtSeurat2 <- subset(vtSeurat2, idents = "singlet") levels(as.factor(vtSeurat2@meta.data$scDblFinder.class)) # Check the dimensions of the dataset again. dim(vtSeurat2) ``` Now, note that we have 16779 cells that are derived from the placenta. ```{r} colnames(vtSeurat2@meta.data) # convert column names to make the protocol coherent colnames(vtSeurat2@meta.data)[2] <- "nCount_RNA" colnames(vtSeurat2@meta.data)[3] <- "nFeature_RNA" # Make a violin plot of the QC columns plt <- VlnPlot(vtSeurat2, features = c("nFeature_RNA", "nCount_RNA" ,"percent.rb"), ncol = 3, pt.size = 0.1) & theme(plot.title = element_text(size=10)) plt ggsave(filename = "QC.png", plot = plt, width = 7, height = 3.5) ``` Looking at the violin plot, we do see some extreme values that should be filtered out. However, let's put the subsetting part for later and proceed to next steps and come back later. ```{r} # Make scatter plots to compare plot1 <- FeatureScatter(vtSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") plot1 # Looking at the scatterplot, we should subset to remove some outliers # This is because further data analysis steps requires the assumption that the data has relatively moderate, homogeneous technical variants. dim(vtSeurat2) # Let's perform some filtering! vtSeurat2 <- subset(vtSeurat2, subset = percent.rb > 5) ``` Next, we need to normalize the data. ```{r} # Next, we normalize the data vtSeurat2 <- NormalizeData(vtSeurat2) # Explore the most variable features, let's use the default version first. vtSeurat2 <- FindVariableFeatures(vtSeurat2) # vtSeurat2 <- FindVariableFeatures(vtSeurat2, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(vtSeurat2), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(vtSeurat2) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(vtSeurat2) vtSeurat2 <- ScaleData(vtSeurat2, features = all.genes) ``` Now, we are ready to run PCA. ```{r} # Run PCA analysis vtSeurat2 <- RunPCA(vtSeurat2, features = VariableFeatures(object = vtSeurat2)) # Plot the PCA DimPlot(vtSeurat2, reduction = "pca") # Plot the loadings VizDimLoadings(vtSeurat2, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(vtSeurat2) # FeaturePlot of PC1 top features FeaturePlot(vtSeurat2, features = c("TMSB4X", "FTL")) # FeaturePlot of PC2 top features FeaturePlot(vtSeurat2, features = c("EGFL6", "COL6A2")) ``` Next, we move on to derive clusters. ```{r} # Now perform clustering vtSeurat2 <- FindNeighbors(vtSeurat2, dims = 1:10) vtSeurat2 <- FindClusters(vtSeurat2, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation vtSeurat2 <- RunUMAP(vtSeurat2, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(vtSeurat2, reduction = "umap") # Get the TSNE embedding vtSeurat2 <- RunTSNE(vtSeurat2, dims = 1:10) # Plot the t-SNE with clustering DimPlot(vtSeurat2, reduction = "tsne") # Let's look at cluster sizes table(vtSeurat2@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(vtSeurat2,label.size = 4,repel = T,label = T) # Check expression patterns for some genes in top10 FeaturePlot(vtSeurat2, features = c("PLA2G2A", "MMP12", "AREG", "AOC1")) ``` Let's calculate the cell cycle scores at this point. ```{r} vtSeurat2 <- SetIdent(vtSeurat2, value = "Annotation") # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes # Somehow this code does not work. vtSeurat2 <- CellCycleScoring(vtSeurat2, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(vtSeurat2[[]]$Phase) ``` Around 1/5 of the cells are in the S phase. ```{r} # Check if ribosomal genes show dependency on cluster FeaturePlot(vtSeurat2,features = "percent.rb", label.size = 4, repel = T, label = F) & theme(plot.title = element_text(size=10)) # Check using the violin plot VlnPlot(vtSeurat2, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes and # of transcripts per cluster VlnPlot(vtSeurat2,features = "nCount_RNA") & theme(plot.title = element_text(size=10), legend.position = 'none') VlnPlot(vtSeurat2,features = "nFeature_RNA") & theme(plot.title = element_text(size=10), legend.position = 'none') # Cell cycle score does not seem to depend on the cell type much FeaturePlot(vtSeurat2, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(vtSeurat2, features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') ``` In this case, it looks like some cells do show some cell cycle scores, which should be examined more carefully. ```{r} # Plot in UMAP DimPlot(vtSeurat2, group.by = "Phase") # Plot with clusters compTable.int <- table(vtSeurat2$Annotation, vtSeurat2$Phase) compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) ``` This heatmap is interesting, as most syncytiotrophoblasts are in the G1 phase, and almost all endothelial cells are in the S phase. ```{r} # Proceed to SCTransform vtSeurat2 <- SCTransform(vtSeurat2, assay = "RNA", method = "glmGamPoi", ncells = 4367, vars.to.regress = c("percent.mt","S.Score","G2M.Score"), verbose = F) # View the object vtSeurat2 GetAssay(vtSeurat2) # Standard PCA, UMAP, and clustering vtSeurat2 <- RunPCA(vtSeurat2, verbose = F) vtSeurat2 <- RunUMAP(vtSeurat2, dims = 1:30, verbose = F) vtSeurat2 <- FindNeighbors(vtSeurat2, dims = 1:30, verbose = F) vtSeurat2 <- FindClusters(vtSeurat2, verbose = F) table(vtSeurat2[[]]$seurat_clusters) vtSeurat2 <- SetIdent(vtSeurat2, value = "Annotation") DimPlot(vtSeurat2, label = T) + NoLegend() # Check whether the SCTransform clustering and default clustering are identical summary(vtSeurat2$SCT_snn_res.0.8 == vtSeurat2$seurat_clusters) FeaturePlot(vtSeurat2,"SPP1") & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral"))) FeaturePlot(vtSeurat2,"CSH1") & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral"))) # Designate the markers all.markers <- FindAllMarkers(vtSeurat2, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) # We can check the dataframe briefly head(top3_markers) ``` Clustering based on Cell Type annotation ```{r} DimPlot(vtSeurat2, label = T , repel = T, label.size = 3) + NoLegend() DimPlot(vtSeurat2, reduction = "pca") DimPlot(vtSeurat2, reduction = "tsne") DimPlot(vtSeurat2, reduction = "umap") ``` ```{r eval = FALSE} # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(vtSeurat2, "vtSeurat2_annotated_SCT.rds") ``` # Still Work in Progress ```{r eval = FALSE} # Pseudotime # Convert to monocle3 cell_data_set vtSeurat2.cds <- as.cell_data_set(vtSeurat2) class(vtSeurat2.cds) # Re-cluster (required for trajectories in monocle3) vtSeurat2.cds <- cluster_cells(vtSeurat2.cds) # Learn the trajectory vtSeurat2.cds <- learn_graph(vtSeurat2.cds) # Plot the trajectory plot_cells(vtSeurat2.cds, color_cells_by = "seurat_clusters", show_trajectory_graph = TRUE) # Find the pseudotime order DimPlot(vtSeurat2, group.by = "Phase") vtSeurat2.cds <- order_cells(vtSeurat2.cds) # Plot pseudotime plot_cells(vtSeurat2.cds, color_cells_by = "pseudotime") # Add pseudotime score back into the dataset vtSeurat2$pseudotime_score <- vtSeurat2.cds@principal_graph_aux@listData$UMAP$pseudotime vtSeurat2$pseudotime_score[vtSeurat2$pseudotime_score == Inf] <- NA # FeaturePlot with Pseudotime FeaturePlot(vtSeurat2, features = c("pseudotime_score")) DimPlot(vtSeurat2, group.by = "Annotation") FeaturePlot(vtSeurat2, features = c("pseudotime_score")) ```