--- title: "Han et al." author: "Young-June Kim" date: "2022-10-05" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) #BiocManager::install("scDblFinder") library(scDblFinder) library(magrittr) ``` 1. Creating the Seurat Object ```{r eval = FALSE} # Data Wrangling Processes Hcounts <- read.delim("GSM4008722_Placenta1_dge.txt") Hcounts[1:10, 1:10] # set the row names Hcounts %<>% column_to_rownames("GENE") head(Hcounts) # Create the Seurat object hSeurat <- CreateSeuratObject(counts = Hcounts) # Let's save the Seurat object as an .rds file to facilitate analysis. saveRDS(hSeurat, "hSeurat.rds") ``` For convenience, we can start running from this chunk of code, since we have already saved the Seurat object as a file in our directory. ```{r} hSeurat <- readRDS("hSeurat.rds") # Add in the Mitochondrial PCT% information hSeurat$percent.mt <- PercentageFeatureSet(hSeurat, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(hSeurat$nCount_RNA) summary(hSeurat@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(hSeurat$nFeature_RNA) summary(hSeurat@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(hSeurat$percent.mt) # We see that the mitochondrial genes were removed. # Check for the percentage of ribosomal genes hSeurat[["percent.rb"]] <- PercentageFeatureSet(hSeurat, pattern = "^RP[SL]") summary(hSeurat$percent.rb) ``` Another important part for QC is removing doublets. ```{r} # Find for doublets hSeurat <- as.SingleCellExperiment(hSeurat) hSeurat <- scDblFinder(hSeurat) # Convert back to Seurat object. hSeurat <- as.Seurat(hSeurat) # Notice that our identifiers have changed. levels(Idents(object = hSeurat)) # We want to specify the identifier whether the cell is a singlet or doublet. table(hSeurat@meta.data$scDblFinder.class) Idents(object = hSeurat) <- "scDblFinder.class" hSeurat@meta.data$scDblFinder.class <- as.character(hSeurat@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. hSeurat <- subset(hSeurat, idents = "singlet") levels(as.factor(hSeurat@meta.data$scDblFinder.class)) ``` ```{r} # convert column names to make the protocol coherent colnames(hSeurat@meta.data)[2] <- "nCount_RNA" colnames(hSeurat@meta.data)[3] <- "nFeature_RNA" # We draw a violin plot to explore the values in meta data VlnPlot(hSeurat, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"), ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10)) ``` ```{r} # The following four lines of code are for scatterplots addressing the correlations of every 2 pairs FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.mt") FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") FeatureScatter(hSeurat, feature1 = "nCount_RNA", feature2 = "percent.rb") FeatureScatter(hSeurat, feature1 = "percent.rb", feature2 = "percent.mt") # We also check if there is correlation between number of features and the doublet score FeatureScatter(hSeurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score") ``` ```{r eval = FALSE} # Looking at the scatterplots above, we can subset the data subset_hSeurat <- hSeurat max(hSeurat$nFeature_RNA) subset_hSeurat <- subset(subset_hSeurat, subset = nFeature_RNA > 500 & nFeature_RNA < 2000 & percent.mt < 5) dim(hSeurat) ``` ```{r} # Next, we normalize the data hSeurat <- NormalizeData(hSeurat) # Explore the most variable features, let's use the default version first. hSeurat <- FindVariableFeatures(hSeurat) # hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(hSeurat), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(hSeurat) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(hSeurat) hSeurat <- ScaleData(hSeurat, features = all.genes) ``` ```{r} # Run PCA analysis hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat)) # Plot the PCA DimPlot(hSeurat, reduction = "pca") # Plot the loadings VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(hSeurat) # FeaturePlot of PC1 top features FeaturePlot(hSeurat, features = c("COL3A1", "COL1A1")) # FeaturePlot of PC2 top features FeaturePlot(hSeurat, features = c("FTL", "TMSB4X")) ``` ```{r} # Now perform clustering hSeurat <- FindNeighbors(hSeurat, dims = 1:10) hSeurat <- FindClusters(hSeurat, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation hSeurat <- RunUMAP(hSeurat, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(hSeurat, reduction = "umap") # Get the TSNE embedding hSeurat <- RunTSNE(hSeurat, dims = 1:10) # Plot the t-SNE with clustering DimPlot(hSeurat, reduction = "tsne") # Let's look at cluster sizes table(hSeurat@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(hSeurat,label.size = 4,repel = T,label = T) FeaturePlot(hSeurat, features = c("FN1", "SERPINE2", "IGF2", "RGS5")) ``` ```{r} DefaultAssay(hSeurat) <- "RNA" # Perform log-normalization hSeurat <- NormalizeData(hSeurat) # Find variable features hSeurat <- FindVariableFeatures(hSeurat, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(hSeurat) hSeurat <- ScaleData(hSeurat, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(hSeurat, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) head(top3_markers) ``` ```{r} #install libraries # BiocManager::install("celldex") # BiocManager::install("SingleR") # Load the library library(SingleR) hpca.ref <- celldex::HumanPrimaryCellAtlasData() hSCE <- as.SingleCellExperiment(DietSeurat(hSeurat)) hSCE hpca.main <- SingleR(test = hSCE, assay.type.test = 1, ref = hpca.ref, labels = hpca.ref$label.main) hpca.fine <- SingleR(test = hSCE, assay.type.test = 1, ref = hpca.ref, labels = hpca.ref$label.fine) # table(hpca.main$pruned.labels) # table(hpca.fine$pruned.labels) hSeurat@meta.data$hpca.main <- hpca.main$pruned.labels hSeurat@meta.data$hpca.fine <- hpca.fine$pruned.labels hSeurat <- SetIdent(hSeurat, value = "hpca.main") DimPlot(hSeurat, label = T , repel = T, label.size = 3) + NoLegend() DimPlot(hSeurat, reduction = "pca") hSeurat saveRDS(hSeurat, "hSeurat_annotated.rds") ``` # Still Work in Progress ```{r eval = FALSE} # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes placenta <- CellCycleScoring(placenta, s.features = s.genes, g2m.features = g2m.genes) table(placenta[[]]$Phase) FeaturePlot(placenta,features = "percent.mt",label.size = 4,repel = T,label = F) & theme(plot.title = element_text(size=10)) VlnPlot(placenta,features = "percent.mt") & theme(plot.title = element_text(size=10)) FeaturePlot(placenta,features = "percent.rb",label.size = 4,repel = T,label = F) & theme(plot.title = element_text(size=10)) VlnPlot(placenta,features = "percent.rb") & theme(plot.title = element_text(size=10)) VlnPlot(placenta,features = c("nCount_originalexp","nFeature_originalexp")) & theme(plot.title = element_text(size=10)) # Cell cycle score does not seem to depend on the cell type much FeaturePlot(placenta,features = c("S.Score","G2M.Score"),label.size = 4,repel = T,label = T) & theme(plot.title = element_text(size=10)) VlnPlot(placenta,features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size=10)) ``` 8.3 SCTransform normalization and clustering ```{r eval = FALSE} # Proceed to SCTransform placenta <- SCTransform(placenta, assay = "originalexp", method = "glmGamPoi", ncells = 8824, vars.to.regress = c("percent.mt","S.Score","G2M.Score"), verbose = F) # View the object placenta GetAssay(placenta) # Standard PCA, UMAP, and clustering placenta <- RunPCA(placenta, verbose = F) placenta <- RunUMAP(placenta, dims = 1:30, verbose = F) placenta <- FindNeighbors(placenta, dims = 1:30, verbose = F) placenta <- FindClusters(placenta, verbose = F) table(placenta[[]]$seurat_clusters) DimPlot(placenta, label = T) FeaturePlot(placenta,"SPP1") & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral"))) FeaturePlot(placenta,"CSH1") & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral"))) ``` For later uses, examine how scanpy works ```{r eval = FALSE} placenta %>% SCTransform(return.only.var.genes = FALSE) %>% RunPCA(features = VariableFeatures(object = .)) %>% FindNeighbors(dims = 1:40) %>% FindClusters(resolution = c(0.5, 0.6, 0.8, 1, 1.2, 1.5, 1.8,2,2.5,3)) %>% RunUMAP(dims = 1:40) %>% RunTSNE(dims = 1:40) # Using scanpy install.packages("renv") ```