--- title: "Analysis (2023-01-19)" output: html_document --- ```{r} # Load libraries library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) library(scDblFinder) library(magrittr) library(GEOquery) # BiocManager::install("GEOquery") devtools::install_github('satijalab/seurat-data') library(SeuratData) ``` 1. Creating the Seurat Object ```{r eval = FALSE} zcounts <- ReadMtx( mtx = "GSM5293860_D1.matrix.mtx.gz", features = "GSM5293860_D1.features.tsv.gz", cells = "GSM5293860_D1.barcodes.tsv.gz" ) # check for duplicated gene names before setting row names gene_dups <- rownames(zcounts) %>% duplicated() # check the dimensions of the count matrix dim(zcounts) # filter out the duplicated genes zcounts <- zcounts[!gene_dups,] dim(zcounts) head(zcounts) # Create the Seurat object zSeurat <- CreateSeuratObject(counts = zcounts) # Let's save the Seurat object as an .rds file to facilitate analysis. saveRDS(zSeurat, "zSeurat.rds") ``` ```{r} # Examine the Seurat object metadata zSeurat@meta.data dim(zSeurat) rownames(zSeurat) ``` ```{r} setwd("~/Meta-analysis/Zheng et al. (2022)/D8") zcounts2 <- ReadMtx( mtx = "GSM5293861_D8.matrix.mtx.gz", features = "GSM5293861_D8.features.tsv.gz", cells = "GSM5293861_D8.barcodes.tsv.gz" ) # check for duplicated gene names before setting row names gene_dups <- rownames(zcounts2) %>% duplicated() # check the dimensions of the count matrix dim(zcounts2) # filter out the duplicated genes zcounts2 <- zcounts2[!gene_dups,] dim(zcounts2) head(zcounts2) # Create the Seurat object zSeurat2 <- CreateSeuratObject(counts = zcounts2) # Let's save the Seurat object as an .rds file to facilitate analysis. # saveRDS(zSeurat, "zSeurat.rds") ``` ```{r} setwd("~/Meta-analysis/Zheng et al. (2022)/D9") zcounts3 <- ReadMtx( mtx = "GSM5293862_D9.matrix.mtx.gz", features = "GSM5293862_D9.features.tsv.gz", cells = "GSM5293862_D9.barcodes.tsv.gz" ) # check for duplicated gene names before setting row names gene_dups <- rownames(zcounts3) %>% duplicated() # check the dimensions of the count matrix dim(zcounts3) # filter out the duplicated genes zcounts3 <- zcounts3[!gene_dups,] dim(zcounts3) head(zcounts3) # Create the Seurat object zSeurat3 <- CreateSeuratObject(counts = zcounts3) # Let's save the Seurat object as an .rds file to facilitate analysis. # saveRDS(zSeurat, "zSeurat.rds") ``` ```{r} setwd("~/Meta-analysis/Zheng et al. (2022)/D10") zcounts4 <- ReadMtx( mtx = "GSM5293863_D10.matrix.mtx.gz", features = "GSM5293863_D10.features.tsv.gz", cells = "GSM5293863_D10.barcodes.tsv.gz" ) # check for duplicated gene names before setting row names gene_dups <- rownames(zcounts4) %>% duplicated() # check the dimensions of the count matrix dim(zcounts4) # filter out the duplicated genes zcounts4 <- zcounts4[!gene_dups,] dim(zcounts4) head(zcounts4) # Create the Seurat object zSeurat4 <- CreateSeuratObject(counts = zcounts4) # Let's save the Seurat object as an .rds file to facilitate analysis. # saveRDS(zSeurat, "zSeurat.rds") ``` ```{r} setwd("~/Meta-analysis/Zheng et al. (2022)/D13") zcounts5 <- ReadMtx( mtx = "GSM5293864_D13.matrix.mtx.gz", features = "GSM5293864_D13.features.tsv.gz", cells = "GSM5293864_D13.barcodes.tsv.gz" ) # check for duplicated gene names before setting row names gene_dups <- rownames(zcounts5) %>% duplicated() # check the dimensions of the count matrix dim(zcounts5) # filter out the duplicated genes zcounts5 <- zcounts5[!gene_dups,] dim(zcounts5) head(zcounts5) # Create the Seurat object zSeurat5 <- CreateSeuratObject(counts = zcounts5) # Let's save the Seurat object as an .rds file to facilitate analysis. # saveRDS(zSeurat, "zSeurat.rds") ``` ```{r} dim(zSeurat) dim(zSeurat2) dim(zSeurat3) dim(zSeurat4) dim(zSeurat5) ``` ```{r} # Let's try merging the five Seurat objects merged_Seurat <- merge(zSeurat, y = c(zSeurat2, zSeurat3, zSeurat4, zSeurat5), add.cell.ids = c("1", "2", "3", "4", "5"), project = "Zheng et al. 2022") dim(merged_Seurat) dim(zSeurat) dim(zSeurat2) dim(zSeurat3) dim(zSeurat4) dim(zSeurat5) ``` The next line of code will be necessary for adding the cell type annotations to our data ```{r} # Email the authors for this data ``` Next, we proceed the standard pipeline of scRNA-seq analysis. ```{r} # Add in the Mitochondrial PCT% information merged_Seurat$percent.mt <- PercentageFeatureSet(merged_Seurat, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(merged_Seurat$nCount_RNA) summary(merged_Seurat@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(merged_Seurat$nFeature_RNA) summary(merged_Seurat@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(merged_Seurat$percent.mt) # Later, we will subset the Seurat object based on a percent.mt cutoff. # Check for the percentage of ribosomal genes merged_Seurat[["percent.rb"]] <- PercentageFeatureSet(merged_Seurat, pattern = "^RP[SL]") summary(merged_Seurat$percent.rb) ``` Another important part for QC is removing doublets. ```{r removing_doublets} # Find for doublets merged_Seurat <- as.SingleCellExperiment(merged_Seurat) merged_Seurat <- scDblFinder(merged_Seurat) # Convert back to Seurat object. merged_Seurat <- as.Seurat(merged_Seurat) # Notice that our identifiers have changed. levels(Idents(object = merged_Seurat)) # We want to specify the identifier whether the cell is a singlet or doublet. table(merged_Seurat@meta.data$scDblFinder.class) Idents(object = merged_Seurat) <- "scDblFinder.class" merged_Seurat@meta.data$scDblFinder.class <- as.character(merged_Seurat@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. merged_Seurat <- subset(merged_Seurat, idents = "singlet") levels(as.factor(merged_Seurat@meta.data$scDblFinder.class)) ``` ```{r} # We draw a violin plot to explore the values in meta data VlnPlot(merged_Seurat, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"), ncol = 4, pt.size = 0.1) & theme(plot.title = element_text(size=10)) ``` ```{r} # The following four lines of code are for scatterplots addressing the correlations of every 2 pairs FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.mt") FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") FeatureScatter(merged_Seurat, feature1 = "nCount_RNA", feature2 = "percent.rb") FeatureScatter(merged_Seurat, feature1 = "percent.rb", feature2 = "percent.mt") # We also check if there is correlation between number of features and the doublet score FeatureScatter(merged_Seurat, feature1 = "nFeature_RNA", feature2 = "scDblFinder.score") # Since the typical cutoff for percent.mt is 5%, we will try out < 5. # Last week we used 10%, and couldn't see much difference. # Also, let's filter out cells that have less than 5% ribosomal content. merged_Seurat <- subset(merged_Seurat, subset = percent.mt < 5 & percent.rb > 5) dim(merged_Seurat) ``` ```{r} # Next, we normalize the data merged_Seurat <- NormalizeData(merged_Seurat) # Explore the most variable features, let's use the default version first. merged_Seurat <- FindVariableFeatures(merged_Seurat) # merged_Seurat <- FindVariableFeatures(merged_Seurat, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(merged_Seurat), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(merged_Seurat) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(merged_Seurat) merged_Seurat <- ScaleData(merged_Seurat, features = all.genes) ``` ```{r} # Run PCA analysis hSeurat <- RunPCA(hSeurat, features = VariableFeatures(object = hSeurat)) # Plot the PCA DimPlot(hSeurat, reduction = "pca") # Plot the loadings VizDimLoadings(hSeurat, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(hSeurat) ```