--- title: "Arutyanyan et al. (2023)" author: "Young-June Kim" date: "2023-06-14" output: html_document --- ```{r} # Load libraries #install.packages('Seurat') #remotes::install_github("mojaveazure/seurat-disk") #BiocManager::install("zellkonverter") library(Seurat) library(SeuratDisk) library(zellkonverter) library(SingleCellExperiment) library(tidyverse) library(Seurat) library(RColorBrewer) BiocManager::install("scDblFinder") library(scDblFinder) library(magrittr) library(Matrix) library(GEOquery) library(pheatmap) ``` ```{r} # Let's load the h5ad file first # my_data <- readH5AD(file = "/home/youkim/adata_all_donors_all_cell_states_raw_counts_in_raw_normlog_counts_in_X_for_download_UPD_20230307.h5ad", X_name = "X") # Quick glance on the matrix # my_data@assays@data$X[1:5, 1:5] # Convert the SCE object into a Seurat object # This step keeps failing, using dactyl is recommended # my_data <- as.Seurat(my_data, counts = my_data@assays@data$X) setwd("~/") aSeurat <- readRDS("Seurat_obj.rds") # copy metadata meta_data <- aSeurat@meta.data # filter out placental cells placental_cells <- row.names(meta_data[meta_data$tissue == "placenta", ]) # Subset the Seurat object aSeurat_placenta <- subset(aSeurat, cells = placental_cells) # Remove original Seurat object and write new RDS file rm(aSeurat) rm(my_data) # check the datasets that the placental cells originate aSeurat_placenta@meta.data %>% group_by(dataset) %>% count() # perform filteration to include only the new dataset meta_data <- aSeurat_placenta@meta.data # filter out the new dataset cells newdata_cells <- row.names(meta_data[meta_data$dataset == "samples_MFI_new_sc_batch_2", ]) # Subset the Seurat object aSeurat_placenta_new <- subset(aSeurat_placenta, cells = newdata_cells) # Remove Seurat_placenta object and write new RDS file rm(aSeurat_placenta) rm(meta_data) #saveRDS(aSeurat_placenta_new, "aSeurat_placenta_new.rds") rm(aSeurat_placenta_new) ``` Now we have the Seurat object ready for analysis. ```{r} # Read in the .rds file aSeurat2 <- readRDS("aSeurat_placenta_new.rds") # Add in the Mitochondrial PCT% information aSeurat2$percent.mt <- PercentageFeatureSet(aSeurat2, pattern = "^MT-") # nCount_RNA is the number of UMI counts in a cell hist(aSeurat2$nCount_RNA) summary(aSeurat2@meta.data$nCount_RNA) # nFeature_RNA is the number of different genes that had any reads hist(aSeurat2$nFeature_RNA) summary(aSeurat2@meta.data$nFeature_RNA) # percent.mt is the percent mitochondrial reads summary(aSeurat2$percent.mt) # We see that the mitochondrial genes were removed. # Check for the percentage of ribosomal genes aSeurat2[["percent.rb"]] <- PercentageFeatureSet(aSeurat2, pattern = "^RP[SL]") hist(aSeurat2$percent.rb) summary(aSeurat2$percent.rb) dim(aSeurat2) ``` We now have a aSeurat2 object that has 33439 cells, which would be sufficient for downstream analysis. Another important part for QC is removing doublets. ```{r} # Find for doublets aSeurat2 <- as.SingleCellExperiment(aSeurat2) aSeurat2 <- scDblFinder(aSeurat2) # Convert back to Seurat object. aSeurat2 <- as.Seurat(aSeurat2) # Notice that our identifiers have changed. levels(Idents(object = aSeurat2)) # We want to specify the identifier whether the cell is a singlet or doublet. table(aSeurat2@meta.data$scDblFinder.class) Idents(object = aSeurat2) <- "scDblFinder.class" aSeurat2@meta.data$scDblFinder.class <- as.character(aSeurat2@meta.data$scDblFinder.class) # Subset the Seurat object to only include singlets. aSeurat2 <- subset(aSeurat2, idents = "singlet") levels(as.factor(aSeurat2@meta.data$scDblFinder.class)) # Check the dimensions of the dataset again. dim(aSeurat2) ``` Now, note that we have 31521 cells ```{r} colnames(aSeurat2@meta.data) # Make a violin plot of the QC columns plt <- VlnPlot(aSeurat2, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2, pt.size = 0.1) & theme(plot.title = element_text(size=10)) plt # We can say that most of the QC was already performed. ggsave(filename = "QC_(2023-06-14).png", plot = plt, width = 7, height = 3.5) ``` Looking at the violin plot, we do see some extreme values that should be filtered out. However, let's put the subsetting part for later and proceed to next steps and come back later. ```{r} # Make scatter plots to compare plot1 <- FeatureScatter(aSeurat2, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") plot1 # remove the cells with high mitochondrial gene percentage aSeurat2 <- subset(aSeurat2, subset = percent.mt < 5) dim(aSeurat2) ``` Next, we need to normalize the data. ```{r} # Next, we normalize the data aSeurat2 <- NormalizeData(aSeurat2) # Explore the most variable features, let's use the default version first. aSeurat2 <- FindVariableFeatures(aSeurat2) # aSeurat2 <- FindVariableFeatures(aSeurat2, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(aSeurat2), 10) top10 # plot variable features with and without labels plot2 <- VariableFeaturePlot(aSeurat2) plot2 <- LabelPoints(plot = plot2, points = top10, repel = TRUE) plot2 # Convert normalized gene expression to Z-score all.genes <- rownames(aSeurat2) aSeurat2 <- ScaleData(aSeurat2, features = all.genes) ``` Now, we are ready to run PCA. ```{r} # Run PCA analysis aSeurat2 <- RunPCA(aSeurat2, features = VariableFeatures(object = aSeurat2)) # Plot the PCA DimPlot(aSeurat2, reduction = "pca") # Plot the loadings VizDimLoadings(aSeurat2, dims = 1:2, reduction = "pca") & theme(axis.text=element_text(size=8), axis.title=element_text(size = 8,face="bold")) # Check the elbow plot to see how much variation is covered for each PC ElbowPlot(aSeurat2) # FeaturePlot of PC1 top features FeaturePlot(aSeurat2, features = c("KRT7", "KRT18")) # FeaturePlot of PC2 top features FeaturePlot(aSeurat2, features = c("HLA-G", "VIM")) ``` Next, we move on to derive clusters. ```{r} # Now perform clustering aSeurat2 <- FindNeighbors(aSeurat2, dims = 1:10) aSeurat2 <- FindClusters(aSeurat2, resolution = 0.5) # For visualization purposes, we need to generate UMAP reduced dimensionality representation aSeurat2 <- RunUMAP(aSeurat2, dims = 1:10, verbose = F) # Plot the UMAP with clustering DimPlot(aSeurat2, reduction = "umap") # Get the TSNE embedding aSeurat2 <- RunTSNE(aSeurat2, dims = 1:10, check_duplicates = FALSE) # Plot the t-SNE with clustering DimPlot(aSeurat2, reduction = "tsne") # Let's look at cluster sizes table(aSeurat2@meta.data$seurat_clusters) # Let's plot those clusters! DimPlot(aSeurat2,label.size = 4,repel = T,label = T) # Check expression patterns for some genes in top10 FeaturePlot(aSeurat2, features = c("KRT7", "EGFR")) ``` Let's calculate the cell cycle scores at this point. ```{r} aSeurat2 <- SetIdent(aSeurat2, value = "cell_type") Idents(aSeurat2) # Calculate the cell cycle scores s.genes <- cc.genes.updated.2019$s.genes g2m.genes <- cc.genes.updated.2019$g2m.genes aSeurat2 <- CellCycleScoring(aSeurat2, search = TRUE, s.features = s.genes, g2m.features = g2m.genes) table(aSeurat2[[]]$Phase) # Check if ribosomal genes show dependency on cluster FeaturePlot(aSeurat2,features = "percent.rb", label.size = 4, repel = T, label = F) & theme(plot.title = element_text(size=10)) # Check using the violin plot VlnPlot(aSeurat2, features = "percent.rb") & theme(plot.title = element_text(size=10), legend.position = 'none') # Check # of genes per cluster VlnPlot(aSeurat2,features = "nFeature_RNA") & theme(plot.title = element_text(size=10), legend.position = 'none') # Cell cycle score does not seem to depend on the cell type much FeaturePlot(aSeurat2, features = c("S.Score","G2M.Score"), label.size = 4, repel = T, label = T) & theme(plot.title = element_text(size = 10)) VlnPlot(aSeurat2, features = c("S.Score","G2M.Score")) & theme(plot.title = element_text(size = 10), legend.position = 'none') # Plot in UMAP DimPlot(aSeurat2, group.by = "Phase") # Plot with clusters compTable.int <- table(aSeurat2$coarse_annot, aSeurat2$Phase) compTable.int <- (compTable.int / rowSums(compTable.int)) * 100 WtOrRd_pal <- colorRampPalette(c("#FFFFFF", brewer.pal(9, "OrRd")))(100) pheatmap(compTable.int, color = WtOrRd_pal, breaks = 1:100) # FeaturePlot for showing mod score FeaturePlot(aSeurat2, features = c("S.Score", "G2M.Score")) ``` ```{r} DefaultAssay(aSeurat2) <- "RNA" # Perform log-normalization aSeurat2 <- NormalizeData(aSeurat2) # Find variable features aSeurat2 <- FindVariableFeatures(aSeurat2, selection.method = "vst", nfeatures = 2000) all.genes <- rownames(aSeurat2) aSeurat2 <- ScaleData(aSeurat2, features = all.genes) # Designate the markers all.markers <- FindAllMarkers(aSeurat2, only.pos = T, min.pct = 0.5, logfc.threshold = 0.5) dim(all.markers) table(all.markers$cluster) top3_markers <- as.data.frame(all.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)) head(top3_markers) dim(aSeurat2) p1 <- DimPlot(aSeurat2, label = T , repel = T, label.size = 3) + NoLegend() p2 <- DimPlot(aSeurat2, reduction = "pca") p3 <- DimPlot(aSeurat2, reduction = "tsne") p4 <- DimPlot(aSeurat2, reduction = "umap") p1 p2 p3 p4 ggsave(filename = "UMAP(2023-06-14).pdf", plot = p1, width = 9, height = 7) ggsave(filename = "tsne(2023-05-13).png", plot = p3, width = 7, height = 3.5) ggsave(filename = "umap_legend(2023-05-13).png", plot = p4, width = 6, height = 3.5) # ggsave(filename = "Check.png", plot = p1, width = 30, height = 10) saveRDS(aSeurat2, "aSeurat_analyzed.rds") ```