--- title: "Bulk_RNAseq_Analysis_(2023-06-20)" output: html_document --- First, we load our libraries ```{r} # install libraries #install.packages("ggfortify") #install.packages("dequer") #BiocManager::install("genefilter") # install.packages("gsubfn") # Load libraries library(dplyr) library(magrittr) library(readr) library(edgeR) library(ggplot2) library(tidyverse) library(ggfortify) library(genefilter) library(cowplot) library(DT) library(limma) library(pheatmap) library(dequer) library(gsubfn) library(conflicted) # BiocManager::install("org.Mm.eg.db") library(org.Mm.eg.db) conflict_prefer("filter", "dplyr") conflict_prefer("select", "dplyr") conflict_prefer("rename", "dplyr") conflict_prefer("intersect", "dplyr") conflict_prefer("desc", "dplyr") ``` Let's first load our count matrix. ```{r} counts <- readRDS("count_matrix_20230925.rds") # Create a copy of the count matrix backup <- readRDS("count_matrix_20230925.rds") # Check dimensions of the matrix and use head() dim(counts) colnames(counts) # Taking a glance of the matrix counts[1:10, 1:6] ``` As a first step, we should remove the first four rows and make the gene column as the row names. ```{r} counts <- data.frame(counts) rownames(counts) <- counts$gene counts %<>% dplyr::select(-gene) counts <- counts[5:nrow(counts),] # We will check if there are any duplicated ENSEMBL IDs dups1 <- duplicated(rownames(counts)) summary(dups1) rm(dups1) # Since we don't have any duplicated ENSG identifiers, we move on to remove the low or 0 count genes. First, we normalize using counts per million (cpm) cpm <- edgeR::cpm(counts) # Wrangling the backup object to check for the number of total reads per sample backup <- data.frame(backup) class(backup) # Check for the library size rownames(backup) <- backup$gene backup %<>% dplyr::select(-gene) backup <- backup[5:nrow(backup),] columnsums <- backup %>% colSums() # This value calculates for the mean library size across all samples mean(columnsums) cpm_backup <- edgeR::cpm(backup) # Let's not discard any genes at this step! dim(cpm_backup) ``` For our main analysis, we will preserve all 56941 genes for further data exploration. It is important to keep as many genes as possible. For the backup, we will select out the genes that showed enough counts, and later see how it makes a difference in our findings. Before we move on, we want to know the average alignment rate across the 72 samples. Instead of recording all 72 values manually, let's try writing a piece of code. ```{r} # As a first step, we need to get a list of directories that contain the outputLog.final file. file_list <- list.files(path = "/data/circadian_rhythms/STAR", full.names = TRUE) # Next, use a loop to read in each file, extract the "alignment rate" value, and store it in a table: table <- tibble() # initialize an empty vector to store the rates for (i in seq_along(file_list)) { # set the working directory setwd(file_list[i]) # Read in the file inside_STAR <- list.files(path = file_list[i], full.names = TRUE) file_content <- readLines(inside_STAR[grepl(pattern = "Log.final.out", inside_STAR)]) # Extract the alignment rate value rate_string <- grep("Uniquely mapped reads %", file_content, value = TRUE) # find the line containing the rate rate_value <- as.numeric(gsub("[^0-9.]+", "", rate_string)) # extract the numeric value from the line # Store the rate value in the vector table %<>% bind_rows(tibble(file_list[i], rate_value)) } mean_rate <- mean(table$rate_value) mean_rate ``` Overall, we have an average alignment rate (Uniquely mapped reads) of 91.25%. ### Check the success of the low count filter Make before and after graphs of the raw data and the filtered data to ensure your filter worked. ```{r hist_cpm_dirty} # We previously defined the object cpm, but for clarity, we define again: cpm_dirty1 <- cpm # We check with a histogram ``` Since the number of genes with very low expression (almost zero) vastly outweighs the number of genes being expressed, taking the log value is helpful. ```{r hist_lcpm_dirty} # make a lcpm object with the lcpm for the unfiltered data lcpm_dirty1 <- edgeR::cpm(counts, log = TRUE, prior.count = 2) # the argument prior.count = 2 is to prevent the log value being zero. hist(lcpm_dirty1, n = 100) ``` ```{r fig.width = 15, fig.height = 10} # Now we compare raw and filtered data. # make the dirty lcpm plot for unfiltered data p1 <- lcpm_dirty1 %>% data.frame() %>% pivot_longer(cols = everything(), names_to = "samples", values_to = "exprs") %>% ggplot(aes(x = exprs, colour = samples)) + geom_density() + labs(title = "Unfiltered lcpm") + theme(legend.position = "bottom", legend.key.size = unit(0.5, 'cm')) plot(p1) ``` Let's modify the backup file at this point. ```{r} # Let's check how many genes have at least 1 cpm in at least 1 of the 72 samples. summary(rowSums(cpm_backup >= 1) >= 1) keep_exprs <- (rowSums(cpm_backup >= 1) >= 1) # Now filter based on keep_exprs. Note that the argument keep.lib.sizes = FALSE allows access to the sample data. backup_filtered <- backup[keep_exprs,] # double check how many rows exist in x_filtered nrow(counts_filtered) ``` ### Normalization The next step would be normalizing the factors. ```{r fig.width = 13, fig.height = 9} # We can normalize the data and calculate the normalization factors using the function calcNormFactors(). # Let's make it consistent with the original script, by making the matrix into a DGElist object colnames(counts) rownames(counts) %>% head() x <- DGEList(counts = counts, samples = colnames(counts), genes = rownames(counts)) x_normalized <- calcNormFactors(x, method = "TMM") # We verify whether the normalization occurred correctly, by comparing the boxplots of the data before and after normalization. # DATA not normalized (use normalized.lib.sizes = F) p3 <- edgeR::cpm(x_normalized, log = TRUE, normalized.lib.sizes = F) %>% data.frame() %>% pivot_longer(cols = everything(), names_to = "samples", values_to = "exprs") %>% ggplot(aes(x = samples, y = exprs, fill = samples)) + geom_boxplot() + labs(title = "Non-normalized") + # modify the legend size theme(legend.position = "bottom", legend.key.size = unit(0.5, 'cm'), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # DATA normalized (use normalized.lib.sizes = T) p4 <- edgeR::cpm(x_normalized, log = T, normalized.lib.sizes = T) %>% data.frame() %>% pivot_longer(cols = everything(), names_to = "samples", values_to = "exprs") %>% ggplot(aes(x = samples, y = exprs, fill = samples)) + geom_boxplot() + labs(title = "Normalized") + # modify the legend size theme(legend.position = "bottom", legend.key.size = unit(0.5, 'cm'), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Make a side by side plot for comparison plot_grid(p3, p4) # Let's check the calculated normalization factors head(x_normalized$samples$norm.factors) ``` In order to make comparisons between the samples possible, we need to normalize the data. As we compare plots before and after normalization, we now observe that the median values of the samples are now nearly identical, supporting that our normalization was successful. Here's something new that we'll try. We will get the information of all the Y chromosome genes and get an average expression level. This is because we observe a discrepancy in the sex annotations, which causes an issue if we want to focus on XCI. ```{r} #BiocManager::install("GenomicFeatures") #BiocManager::install("BSgenome.Hsapiens.UCSC.hg38") library(GenomicFeatures) library(BSgenome.Hsapiens.UCSC.hg38) gtf_file <- "/home/youkim/Genome_indexing/Mus_musculus.GRCm39.110.gtf" txdb <- makeTxDbFromGFF(gtf_file, format = "gtf") chromosome_name <- "Y" #Extract the Y chromosome Ensembl IDs genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y")) ensembl_gene_ids <- genes_on_Y@ranges@NAMES #Check for all Y chromosome genes Y_chr_expression <- x_normalized$counts[ensembl_gene_ids,] %>% colSums() %>% data.frame(Y_exp = .) #check if the order of the rows are identical rownames(x_normalized$samples) == rownames(Y_chr_expression) #add the data on to the metadata x_normalized$samples <- x_normalized$samples %>% mutate(Y_chr_expression) # Add biological sex information in metadata x_normalized$samples <- x_normalized$samples %>% mutate(sex = ifelse(x_normalized$samples$Y_exp > 1000, "male", "female")) pdf(file="20230925_Y_chr_analysis.pdf", width = 10, height = 7) x_normalized$samples %>% ggplot(aes(x = samples, y = Y_exp)) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) x_normalized$samples %>% ggplot(aes(x= sex, y= Y_exp)) + geom_boxplot() + theme_bw() dev.off() # Let's save the x_normalized object as an RDS object first, and we could load it later. # saveRDS(x_normalized, "Normalized_all_genes.rds") ``` Based on the bar plot, we notice that odd samples have high expression of Y chromosome genes. In other words, it would be safe to say that odd samples are male and even samples are female. As a first step, let's perform DE based on biological sex. ```{r} x_normalized <- readRDS("Normalized_all_genes.rds") # Let's also add columns addressing their circadian status (time of the day) # Load the .csv file including the timepoint data tpdata <- read_csv("Sample_Info(Sept.2023).csv", col_names = F) # Modify the second column extracted_part <- str_extract(tpdata$X2, "(?<=_).*?(?=_)") %>% rev() # Attach the timepoint data to our bulk RNA-seq metadata. x_normalized$samples <- x_normalized$samples %>% mutate(time_point = extracted_part) # We want to modify our colnames of the count matrix, so that it can be compatible for NiteCap sample_numbers <- str_extract(x_normalized$samples$samples, "(?<=_)[^_]+(?=_)") x_normalized$samples <- x_normalized$samples %>% mutate(sample_numbers = sample_numbers) new_colnames <- paste0(x_normalized$samples$sex, "_", x_normalized$samples$time_point, "_", x_normalized$samples$sample_numbers) colnames(x_normalized$counts) <- new_colnames rownames(x_normalized$samples) <- new_colnames # Check new column names colnames(x_normalized$counts) # Let's create new .csv containing the count matrix in a reverse order for future reference! copy_count <- x_normalized$counts[, ncol(x_normalized$counts):1] # write.csv(copy_count, "Labelled_counts.csv") male_samples <- x_normalized$samples[x_normalized$samples$sex == "male",] female_samples <- x_normalized$samples[x_normalized$samples$sex == "female",] male_dge <- x_normalized[, (colnames(x_normalized) %in% rownames(male_samples))] female_dge <- x_normalized[, (colnames(x_normalized) %in% rownames(female_samples))] male_count <- male_dge$counts[, ncol(male_dge$counts):1] # write.csv(male_count, "male_counts.csv") female_count <- female_dge$counts[, ncol(female_dge$counts):1] # write.csv(female_count, "female_counts.csv") ``` Before running NiteCap, I thought it would be much easier if we add the gene IDs. ```{r} ensembl_ids <- rownames(x_normalized$counts) gene_symbols <- mapIds(org.Mm.eg.db, keys = ensembl_ids, column = "SYMBOL", keytype = "ENSEMBL") head(gene_symbols) gene_symbols %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(gene = value) gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) gene_symbols %<>% dplyr::filter(!is.na(gene)) gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene)) #check for 1-many or many-1 gene_symbols %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # write.csv(gene_symbols, "mm_gene_symbols.csv") #check for 1-many or many-1 for gene_symbols_NA gene_symbols_NA %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1) # Now we need to combine the two data frames male_count <- male_count %>% data.frame() %>% rownames_to_column() %>% rename(ENSEMBL = rowname) %>% inner_join(gene_symbols) female_count <- female_count %>% data.frame() %>% rownames_to_column() %>% rename(ENSEMBL = rowname) %>% inner_join(gene_symbols) # Note that we are focusing on annotated genes! (gene_ids with NAs were removed) # write.csv(male_count, "male_counts_20231009.csv") # write.csv(female_count, "female_counts_20231009.csv") male_count <- read.csv("male_counts_20231009.csv") tibble_colnames <- tibble(name = colnames(male_count)) %>% filter(grepl("male_", name)) %>% mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% arrange(timepoint) male_count_modified <- male_count[,c("gene", tibble_colnames$name)] # write.csv(male_count_modified, "male_counts_20231010.csv") female_count <- read.csv("female_counts_20231009.csv") tibble_colnames <- tibble(name = colnames(female_count)) %>% filter(grepl("female_", name)) %>% mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% arrange(timepoint) female_count_modified <- female_count[,c("gene", tibble_colnames$name)] # write.csv(female_count_modified, "female_counts_20231010.csv") male_count_NA <- male_count %>% data.frame() %>% rownames_to_column() %>% rename(ENSEMBL = rowname) %>% inner_join(gene_symbols_NA) female_count_NA <- female_count %>% data.frame() %>% rownames_to_column() %>% rename(ENSEMBL = rowname) %>% inner_join(gene_symbols_NA) # write.csv(male_count_NA, "male_counts_NA_20231010.csv") # write.csv(female_count_NA, "female_counts_NA_20231010.csv") ``` I think we can create a new code chunk at this point, to form data frames Alex requests ```{r} # Added Oct.13th male_count_modified <- read.csv("male_counts_20231010.csv") female_count_modified <- read.csv("female_counts_20231010.csv") # Now, we want to remove the genes that had zero counts for all samples. male_gene_counts <- rowSums(male_count_modified[,3:ncol(male_count_modified)]) female_gene_counts <- rowSums(female_count_modified[,3:ncol(female_count_modified)]) # Check the minimum gene count for males and females min(male_gene_counts) min(female_gene_counts) nonzero_gene_counts_M <- which(male_gene_counts != 0) nonzero_gene_counts_F <- which(female_gene_counts != 0) filtered_male_count <- male_count_modified[nonzero_gene_counts_M,] filtered_female_count <- female_count_modified[nonzero_gene_counts_F,] # Check whether the minimum gene count has changed filtered_gene_counts_M <- rowSums(filtered_male_count[,3:ncol(filtered_male_count)]) filtered_gene_counts_F <- rowSums(filtered_female_count[,3:ncol(filtered_female_count)]) min(filtered_gene_counts_M) min(filtered_gene_counts_F) summary(female_gene_counts == 0) # write.csv(filtered_male_count, "filtered_male_counts_20231013.csv") # write.csv(filtered_female_count, "filtered_female_counts_20231013.csv") ``` ```{r eval = F} # (IPR) I think we could also try generating a linear model too later (time of day vs biological sex) summary(lm(time_point ~ sex, data = x_normalized$samples)) ``` Let's start with making the model matrix for the samples ```{r} designSEQ <- model.matrix(~ 0 + x_normalized$samples$sex) # Change the column names of the design matrix colnames(designSEQ) <- c("female","male") # check the design matrix head(designSEQ) dim(designSEQ) ``` Now with this design matrix, we will fit the data into a linear model. For this purpose, we will need the contrast matrix, which is generated in the next problem. Note that the design matrix has 72 rows, which are for each different sample. ```{r contrasts} # We make the contrast matrix contr.SEQ <- makeContrasts( femaleVsMale = female - male, levels = colnames(designSEQ)) # Explore the contrast matrix contr.SEQ ``` Apply the voom transformation. This step adjusts the mean variance trend. Recall that RNA seq data needs to have the heteroscedasticity removed. ```{r voom} # Create the voom object. Set plot = T, to observe the plot. voomObj <- voom(x_normalized, designSEQ, plot = T) ``` ### Create graphs to observe the effect of heteroscedasticity removal ```{r heteroscedasticity} # With our voom-transformed object, we perform further limma pipeline steps. # create linear models of designSEQ vfit <- lmFit(voomObj, design = designSEQ) # fit the contrast to the model with contrasts.fit() vfit <- contrasts.fit(vfit, contrast = contr.SEQ) # apply a Bayesian correction to the model, with threshold 0.01 efit <- eBayes(vfit, 0.01) # Compare before and after removing the heteroscedasticity effects par(mfrow = c(1,2)) voomObj <- voom(x_normalized, designSEQ, plot = T) plotSA(efit, main = "Heteroscedasticity removed") ``` ### Use PCA to inspect the sample distribution Create a figure/s and discuss if the samples separate by group and in which PC axis. Do you think there could be a batch effect present? Any other observations of note. ```{r PCA} # We notice that the voomObj$E contains the count data, which will be compatible for creating a pca object. pca1 <- voomObj$E %>% t() %>% prcomp() View(x_normalized$samples) x_normalized$samples %<>% mutate(EGFR_labels_short = dplyr::case_match(Switched_EGFR_Labels, "EGFR_positive" ~ "EGFR+", "EGFR_negative" ~ "EGFR-", "Undigested_tissue" ~ "UT")) # Using the pca1 object, plot the PCA data categorized by the three groups # Save the plot as an object p1 <- autoplot(pca1, data = x_normalized$samples) + labs(title = "PCA Plot by Biological Sex") + geom_point(aes(colour = x_normalized$samples$sex), size=2.5) + labs(colour = "") + theme_bw() + theme(legend.position="bottom", plot.margin = margin(2,2,2,2, "cm")) pdf(file="PCA_EGFRexp_20230623.pdf", width = 6, height = 6) plot(p1) dev.off() ``` Since we observe that the lobules are not well clustered in the PCA plot using PC1 and PC2, we will try using PC3. ```{r} pca_values <- pca1$x pca_rotations <- pca1$rotation # Save the PCA plot as an object pc1 <- ggplot(data.frame(pca_values)) + aes(x = PC1, y = PC2, colour = x_normalized$samples$lobules) + geom_point() + labs(title = "PCA Plot by Lobule (PC1 and PC2)") + labs(colour = "lobules") pc2 <- ggplot(data.frame(pca_values)) + aes(x = PC1, y = PC3, colour = x_normalized$samples$lobules) + geom_point() + labs(title = "PCA Plot by Lobule (PC1 and PC3)") + labs(colour = "lobules") pc3 <- ggplot(data.frame(pca_values)) + aes(x = PC2, y = PC3, colour = x_normalized$samples$lobules) + geom_point() + labs(title = "PCA Plot by Lobule (PC2 and PC3)") + labs(colour = "lobules") png(filename="PCA_lobules_PC1_PC2.png") plot(pc1) dev.off() png(filename="PCA_lobules_PC1_PC3.png") plot(pc2) dev.off() png(filename="PCA_lobules_PC2_PC3.png") plot(pc3) dev.off() ``` Below, we use the function plotMDS(), which is similar to PCA, and compare it with the PCA plots we generated above. ```{r} # Since this is RNA-seq data, we could also use the function plotMDS() # We first factor the EGFR_labels column in the samples, and change them to more simple entries (e.g., P, N, NonSep), to prevent labels more visible x_normalized$samples$EGFR_labels <- x_normalized$samples$EGFR_labels %>% factor() factor(x_normalized$samples$EGFR_labels) levels(x_normalized$samples$EGFR_labels) <- c("N", "NonSep", "P") # We do the similar thing for placenta x_normalized$samples$placenta <- x_normalized$samples$placenta %>% factor() factor(x_normalized$samples$placenta) # Get the mulidimensional scaling plot plotMDS(voomObj, labels = x_normalized$samples$placenta) plotMDS(voomObj, labels = x_normalized$samples$lobules) ``` ### Limma Apply the limma pipeline/workflow to calculate differential expressed genes and display a nicely formatted datatable. Consider which columns you should include or remove. ```{r limma} # First, we generate a table of top-ranked genes from linear modeling, using the topTable() function. We will set the number of top genes as 50. tT <- topTable(efit, adjust = "fdr", sort.by = "B", number = 50) # Examine the table object using head() head(tT) # It seems that the first column is redundant, so we can remove them. # Now we make the interactive data table. data_table <- tT[, c(2:7)] %>% # Let's round the numbers to two decimal places mutate_if(is.numeric, funs(round(., 2))) %>% datatable(options = list(scrollX = TRUE)) # Examine the data table object data_table ``` ### Heatmap using the top 100 genes ```{r fig.height = 7, fig.width = 6} # Since we want top genes, we need to set a specific var.cutoff value for the varFilter() function. To do this, we need to calculate the following: ratio <- 1 - 100 / nrow(voomObj$E) # put the ratio as the value for var.cutoff varG <- varFilter(voomObj$E, var.cutoff = ratio) # confirm that varG has 20 rows nrow(varG) # Set the annotation. We already saw in problem #8 that there are no batch effects, so we only set disease state in the annotation column. anno.col <- data.frame(row.names = colnames(varG), pheno = as.factor(x_normalized$samples$sex)) # Plot the heatmap and make 3 clusters to examine how well the clusters catch the disease states. pheatmap(varG, scale = "row", annotation_col = anno.col, labels_row = x_normalized$genes[c(row.names(varG)),], cutree_cols = 2) ```