--- title: "Expression-Focused_Analysis_20231205" output: html_document --- ```{r Load libraries} library(DESeq2) library(apeglm) library(EnhancedVolcano) library(tidyverse) library(dplyr) library(org.Hs.eg.db) library(GenomicFeatures) ``` We first load the combined bulk RNA-seq generated so far. ```{r Data wrangling} accumul_dataset <- readRDS("accumulated_counts_2023.rds") # It is suggested to use a DGEList object to deal with bulk RNA-seq datasets. # But since I need to perform normalization first, I will use DESeq2. # Let's add the gene symbol at this point, and filter out those that don't have gene symbols from the counts matrix ensembl_ids <- rownames(accumul_dataset) # This function checks for the gene symbols that match the particular ENSEMBL ID. gene_symbols <- mapIds(org.Hs.eg.db, keys = ensembl_ids, column = "SYMBOL", keytype = "ENSEMBL") # how many gene symbols? length(gene_symbols) gene_symbols %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(gene = value) gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter out the genes where it doesn't have a gene symbol gene_symbols %<>% dplyr::filter(!is.na(gene)) # Save the genes that are NAs (invalid since we already filtered them out) # gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene)) #check for duplicate ENSEMBL IDs gene_symbols %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) #check for duplicate gene symbols gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter the counts matrix (these have a registered gene symbol) accumul_dataset <- accumul_dataset[gene_symbols$ENSEMBL,] # Check the dimensions of the filtered dataset dim(accumul_dataset) # Check order for gene symbols summary(gene_symbols$ENSEMBL == rownames(accumul_dataset)) # We'll select out the first gene ENSEMBL ID among the duplicates. accumul_dataset <- accumul_dataset[!duplicated(gene_symbols$gene),] # Let's also filter out genes that have zero expression throughout all samples. accumul_dataset <- accumul_dataset[(rowSums(accumul_dataset) > 0),] # We check the dimensions of this dataset. dim(accumul_dataset) ``` ```{r} # Load the human gtf file, containing information on each transcript gtf_file <- "/home/youkim/Genome_indexing/Homo_sapiens.GRCh38.109.gtf" txdb <- makeTxDbFromGFF(gtf_file, format = "gtf") # Select out the Y chromosome chromosome_name <- "Y" #Extract the Y chromosome Ensembl IDs genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y")) ensembl_gene_ids <- genes_on_Y@ranges@NAMES #Check for all Y chromosome genes Y_chr_expression <- accumul_dataset[(rownames(accumul_dataset) %in% ensembl_gene_ids),] %>% colSums() %>% data.frame(Y_exp = .) 
# We also set up the colData for DESeq2
coldata <- data.frame(sample = colnames(accumul_dataset), Y_chr_expression)

# Check that the rows are in the same order
rownames(coldata) == rownames(Y_chr_expression)

# Add biological sex to the metadata, inferred from total Y-chromosome expression
coldata <- coldata %>% mutate(sex = ifelse(Y_exp > 500, "male", "female"))

coldata <- coldata %>%
  mutate(sample = colnames(accumul_dataset)) %>%
  mutate(lobule = substr(sample, start = 2, stop = 9)) %>%
  mutate(labels = substr(sample, 10, 12)) %>%
  dplyr::select(-sample)

coldata[coldata$labels == '', ]$labels <- 'EGFR_negative'
coldata[coldata$labels == 'pos', ]$labels <- 'EGFR_positive'
coldata[coldata$labels == 'ns', ]$labels <- "Non-separated"
coldata[coldata$labels == 'L', ]$labels <- "After_digestion"
```

I tried running DESeq2, but the grouping variable (lobule) coincides with the groups we want to compare, so the model cannot be fit. We will use edgeR instead.

```{r eval = F}
# `dds` refers to the DESeqDataSet from the earlier DESeq2 attempt (not re-created here)

# Keep only the EGFR+ and EGFR- samples
dds_filtered <- dds[, dds$labels != "Non-separated"]
dds_filtered <- dds_filtered[, dds_filtered$labels != "After_digestion"]

# Verify that the colnames of the counts equal the rownames of the colData
summary(rownames(dds_filtered@colData) == colnames(dds_filtered@assays@data$counts))

# Extract only the EGFR+ samples
pos <- dds_filtered[, dds_filtered$labels == "EGFR_positive"]

# We also specify the minimum number of samples to be 36, which is the number of M/F samples
# smallestGroupSize <- 3
# (rowSums(counts(dds) >= 10) >= smallestGroupSize) %>% summary()

# To reduce additional sources of variation, and since we want to know which sets of genes
# are expressed in the different lobules, we generate two versions of the normalized dds object
dds_filtered_total <- DESeq(dds_filtered)
dds_filtered_pos <- DESeq(pos)
```

```{r}
x <- DGEList(counts = accumul_dataset,
             samples = coldata,
             genes = rownames(accumul_dataset))

x_normalized <- calcNormFactors(x, method = "TMM")

x_normalized_filtered <- x_normalized[, x_normalized$samples$labels != "Non-separated"]
x_normalized_filtered <- x_normalized_filtered[, x_normalized_filtered$samples$labels != "After_digestion"]

# Check the dimensions of the filtered count matrix
dim(x_normalized_filtered)

x_normalized_filtered$samples <- x_normalized_filtered$samples %>%
  mutate(placenta = substr(lobule, 1, 5))

# Split out the EGFR+ and EGFR- portions
pos <- x_normalized_filtered[, x_normalized_filtered$samples$labels == "EGFR_positive"]
neg <- x_normalized_filtered[, x_normalized_filtered$samples$labels == "EGFR_negative"]

# Note that edgeR does not actually replace the count matrix with "normalized" counts;
# the TMM factors are only applied when computing CPM, so we work with (log) counts per million.
cpm_pos <- edgeR::cpm(pos, log = TRUE, normalized.lib.sizes = TRUE)
cpm_neg <- edgeR::cpm(neg, log = TRUE, normalized.lib.sizes = TRUE)

# We check the dimensions again and remove genes with zero counts (sketched just below).
# Our main interest is the EGFR+ fraction.
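# A minimal sketch of the filtering described above (an assumption about the intended
# rule: drop genes with zero counts across all samples of the respective fraction;
# edgeR::filterByExpr() would be a stricter alternative).
keep_pos <- rowSums(pos$counts) > 0
keep_neg <- rowSums(neg$counts) > 0
cpm_pos <- cpm_pos[keep_pos, , drop = FALSE]
cpm_neg <- cpm_neg[keep_neg, , drop = FALSE]
dim(cpm_pos)
dim(cpm_neg)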
placentas <- unique(pos$samples$placenta)
placentas_neg <- unique(neg$samples$placenta)

new_tb <- data.frame()
for(i in placentas){
  mean_expression <- rowMeans(cpm_pos[, pos$samples$placenta == i])
  std_dev <- apply(cpm_pos[, pos$samples$placenta == i], 1, sd)
  # Coefficient of variation (in %) for each gene within this placenta
  cv <- (std_dev / mean_expression) * 100
  new_row <- data.frame(COV = cv) %>% t() %>% data.frame()
  new_tb <- bind_rows(new_tb, new_row)
  new_tb <- new_tb %>%
    t() %>%
    data.frame() %>%
    dplyr::rename(!!paste("COV_", i, sep = "") := "COV") %>%
    t() %>%
    data.frame()
}
final_tb <- new_tb %>% t() %>% data.frame()

cor(final_tb, method = "spearman")

new_tb_2 <- data.frame()
for(i in placentas_neg){
  mean_expression <- rowMeans(cpm_neg[, neg$samples$placenta == i])
  std_dev <- apply(cpm_neg[, neg$samples$placenta == i], 1, sd)
  # Coefficient of variation (in %) for each gene within this placenta
  cv <- (std_dev / mean_expression) * 100
  new_row <- data.frame(COV = cv) %>% t() %>% data.frame()
  new_tb_2 <- bind_rows(new_tb_2, new_row)
  new_tb_2 <- new_tb_2 %>%
    t() %>%
    data.frame() %>%
    dplyr::rename(!!paste("COV_", i, sep = "") := "COV") %>%
    t() %>%
    data.frame()
}
final_tb_2 <- new_tb_2 %>% t() %>% data.frame()

cor(final_tb_2, method = "spearman")

colnames(final_tb_2) == colnames(final_tb)

cor(final_tb_2$COV_12950, final_tb$COV_12950, method = "spearman")
cor(final_tb_2$COV_12919, final_tb$COV_12919, method = "spearman")

# Spearman correlation between the EGFR+ and EGFR- CV profiles of each placenta
strings <- paste0("COV_", placentas)
cor_tb <- tibble()
for(i in strings){
  rho <- cor(final_tb_2[, i], final_tb[, i], method = "spearman")
  single_placenta <- data.frame(placenta = substr(i, 5, 10), cor_between_pos_neg = rho)
  cor_tb <- bind_rows(cor_tb, single_placenta)
}

# Check for NA values
summary(final_tb %>% is.na())

# `extreme_genes` is assumed to hold a vector of Ensembl IDs selected elsewhere (not defined in this document)
gene_symbols %>% filter(ENSEMBL %in% extreme_genes) %>% pull(gene) %>% cat()
```

Trophoblast metamarker COVs?
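The CV loops above assemble `final_tb` and `final_tb_2` through repeated transposes and renames; the same table can be built more directly. A minimal sketch, assuming `cpm_pos` and the per-sample placenta labels as defined above:

```{r}
# Sketch: per-placenta coefficient of variation for each gene, built directly as a
# genes x placentas matrix (should mirror final_tb up to row/column ordering).
cv_table <- sapply(placentas, function(p) {
  m <- cpm_pos[, pos$samples$placenta == p, drop = FALSE]
  100 * apply(m, 1, sd) / rowMeans(m)
})
colnames(cv_table) <- paste0("COV_", placentas)
cor(cv_table, method = "spearman", use = "pairwise.complete.obs")
```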
Co-expression networks for placentas

```{r}
# Load libraries
library(data.table)    # frank()
library(matrixStats)

placentas <- unique(pos$samples$placenta)

# new_tibble <- tibble()
for(i in placentas){
  transpose <- t(cpm_pos[, pos$samples$placenta == i])
  # Rank each gene (column) across the samples of this placenta, averaging ties
  transpose <- apply(transpose, 2, frank, ties.method = "average")
  transpose <- remove_constant_columns(transpose)
  cor_results <- fast_pearson_matrix_cor(transpose)

  # Sort the row and column names
  sorted_names <- sort(colnames(cor_results))
  # sorted_names <- head(sorted_names, 100)
  cor_results <- cor_results[sorted_names, sorted_names]

  # Set the upper triangle to NA so that each gene pair appears only once
  cor_results[upper.tri(cor_results)] <- NA

  single_cor <- cor_results %>%
    data.frame() %>%
    rownames_to_column(var = "gene_1") %>%   # Add row names as a column
    pivot_longer(cols = -gene_1, names_to = "gene_2", values_to = "correlation") %>%
    filter(!is.na(correlation)) %>%
    mutate(placenta = i)

  # new_tibble <- bind_rows(new_tibble, single_cor)
  write_csv(single_cor, paste0("placenta_", i, "_corrs_20231222.csv.gz"))
}

# corr_12764 <- read_csv("placenta_12764_corrs_20231221.csv")
# corr_12788 <- read_csv("placenta_12788_corrs_20231221.csv")
head(corr_12764)

# Combine the per-placenta correlation files into one wide table
tibble_all <- tibble()
files <- list.files(pattern = "\\.csv")
for(file in files){
  print(file)
  corr_single <- read_csv(file)
  corr_single <- corr_single %>%
    dplyr::rename(!!substr(file, 10, 20) := correlation) %>%
    dplyr::select(-placenta)
  # Put each gene pair into a canonical (alphabetical) order so the same pair matches across files
  corr_single <- corr_single %>%
    mutate(pair_min = pmin(gene_1, gene_2),
           pair_max = pmax(gene_1, gene_2)) %>%
    mutate(gene_1 = pair_min, gene_2 = pair_max) %>%
    dplyr::select(-pair_min, -pair_max)
  if(nrow(tibble_all) == 0){
    tibble_all <- corr_single
  } else {
    tibble_all <- full_join(tibble_all, corr_single)
  }
}

# How many pairs still appear in both orientations?
tibble_all %>%
  dplyr::select(gene_1, gene_2) %>%
  inner_join(tibble_all %>% dplyr::select(gene_1 = gene_2, gene_2 = gene_1)) %>%
  nrow()

rownames(corr_12764)

# Exploratory: rank-based correlation on the full EGFR+ count matrix
transpose <- t(pos$counts)
dim(transpose)
transpose <- apply(transpose, 2, frank, ties.method = "average")
dim(transpose)
transpose <- remove_constant_columns(transpose)
dim(transpose)
transpose[1:5, 1:5]
cor_results <- fast_pearson_matrix_cor(transpose)
cor_results[1:5, 1:5]
cor(t(pos$counts)[, 1], t(pos$counts)[, 2], method = "spearman")
```
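As a quick consistency check (a sketch, assuming the objects from the chunk above are still in memory): Spearman correlation is the Pearson correlation of average ranks, so the rank-then-Pearson result should reproduce base R's `cor(..., method = "spearman")` on the same genes.

```{r}
# Sketch of a consistency check between the fast rank-based correlations (cor_results)
# and base R's Spearman correlation on the same columns of t(pos$counts).
genes_to_check <- colnames(cor_results)[1:5]
base_spearman <- cor(t(pos$counts)[, genes_to_check], method = "spearman")
all.equal(unname(cor_results[genes_to_check, genes_to_check]), unname(base_spearman))
```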
`fast_pearson_matrix_cor()` mirrors base R's `cor()` applied to a matrix: it computes Pearson correlations column versus column. `remove_constant_columns()` drops constant columns beforehand, since their correlation is undefined.

```{r}
remove_constant_columns <- function(matrix_data) {
  # Identify constant (zero-variance) columns
  constant_cols <- apply(matrix_data, 2, function(col) all(col == col[1]))
  matrix_without_constants <- matrix_data[, !constant_cols, drop = FALSE]
  return(matrix_without_constants)
}

fast_pearson_matrix_cor <- function(matrixA) {
  # Center each column, then scale each column to unit sum of squares;
  # the cross-product of the standardized matrix is the Pearson correlation matrix
  matrixA <- sweep(matrixA, 2, colMeans(matrixA), FUN = "-")
  matrixA <- sweep(matrixA, 2, sqrt(colSums(matrixA^2)), FUN = "/")
  cr <- crossprod(matrixA, matrixA)
  dimnames(cr) <- list(colnames(matrixA), colnames(matrixA))
  cr
}
```

Refer to Jonathan's code for fast correlations.

```{r}
library(reticulate)
library(knitr)

np <- import("numpy")   # needed for np$array() below
```

```{python}
import numpy as np
import pandas as pd
import scipy.stats as sci
```

```{r}
# Turn the R expression matrix (genes x samples; assumed to be prepared elsewhere as `exp_data`)
# into a numpy array
test_py_exp <- np$array(exp_data)
```

```{python}
rank_test_py_exp = sci.rankdata(r.test_py_exp, method = 'average', axis = 1)   # Rank within each row (gene)
rank_test_py_exp = rank_test_py_exp - rank_test_py_exp.mean(axis = 1)[:, None]  # Center each gene
rank_test_py_exp = rank_test_py_exp / np.sqrt(np.square(rank_test_py_exp).sum(axis = 1))[:, None]  # Scale each gene to unit sum of squares
cr_python = np.dot(rank_test_py_exp, rank_test_py_exp.T)  # Gene-by-gene Spearman correlations
cr_python = pd.DataFrame(cr_python, columns = r.r_genes, index = r.r_genes)  # To pandas for converting back to R; `r_genes` holds the gene names
```

```{r}
# Convert back to R and save
corr_matrix <- py$cr_python
corr_matrix <- as.matrix(corr_matrix)
save(corr_matrix, file = file_path)   # `file_path` is assumed to be set elsewhere
```
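A possible cross-check (a sketch, under the assumption that `exp_data` is the same genes-by-samples matrix used for the rank-based R route above, e.g. `pos$counts`): both routes compute gene-by-gene Spearman correlations, so shared genes should agree up to floating-point error.

```{r}
# Hypothetical cross-check between the Python result (corr_matrix) and the R result
# (cor_results); only meaningful if both were computed on the same expression matrix.
shared_genes <- intersect(rownames(corr_matrix), rownames(cor_results))
max(abs(corr_matrix[shared_genes, shared_genes] - cor_results[shared_genes, shared_genes]))
```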