--- title: "Expression-Focused_Analysis_20231205" output: html_document --- ```{r Load libraries} library(DESeq2) library(apeglm) library(EnhancedVolcano) library(tidyverse) library(dplyr) library(org.Hs.eg.db) library(GenomicFeatures) ``` We first load the combined bulk RNA-seq generated so far. ```{r Data wrangling} accumul_dataset <- readRDS("accumulated_counts_2023.rds") # It is suggested to use a DGEList object to deal with bulk RNA-seq datasets. # But since I need to perform normalization first, I will use DESeq2. # Let's add the gene symbol at this point, and filter out those that don't have gene symbols from the counts matrix ensembl_ids <- rownames(accumul_dataset) # This function checks for the gene symbols that match the particular ENSEMBL ID. gene_symbols <- mapIds(org.Hs.eg.db, keys = ensembl_ids, column = "SYMBOL", keytype = "ENSEMBL") # how many gene symbols? length(gene_symbols) gene_symbols %<>% as_tibble(rownames = "ENSEMBL") %>% dplyr::rename(gene = value) gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter out the genes where it doesn't have a gene symbol gene_symbols %<>% dplyr::filter(!is.na(gene)) # Save the genes that are NAs (invalid since we already filtered them out) # gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene)) #check for duplicate ENSEMBL IDs gene_symbols %>% group_by(ENSEMBL) %>% dplyr::count() %>% filter(n != 1) #check for duplicate gene symbols gene_symbols %>% group_by(gene) %>% dplyr::count() %>% filter(n != 1) # Filter the counts matrix (these have a registered gene symbol) accumul_dataset <- accumul_dataset[gene_symbols$ENSEMBL,] # Check the dimensions of the filtered dataset dim(accumul_dataset) # Check order for gene symbols summary(gene_symbols$ENSEMBL == rownames(accumul_dataset)) # We'll select out the first gene ENSEMBL ID among the duplicates. accumul_dataset <- accumul_dataset[!duplicated(gene_symbols$gene),] # Let's also filter out genes that have zero expression throughout all samples. accumul_dataset <- accumul_dataset[(rowSums(accumul_dataset) > 0),] # We check the dimensions of this dataset. dim(accumul_dataset) ``` ```{r} # Load the human gtf file, containing information on each transcript gtf_file <- "/home/youkim/Genome_indexing/Homo_sapiens.GRCh38.109.gtf" txdb <- makeTxDbFromGFF(gtf_file, format = "gtf") # Select out the Y chromosome chromosome_name <- "Y" #Extract the Y chromosome Ensembl IDs genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y")) ensembl_gene_ids <- genes_on_Y@ranges@NAMES #Check for all Y chromosome genes Y_chr_expression <- accumul_dataset[(rownames(accumul_dataset) %in% ensembl_gene_ids),] %>% colSums() %>% data.frame(Y_exp = .) 
# We also set up the colData for DESeq2
coldata <- data.frame(sample = colnames(accumul_dataset), Y_chr_expression)

# Check that the rows are in the same order
rownames(coldata) == rownames(Y_chr_expression)

# Add biological sex to the metadata, inferred from total Y-chromosome expression
coldata <- coldata %>% mutate(sex = ifelse(Y_exp > 500, "male", "female"))

coldata <- coldata %>%
  mutate(sample = colnames(accumul_dataset)) %>%
  mutate(lobule = substr(sample, start = 2, stop = 9)) %>%
  mutate(labels = substr(sample, 10, 12)) %>%
  dplyr::select(-sample)

coldata[coldata$labels == '', ]$labels <- 'EGFR_negative'
coldata[coldata$labels == 'pos', ]$labels <- 'EGFR_positive'
coldata[coldata$labels == 'ns', ]$labels <- "Non-separated"
coldata[coldata$labels == 'L', ]$labels <- "After_digestion"
```

I tried running DESeq2, but the grouping variable (lobule) coincides with the groups we want to compare, so the model cannot be fit. We will use edgeR instead.

```{r eval = F}
# `dds` refers to the DESeqDataSet from the earlier DESeq2 attempt (not re-created here)

# Keep only the EGFR+ and EGFR- samples
dds_filtered <- dds[, dds$labels != "Non-separated"]
dds_filtered <- dds_filtered[, dds_filtered$labels != "After_digestion"]

# Verify that the colnames of the counts equal the rownames of the colData
summary(rownames(dds_filtered@colData) == colnames(dds_filtered@assays@data$counts))

# Extract only the EGFR+ samples
pos <- dds_filtered[, dds_filtered$labels == "EGFR_positive"]

# We also specify the minimum number of samples to be 36, which is the number of M/F samples
# smallestGroupSize <- 3
# (rowSums(counts(dds) >= 10) >= smallestGroupSize) %>% summary()

# To reduce additional sources of variation, and since we want to know which sets of genes
# are expressed in the different lobules, we generate two versions of the normalized dds object
dds_filtered_total <- DESeq(dds_filtered)
dds_filtered_pos <- DESeq(pos)
```

```{r}
x <- DGEList(counts = accumul_dataset,
             samples = coldata,
             genes = rownames(accumul_dataset))

x_normalized <- calcNormFactors(x, method = "TMM")

x_normalized_filtered <- x_normalized[, x_normalized$samples$labels != "Non-separated"]
x_normalized_filtered <- x_normalized_filtered[, x_normalized_filtered$samples$labels != "After_digestion"]

# Check the dimensions of the filtered count matrix
dim(x_normalized_filtered)

x_normalized_filtered$samples <- x_normalized_filtered$samples %>%
  mutate(placenta = substr(lobule, 1, 5))

# Split out the EGFR+ and EGFR- portions
pos <- x_normalized_filtered[, x_normalized_filtered$samples$labels == "EGFR_positive"]
neg <- x_normalized_filtered[, x_normalized_filtered$samples$labels == "EGFR_negative"]

# Note that edgeR does not actually replace the count matrix with "normalized" counts;
# the TMM factors are only applied when computing CPM, so we work with (log) counts per million.
cpm_pos <- edgeR::cpm(pos, log = TRUE, normalized.lib.sizes = TRUE)
cpm_neg <- edgeR::cpm(neg, log = TRUE, normalized.lib.sizes = TRUE)

# We check the dimensions again and remove genes with zero counts (sketched just below).
# Our main interest is the EGFR+ fraction.
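# A minimal sketch of the filtering described above (an assumption about the intended
# rule: drop genes with zero counts across all samples of the respective fraction;
# edgeR::filterByExpr() would be a stricter alternative).
keep_pos <- rowSums(pos$counts) > 0
keep_neg <- rowSums(neg$counts) > 0
cpm_pos <- cpm_pos[keep_pos, , drop = FALSE]
cpm_neg <- cpm_neg[keep_neg, , drop = FALSE]
dim(cpm_pos)
dim(cpm_neg)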
placentas <- unique(pos$samples$placenta)
placentas_neg <- unique(neg$samples$placenta)

new_tb <- data.frame()
for(i in placentas){
  mean_expression <- rowMeans(cpm_pos[, pos$samples$placenta == i])
  std_dev <- apply(cpm_pos[, pos$samples$placenta == i], 1, sd)
  # Coefficient of variation (in %) for each gene within this placenta
  cv <- (std_dev / mean_expression) * 100
  new_row <- data.frame(COV = cv) %>% t() %>% data.frame()
  new_tb <- bind_rows(new_tb, new_row)
  new_tb <- new_tb %>%
    t() %>%
    data.frame() %>%
    dplyr::rename(!!paste("COV_", i, sep = "") := "COV") %>%
    t() %>%
    data.frame()
}
final_tb <- new_tb %>% t() %>% data.frame()

cor(final_tb, method = "spearman")

new_tb_2 <- data.frame()
for(i in placentas_neg){
  mean_expression <- rowMeans(cpm_neg[, neg$samples$placenta == i])
  std_dev <- apply(cpm_neg[, neg$samples$placenta == i], 1, sd)
  # Coefficient of variation (in %) for each gene within this placenta
  cv <- (std_dev / mean_expression) * 100
  new_row <- data.frame(COV = cv) %>% t() %>% data.frame()
  new_tb_2 <- bind_rows(new_tb_2, new_row)
  new_tb_2 <- new_tb_2 %>%
    t() %>%
    data.frame() %>%
    dplyr::rename(!!paste("COV_", i, sep = "") := "COV") %>%
    t() %>%
    data.frame()
}
final_tb_2 <- new_tb_2 %>% t() %>% data.frame()

cor(final_tb_2, method = "spearman")

colnames(final_tb_2) == colnames(final_tb)

cor(final_tb_2$COV_12950, final_tb$COV_12950, method = "spearman")
cor(final_tb_2$COV_12919, final_tb$COV_12919, method = "spearman")

# Spearman correlation between the EGFR+ and EGFR- CV profiles of each placenta
strings <- paste0("COV_", placentas)
cor_tb <- tibble()
for(i in strings){
  rho <- cor(final_tb_2[, i], final_tb[, i], method = "spearman")
  single_placenta <- data.frame(placenta = substr(i, 5, 10), cor_between_pos_neg = rho)
  cor_tb <- bind_rows(cor_tb, single_placenta)
}

# Check for NA values
summary(final_tb %>% is.na())

# `extreme_genes` is assumed to hold a vector of Ensembl IDs selected elsewhere (not defined in this document)
gene_symbols %>% filter(ENSEMBL %in% extreme_genes) %>% pull(gene) %>% cat()
```

Trophoblast metamarker COVs?
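The CV loops above assemble `final_tb` and `final_tb_2` through repeated transposes and renames; the same table can be built more directly. A minimal sketch, assuming `cpm_pos` and the per-sample placenta labels as defined above:

```{r}
# Sketch: per-placenta coefficient of variation for each gene, built directly as a
# genes x placentas matrix (should mirror final_tb up to row/column ordering).
cv_table <- sapply(placentas, function(p) {
  m <- cpm_pos[, pos$samples$placenta == p, drop = FALSE]
  100 * apply(m, 1, sd) / rowMeans(m)
})
colnames(cv_table) <- paste0("COV_", placentas)
cor(cv_table, method = "spearman", use = "pairwise.complete.obs")
```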
Co-expression networks for placentas

```{r}
# Load libraries
library(data.table)    # frank()
library(matrixStats)

placentas <- unique(pos$samples$placenta)

# new_tibble <- tibble()
for(i in placentas){
  transpose <- t(cpm_pos[, pos$samples$placenta == i])
  # Rank each gene (column) across the samples of this placenta, averaging ties
  transpose <- apply(transpose, 2, frank, ties.method = "average")
  transpose <- remove_constant_columns(transpose)
  cor_results <- fast_pearson_matrix_cor(transpose)

  # Sort the row and column names
  sorted_names <- sort(colnames(cor_results))
  # sorted_names <- head(sorted_names, 100)
  cor_results <- cor_results[sorted_names, sorted_names]

  # Set the upper triangle to NA so that each gene pair appears only once
  cor_results[upper.tri(cor_results)] <- NA

  single_cor <- cor_results %>%
    data.frame() %>%
    rownames_to_column(var = "gene_1") %>%   # Add row names as a column
    pivot_longer(cols = -gene_1, names_to = "gene_2", values_to = "correlation") %>%
    filter(!is.na(correlation)) %>%
    mutate(placenta = i)

  # new_tibble <- bind_rows(new_tibble, single_cor)
  write_csv(single_cor, paste0("placenta_", i, "_corrs_20231222.csv.gz"))
}

# corr_12764 <- read_csv("placenta_12764_corrs_20231221.csv")
# corr_12788 <- read_csv("placenta_12788_corrs_20231221.csv")
head(corr_12764)

# Combine the per-placenta correlation files into one wide table
tibble_all <- tibble()
files <- list.files(pattern = "\\.csv")
for(file in files){
  print(file)
  corr_single <- read_csv(file)
  corr_single <- corr_single %>%
    dplyr::rename(!!substr(file, 10, 20) := correlation) %>%
    dplyr::select(-placenta)
  # Put each gene pair into a canonical (alphabetical) order so the same pair matches across files
  corr_single <- corr_single %>%
    mutate(pair_min = pmin(gene_1, gene_2),
           pair_max = pmax(gene_1, gene_2)) %>%
    mutate(gene_1 = pair_min, gene_2 = pair_max) %>%
    dplyr::select(-pair_min, -pair_max)
  if(nrow(tibble_all) == 0){
    tibble_all <- corr_single
  } else {
    tibble_all <- full_join(tibble_all, corr_single)
  }
}

# How many pairs still appear in both orientations?
tibble_all %>%
  dplyr::select(gene_1, gene_2) %>%
  inner_join(tibble_all %>% dplyr::select(gene_1 = gene_2, gene_2 = gene_1)) %>%
  nrow()

rownames(corr_12764)

# Exploratory: rank-based correlation on the full EGFR+ count matrix
transpose <- t(pos$counts)
dim(transpose)
transpose <- apply(transpose, 2, frank, ties.method = "average")
dim(transpose)
transpose <- remove_constant_columns(transpose)
dim(transpose)
transpose[1:5, 1:5]
cor_results <- fast_pearson_matrix_cor(transpose)
cor_results[1:5, 1:5]
cor(t(pos$counts)[, 1], t(pos$counts)[, 2], method = "spearman")
```
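As a quick consistency check (a sketch, assuming the objects from the chunk above are still in memory): Spearman correlation is the Pearson correlation of average ranks, so the rank-then-Pearson result should reproduce base R's `cor(..., method = "spearman")` on the same genes.

```{r}
# Sketch of a consistency check between the fast rank-based correlations (cor_results)
# and base R's Spearman correlation on the same columns of t(pos$counts).
genes_to_check <- colnames(cor_results)[1:5]
base_spearman <- cor(t(pos$counts)[, genes_to_check], method = "spearman")
all.equal(unname(cor_results[genes_to_check, genes_to_check]), unname(base_spearman))
```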
`fast_pearson_matrix_cor()` mirrors base R's `cor()` applied to a matrix: it computes Pearson correlations column versus column. `remove_constant_columns()` drops constant columns beforehand, since their correlation is undefined.

```{r}
remove_constant_columns <- function(matrix_data) {
  # Identify constant (zero-variance) columns
  constant_cols <- apply(matrix_data, 2, function(col) all(col == col[1]))
  matrix_without_constants <- matrix_data[, !constant_cols, drop = FALSE]
  return(matrix_without_constants)
}

fast_pearson_matrix_cor <- function(matrixA) {
  # Center each column, then scale each column to unit sum of squares;
  # the cross-product of the standardized matrix is the Pearson correlation matrix
  matrixA <- sweep(matrixA, 2, colMeans(matrixA), FUN = "-")
  matrixA <- sweep(matrixA, 2, sqrt(colSums(matrixA^2)), FUN = "/")
  cr <- crossprod(matrixA, matrixA)
  dimnames(cr) <- list(colnames(matrixA), colnames(matrixA))
  cr
}
```

Refer to Jonathan's code for fast correlations.

```{r}
library(reticulate)
library(knitr)

np <- import("numpy")   # needed for np$array() below
```

```{python}
import numpy as np
import pandas as pd
import scipy.stats as sci
```

```{r}
# Turn the R expression matrix (genes x samples; assumed to be prepared elsewhere as `exp_data`)
# into a numpy array
test_py_exp <- np$array(exp_data)
```

```{python}
rank_test_py_exp = sci.rankdata(r.test_py_exp, method = 'average', axis = 1)   # Rank within each row (gene)
rank_test_py_exp = rank_test_py_exp - rank_test_py_exp.mean(axis = 1)[:, None]  # Center each gene
rank_test_py_exp = rank_test_py_exp / np.sqrt(np.square(rank_test_py_exp).sum(axis = 1))[:, None]  # Scale each gene to unit sum of squares
cr_python = np.dot(rank_test_py_exp, rank_test_py_exp.T)  # Gene-by-gene Spearman correlations
cr_python = pd.DataFrame(cr_python, columns = r.r_genes, index = r.r_genes)  # To pandas for converting back to R; `r_genes` holds the gene names
```

```{r}
# Convert back to R and save
corr_matrix <- py$cr_python
corr_matrix <- as.matrix(corr_matrix)
save(corr_matrix, file = file_path)   # `file_path` is assumed to be set elsewhere
```
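A possible cross-check (a sketch, under the assumption that `exp_data` is the same genes-by-samples matrix used for the rank-based R route above, e.g. `pos$counts`): both routes compute gene-by-gene Spearman correlations, so shared genes should agree up to floating-point error.

```{r}
# Hypothetical cross-check between the Python result (corr_matrix) and the R result
# (cor_results); only meaningful if both were computed on the same expression matrix.
shared_genes <- intersect(rownames(corr_matrix), rownames(cor_results))
max(abs(corr_matrix[shared_genes, shared_genes] - cor_results[shared_genes, shared_genes]))
```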