---
title: "Bulk_RNAseq_Analysis_September_Samples(2023-10-23)"
output: html_document
---
First, we load our libraries
```{r}
# install libraries
#install.packages("ggfortify")
#install.packages("dequer")
#BiocManager::install("genefilter")
# install.packages("gsubfn")
# install.packages("MetaCycle")

# Load libraries
library(dplyr)
library(magrittr)
library(readr)
library(edgeR)
library(ggplot2)
library(tidyverse)
library(ggfortify)
library(genefilter)
library(cowplot)
library(DT)
library(limma)
library(pheatmap)
library(dequer)
library(gsubfn)
# BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
#BiocManager::install("GenomicFeatures")
#BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
library(GenomicFeatures)
library(BSgenome.Hsapiens.UCSC.hg38)
library(MetaCycle)
```

Let's first load our count matrix generated from STAR alignment.
```{r}
counts <- readRDS("count_matrix_20230925.rds")

# Create a copy of the count matrix
# Check dimensions of the matrix and use head()
dim(counts)
colnames(counts)

# Taking a glance of the matrix
counts[1:10, 1:6]
```

As a first step, we should remove the first four rows and make the gene column as the row names.
```{r}
counts <- data.frame(counts)
rownames(counts) <- counts$gene
counts %<>% dplyr::select(-gene)
counts <- counts[5:nrow(counts),]

# We will check if there are any duplicated ENSEMBL IDs
dups1 <- duplicated(rownames(counts))

# We don't see any duplicated ENSEMBL IDs!
summary(dups1)

# Remove dups1 object from the environment
rm(dups1)

# Check for the total number of reads for each sample
columnsums <- counts %>% colSums()

# This value calculates for the mean library size across all samples
mean(columnsums)
```

Before we move on, we want to know the average alignment rate across the 72 samples.
Instead of recording all 72 values manually, let's try writing a piece of code.

```{r}
# As a first step, we need to get a list of directories that contain the outputLog.final file.
file_list <- list.files(path = "/data/circadian_rhythms/STAR", full.names = TRUE)

# Next, use a loop to read in each file, extract the "alignment rate" value, and store it in a table:
table <- tibble()
# initialize an empty vector to store the rates
for (i in seq_along(file_list)) {
  # set the working directory
  setwd(file_list[i])
  # Read in the file
  inside_STAR <- list.files(path = file_list[i], full.names = TRUE)
  file_content <- readLines(inside_STAR[grepl(pattern = "Log.final.out", inside_STAR)])
  # Extract the alignment rate value
  rate_string <- grep("Uniquely mapped reads %", file_content, value = TRUE)  # find the line containing the rate
  rate_value <- as.numeric(gsub("[^0-9.]+", "", rate_string))  # extract the numeric value from the line
  
  # Store the rate value in the vector
  table %<>% bind_rows(tibble(file_list[i], rate_value))
}

mean_rate <- mean(table$rate_value)

# This value is the overall alignment rate
mean_rate
```

Overall, we have an average alignment rate (Uniquely mapped reads) of 91.25%.

### Data Exploration - to confirm males vs females

Here's something new that we'll try.
We will get the information of all the Y chromosome genes and get an average expression level. 

```{r fig.width = 13, fig.height = 9}
# We can normalize the data later and calculate the normalization factors using the function calcNormFactors().
# Sample information for the column names
colnames(counts)

# ENSEMBL gene IDs for the rownames
rownames(counts) %>% head()

maleVSfemale <- DGEList(counts = counts, 
             samples = colnames(counts),
             genes = rownames(counts))
```


```{r}
# Load the mouse gtf file, containing information on each transcript
gtf_file <- "/home/youkim/Genome_indexing/Mus_musculus.GRCm39.110.gtf"
txdb <- makeTxDbFromGFF(gtf_file, format = "gtf")

# Select out the Y chromosome
chromosome_name <- "Y"

#Extract the Y chromosome Ensembl IDs
genes_on_Y <- genes(txdb, filter = list(tx_chrom = "Y"))
ensembl_gene_ids <- genes_on_Y@ranges@NAMES

#Check for all Y chromosome genes
Y_chr_expression <- maleVSfemale$counts[ensembl_gene_ids,] %>% 
  colSums() %>% 
  data.frame(Y_exp = .) 

#check if the order of the rows are identical
rownames(maleVSfemale$samples) == rownames(Y_chr_expression)

#add the data on to the metadata
maleVSfemale$samples <- maleVSfemale$samples %>% 
  mutate(Y_chr_expression)
# Add biological sex information in metadata
maleVSfemale$samples <- maleVSfemale$samples %>% 
  mutate(sex = ifelse(maleVSfemale$samples$Y_exp > 1000, "male", "female"))

pdf(file="Y_chr_analysis.pdf", width = 10, height = 7)
maleVSfemale$samples %>% 
  ggplot(aes(x = samples, y = Y_exp)) +
  geom_col() + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

maleVSfemale$samples %>% 
  ggplot(aes(x= sex, y= Y_exp)) +
  geom_boxplot() +
  theme_bw()
dev.off()

# Let's save the x_normalized object as an RDS object first, and we could load it later.
# saveRDS(x_normalized, "Normalized_all_genes.rds")
```

Based on the bar plot, we notice that odd samples have high expression of Y chromosome genes.
In other words, it would be safe to say that odd samples are male and even samples are female.


At this point, we want to remove all the genes that do not have any reads (zero value for all samples).
To do so, we wanted to perform some more data wrangling.

```{r}
# Let's also add columns addressing their circadian status (time of the day)

# Load the .csv file including the timepoint data
tpdata <- read_csv("Sample_Info(Sept.2023).csv", col_names = F)

# Modify the second column
extracted_part <- str_extract(tpdata$X2, "(?<=_).*?(?=_)") %>% 
  rev()

# Attach the timepoint data to our bulk RNA-seq metadata.
maleVSfemale$samples <- maleVSfemale$samples %>% 
  mutate(time_point = extracted_part)

# We want to modify our colnames of the count matrix, so that it can be compatible for NiteCap
sample_numbers <- str_extract(maleVSfemale$samples$samples, "(?<=_)[^_]+(?=_)")
maleVSfemale$samples <- maleVSfemale$samples %>%
  mutate(sample_numbers = sample_numbers)

new_colnames <- paste0(maleVSfemale$samples$sex, 
                       "_", 
                       maleVSfemale$samples$time_point,
                       "_",
                       maleVSfemale$samples$sample_numbers)

colnames(maleVSfemale$counts) <- new_colnames
rownames(maleVSfemale$samples) <- new_colnames

# Check new column names
colnames(maleVSfemale$counts)

# Let's create new .csv containing the count matrix in a reverse order for future reference!
copy_count <- maleVSfemale$counts[, ncol(maleVSfemale$counts):1] 
# write.csv(copy_count, "Labelled_counts.csv")

male_samples <- maleVSfemale$samples[maleVSfemale$samples$sex == "male",]
female_samples <- maleVSfemale$samples[maleVSfemale$samples$sex == "female",]

male_dge <- maleVSfemale[, (colnames(maleVSfemale) %in% rownames(male_samples))]
female_dge <- maleVSfemale[, (colnames(maleVSfemale) %in% rownames(female_samples))]

male_count <- male_dge$counts[, ncol(male_dge$counts):1]
# write.csv(male_count, "male_counts.csv")
female_count <- female_dge$counts[, ncol(female_dge$counts):1]
# write.csv(female_count, "female_counts.csv")
```

Before running NiteCap, I thought it would be much easier if we add the gene IDs.

```{r}
ensembl_ids <- rownames(maleVSfemale$counts) 

# This function checks for the gene symbols that match the particular ENSEMBL ID.
gene_symbols <- mapIds(org.Mm.eg.db, 
                       keys = ensembl_ids, 
                       column = "SYMBOL",
                       keytype = "ENSEMBL")

head(gene_symbols)
gene_symbols %<>% 
  as_tibble(rownames = "ENSEMBL") %>% 
  dplyr::rename(gene = value)

gene_symbols %>% 
  group_by(gene) %>% 
  count() %>% 
  filter(n != 1)

gene_symbols %<>% dplyr::filter(!is.na(gene))

gene_symbols_NA <- gene_symbols %>% dplyr::filter(is.na(gene))

#check for 1-many or many-1
gene_symbols %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)
gene_symbols %>% group_by(gene) %>% count() %>% filter(n != 1)

#check for 1-many or many-1 for gene_symbols_NA
gene_symbols_NA %>% group_by(ENSEMBL) %>% count() %>% filter(n != 1)

# Now we need to combine the two data frames
male_count <- male_count %>% 
  data.frame() %>% 
  rownames_to_column() %>% 
  dplyr::rename(ENSEMBL = rowname) %>% 
  inner_join(gene_symbols)

female_count <- female_count %>% 
  data.frame() %>% 
  rownames_to_column() %>% 
  rename(ENSEMBL = rowname) %>% 
  inner_join(gene_symbols)

# Note that we are focusing on annotated genes! (gene_ids with NAs were removed)

# write.csv(male_count, "male_counts_20231009.csv")
# write.csv(female_count, "female_counts_20231009.csv")

male_count <- read.csv("male_counts_20231009.csv")
tibble_colnames <- tibble(name = colnames(male_count)) %>% 
  filter(grepl("male_", name)) %>% 
  mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% 
  arrange(timepoint)
male_count_modified <- male_count[,c("gene", tibble_colnames$name)]
# write.csv(male_count_modified, "male_counts_20231010.csv")


female_count <- read.csv("female_counts_20231009.csv")
tibble_colnames <- tibble(name = colnames(female_count)) %>% 
  filter(grepl("female_", name)) %>% 
  mutate(timepoint = as.numeric(gsub(".*_ZT(.*)_.*", "\\1", name))) %>% 
  arrange(timepoint)
female_count_modified <- female_count[,c("gene", tibble_colnames$name)]
# write.csv(female_count_modified, "female_counts_20231010.csv")

# These lines of code account for genes where their ENSEMBL IDs have NA gene ID
male_count_NA <- male_count %>% 
  data.frame() %>% 
  rownames_to_column() %>% 
  rename(ENSEMBL = rowname) %>% 
  inner_join(gene_symbols_NA)

female_count_NA <- female_count %>% 
  data.frame() %>% 
  rownames_to_column() %>% 
  rename(ENSEMBL = rowname) %>% 
  inner_join(gene_symbols_NA)

# write.csv(male_count_NA, "male_counts_NA_20231010.csv")
# write.csv(female_count_NA, "female_counts_NA_20231010.csv")
```

I think we can create a new code chunk at this point, to form data frames Alex requests

```{r}
# Added Oct.13th
male_count_modified <- read.csv("male_counts_20231010.csv")
female_count_modified <- read.csv("female_counts_20231010.csv")

# Now, we want to remove the genes that had zero counts for all samples.
male_gene_counts <- rowSums(male_count_modified[,3:ncol(male_count_modified)])
female_gene_counts <- rowSums(female_count_modified[,3:ncol(female_count_modified)])

# Check the minimum gene count for males and females
min(male_gene_counts)
min(female_gene_counts)

nonzero_gene_counts_M <- which(male_gene_counts != 0)
nonzero_gene_counts_F <- which(female_gene_counts != 0)

filtered_male_count <- male_count_modified[nonzero_gene_counts_M,] %>% 
  dplyr::select(-X)
filtered_female_count <- female_count_modified[nonzero_gene_counts_F,] %>% 
  dplyr::select(-X)

# Check whether the minimum gene count has changed
filtered_gene_counts_M <- rowSums(filtered_male_count[,3:ncol(filtered_male_count)])
filtered_gene_counts_F <- rowSums(filtered_female_count[,3:ncol(filtered_female_count)])
min(filtered_gene_counts_M)
min(filtered_gene_counts_F)


# write.csv(filtered_male_count, "filtered_male_counts_20231013.csv", row.names = FALSE)
# write.csv(filtered_female_count, "filtered_female_counts_20231013.csv", row.names = FALSE)

# Let's try using the MetaCycle package at this point!
MetaCycle::meta2d(infile = "filtered_male_counts_20231013.csv", 
                  filestyle = "csv",
                  timepoints = rep(seq(3, 23, by=4), each=6))
MetaCycle::meta2d(infile = "filtered_female_counts_20231013.csv", 
                  filestyle = "csv",
                  timepoints = rep(seq(3, 23, by=4), each=6))
male_metaCycle <- read.csv("~/Circadian_Rhythms/metaout/meta2d_filtered_male_counts_20231013.csv")
female_metaCycle <- read.csv("~/Circadian_Rhythms/metaout/meta2d_filtered_female_counts_20231013.csv")

male_cycling <- male_metaCycle %>% 
  filter(meta2d_pvalue < 0.05) %>% pull(CycID)
female_cycling <- female_metaCycle %>% 
  filter(meta2d_pvalue < 0.05) %>% pull(CycID)

length(male_cycling)
length(female_cycling)

# Find the common genes that cycle between males and females
common_cycling <- intersect(male_cycling, female_cycling)

"Per2" %in% common_cycling
"Bmal1" %in% common_cycling
"Clock" %in% common_cycling
"Npas2" %in% common_cycling
"Cry1" %in% common_cycling

 
male_phase <- male_metaCycle %>% 
  filter(CycID %in% common_cycling) %>% 
  pull(meta2d_phase)

female_phase <- female_metaCycle %>% 
  filter(CycID %in% common_cycling) %>% 
  pull(meta2d_phase)

test1 <- female_metaCycle %>% 
  filter(CycID %in% common_cycling)
test1[(abs(male_phase - female_phase) > 6) & (abs(male_phase - female_phase) < 18),] %>% 
  pull(CycID) %>% cat()

test2 <- male_metaCycle %>% 
  filter(CycID %in% common_cycling)
test2[(abs(male_phase - female_phase) > 6) & (abs(male_phase - female_phase) < 18),] %>% 
  pull(CycID) %>% cat()
```

### Normalization
As a first step, let's perform DE based on biological sex.
Make before and after graphs of the raw data and filter the data based on the normalizes values.

```{r hist_cpm_dirty}
# I think we can revisit the maleVSfemale DGEList object at this point.
# We notice that this DGEList object's column names are what we wanted.
colnames(maleVSfemale)

# Check before filtering
dim(maleVSfemale$counts)

# Let's filter out the genes that have zeroes again:
gene_counts <- rowSums(maleVSfemale$counts)
min(gene_counts)
nonzero_gene_counts <- which(gene_counts != 0)
maleVSfemale$counts <- maleVSfemale$counts[nonzero_gene_counts,]

# Check after filtering
dim(maleVSfemale$counts)

# First, we normalize using counts per million (cpm)
cpm <- edgeR::cpm(maleVSfemale$counts)

# We previously defined the object cpm, but for clarity, we define again:
cpm_dirty1 <- cpm

# We check with a histogram
hist(cpm_dirty1, n = 100)
```

Since the number of genes with very low expression (almost zero) vastly outweighs the number of genes being expressed, taking the log value is helpful.

```{r hist_lcpm_dirty}
# make a lcpm object with the lcpm for the unfiltered data
lcpm_dirty1 <- edgeR::cpm(counts, log = TRUE, prior.count = 2) 
# the argument prior.count = 2 is to prevent the log value being zero.

hist(lcpm_dirty1, n = 100)
```

```{r fig.width = 15, fig.height = 10}
# Now we compare raw and filtered data.
# make the dirty lcpm plot for unfiltered data
p1 <- lcpm_dirty1 %>% 
  data.frame() %>% 
  pivot_longer(cols = everything(), 
               names_to = "samples", 
               values_to = "exprs") %>% 
  ggplot(aes(x = exprs, colour = samples)) +
  geom_density() +
  labs(title = "Unfiltered lcpm") + 
  theme(legend.position = "bottom",
        legend.key.size = unit(0.5, 'cm'))

plot(p1)

```

Let's modify the cpm threshold at this point.

```{r}
# Let's check how many genes have at least 1 cpm in at least 1 of the 72 samples.
summary(rowSums(cpm >= 1) >= 1)

keep_exprs <- (rowSums(cpm >= 1) >= 1)

# Now filter based on keep_exprs. Note that the argument keep.lib.sizes = FALSE allows access to the sample data.
backup <- maleVSfemale$counts

maleVSfemale$counts <- maleVSfemale$counts[keep_exprs,]

# double check how many rows exist in x_filtered
dim(maleVSfemale$counts)
```

In order to make comparisons between the samples possible, we use edgeR.
As we compare plots before and after normalization, we now observe that the median values of the samples are now nearly identical, supporting that our normalization was successful.

```{r}
maleVSfemale_normalized <- calcNormFactors(maleVSfemale, method = "TMM")

# We verify whether the normalization occurred correctly, by comparing the boxplots of the data before and after normalization.

# DATA not normalized (use normalized.lib.sizes = F)
p3 <- edgeR::cpm(maleVSfemale_normalized, log = TRUE, normalized.lib.sizes = F) %>% 
  data.frame() %>% 
  pivot_longer(cols = everything(), 
               names_to = "samples",
               values_to = "exprs") %>% 
  ggplot(aes(x = samples, y = exprs, fill = samples)) +
  geom_boxplot() +
  labs(title = "Non-normalized") + 
  # modify the legend size
  theme(legend.position = "bottom",
        legend.key.size = unit(0.5, 'cm'),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# DATA normalized (use normalized.lib.sizes = T)
p4 <- edgeR::cpm(maleVSfemale_normalized, log = T, normalized.lib.sizes = T) %>% 
  data.frame() %>% 
  pivot_longer(cols = everything(), 
               names_to = "samples",
               values_to = "exprs") %>% 
  ggplot(aes(x = samples, y = exprs, fill = samples)) +
  geom_boxplot() +
  labs(title = "Normalized") +
  # modify the legend size
  theme(legend.position = "bottom",
        legend.key.size = unit(0.5, 'cm'),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

# Make a side by side plot for comparison
pdf(file = "20231023_Normalization_Comparison.pdf")
plot_grid(p3, p4)
dev.off()
```

Let's start with making the Volcano plot that compares male and female samples

```{r}
# Let's add the gene IDs like we did previously.
# maleVSfemale$counts %>% 
#   data.frame() %>% 
#   rownames_to_column() %>% 
#   dplyr::rename(ENSEMBL = rowname) %>% 
#   inner_join(gene_symbols)

designSEQ <- model.matrix(~ 0 + 
                            maleVSfemale_normalized$samples$sex)

# Change the column names of the design matrix
colnames(designSEQ) <- c("female","male")

# check the design matrix
head(designSEQ)
dim(designSEQ)


y <- DGEList(counts = maleVSfemale$counts, group = maleVSfemale$samples$sex)
y <- calcNormFactors(y)

y <- estimateGLMCommonDisp(y, designSEQ)
y <- estimateGLMTrendedDisp(y, designSEQ)
y <- estimateGLMTagwiseDisp(y, designSEQ)


et <- exactTest(y)
# get the top DE genes
dgeTest <- topTags(et,n=nrow(et$table))

sum(dgeTest$table$FDR < 0.05)

volcanoData <- cbind(dgeTest$table$logFC, -log10(dgeTest$table$FDR))
colnames(volcanoData) <- c("logFC", "negLogPval")
head(volcanoData)

plot(volcanoData, pch=19)
head(dgeTest$table, 20)
```


```{r}
# What is et?
class(et)

# Looks complicated
# What is the structure of the object?
str(et)

# there's a clue - it contains a table
# try accessing it:
head(et$table)

### How many genes look significant?
sum(et$table$PValue < 0.05)

### Histogram of the p-values
hist(et$table$PValue)

### How many genes show 2-fold enrichment?
sum(et$table$PValue < 0.05 & et$table$logFC > 1)

### Let set an index vector to identify a sub-population
# use boolean logic to highlight genes with multiple criteria.
sig.up <- et$table$PValue < 0.05 & et$table$logFC > 1
sig.dn <- et$table$PValue < 0.05 & et$table$logFC < -1

### What does the distribution of log ratios look like?
hist(et$table$logFC)

# Where is our sub-population in this plot?
hist(et$table$logFC[sig.up], add=TRUE, col="red")

### What is the range of expression ratios?
range(et$table$logFC)

### What is the range of expression values?
range(et$table$logCPM)

plot(et$table$logCPM, et$table$logFC, main="MA plot tol10b/gd7", xlab="A", ylab="M")
points(et$table$logCPM[iv], et$table$logFC[iv], col="orange")

et$table$padj <- p.adjust(et$table$PValue, method="BH")
sum(et$table$padj < 0.01)

# create index vector of DE
iv.padj <- et$table$padj < 0.01

# highlight DE on the plot
plot(et$table$logCPM, et$table$logFC, main="MA plot tol10b/gd7", xlab="A", ylab="M")
points(et$table$logCPM[iv.padj], et$table$logFC[iv.padj], col="blue")

# draw some lines on the plot
abline(h=0, col="grey")

# draw some 2-fold change lines
abline(h=c(-1,1), lty="dashed", col="grey")
tol10b.enriched <- et$table$logFC > 0 & et$table$padj < 0.01

# get all row numbers that meet a given criteria
iv <- which(et$table$PValue < 0.05)

# generate a boolean vector for a given criteria
iv <- et$table$PValue < 0.05


plot(et$table$logFC, -10*log10(et$table$PValue), main="Volcano plot", xlab="M", ylab="-10*log(P-val)")

# highlight our DE genes
points(et$table$logFC[tol10b.enriched], -10*log10(et$table$PValue[tol10b.enriched]), col="red")

# identify genes enriched in the control
tol10b.depleted <- et$table$logFC < 0 & et$table$padj < 0.01
points(et$table$logFC[tol10b.depleted], -10*log10(et$table$PValue[tol10b.depleted]), col="green")
```

Now with this design matrix, we will fit the data into a linear model. For this purpose, we will need the contrast matrix, which is generated in the next problem. Note that the design matrix has 72 rows, which are for each different sample.

```{r contrasts}
# We make the contrast matrix
contr.SEQ <- makeContrasts(
  femaleVsMale = female - male,
  levels = colnames(designSEQ))

# Explore the contrast matrix
contr.SEQ
```

Apply the voom transformation. This step adjusts the mean variance trend. 
Recall that RNA seq data needs to have the heteroscedasticity removed.

```{r voom}
# Create the voom object. Set plot = T, to observe the plot.
voomObj <- voom(maleVSfemale_normalized, designSEQ, plot = T)
```

### Create graphs to observe the effect of heteroscedasticity removal

```{r heteroscedasticity}
# With our voom-transformed object, we perform further limma pipeline steps.
# create linear models of designSEQ
vfit <- lmFit(voomObj, design = designSEQ)
# fit the contrast to the model with contrasts.fit()
vfit <- contrasts.fit(vfit, contrast = contr.SEQ)
# apply a Bayesian correction to the model, with threshold 0.01
efit <- eBayes(vfit, 0.01)

# Compare before and after removing the heteroscedasticity effects
par(mfrow = c(1,2))
voomObj <- voom(maleVSfemale_normalized, designSEQ, plot = T)
plotSA(efit, main = "Heteroscedasticity removed")
```

### Use PCA to inspect the sample distribution

Create a figure/s and discuss if the samples separate by group and in which PC axis. Do you think there could be a batch effect present? Any other observations of note. 

```{r PCA}
# We notice that the voomObj$E contains the count data, which will be compatible for creating a pca object. 
pca1 <- voomObj$E %>% 
  t() %>% 
  prcomp()

# Using the pca1 object, plot the PCA data categorized by the three groups
# Save the plot as an object
p1 <- autoplot(pca1, data = maleVSfemale_normalized$samples) +
	labs(title = "PCA Plot by Biological Sex") +
	geom_point(aes(colour = maleVSfemale_normalized$samples$sex), size=2.5) +
	labs(colour = "") +
  theme_bw() +
  theme(legend.position="bottom",
        plot.margin = margin(2,2,2,2, "cm"))

pdf(file="PCA_EGFRexp_20230623.pdf", width = 6, height = 6)
plot(p1)
dev.off()

```


### Limma

Apply the limma pipeline/workflow to calculate differential expressed genes and display a nicely formatted datatable. Consider which columns you should include or remove.

```{r limma}
# First, we generate a table of top-ranked genes from linear modeling, using the topTable() function. We will set the number of top genes as 50.
tT <- topTable(efit, adjust = "fdr", sort.by = "B", number = 50)

# Examine the table object using head()
head(tT)

# It seems that the first column is redundant, so we can remove them.
# Now we make the interactive data table.

data_table <- tT[, c(2:7)] %>% 
  # Let's round the numbers to two decimal places
	mutate_if(is.numeric, funs(round(., 2))) %>% 
	datatable(options = list(scrollX = TRUE))

# Examine the data table object
data_table
```

### Heatmap using the top 100 genes

```{r fig.height = 7, fig.width = 6}
# Since we want top  genes, we need to set a specific var.cutoff value for the varFilter() function. To do this, we need to calculate the following:
ratio <- 1 - 25 / nrow(voomObj$E)

# put the ratio as the value for var.cutoff
varG <- varFilter(voomObj$E, var.cutoff = ratio)

# confirm that varG has 20 rows
nrow(varG)

# Set the annotation. We already saw in problem #8 that there are no batch effects, so we only set disease state in the annotation column.
anno.col <- data.frame(row.names = colnames(varG),
                       pheno = as.factor(maleVSfemale$samples$sex))

# Plot the heatmap and make 3 clusters to examine how well the clusters catch the disease states.
pheatmap(varG, 
         scale = "row", 
         annotation_col = anno.col, 
				 labels_row = maleVSfemale$genes[c(row.names(varG)),],
				 cutree_cols = 2)
```