1.0 Loading Required Libraries
library(splitstackshape)
library(dplyr)
library(ggplot2)
library(VennDiagram)
library(RColorBrewer)
library(gridExtra)
library(cowplot)
library(knitr)
library(kableExtra)
library(tidyverse)
library(Hmisc)
library(ggridges)
library(venn)
library(ggthemes)
library(stringr)
library(chron)
library(magicfor)
library(ggtext)
library(reshape2)
library(plyr)
library(formattable)
library(data.table)
library(ggrepel)
require(MASS)
require(scales)
#Setting working directory
setwd("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/dataExtractions/")
2.0 Data Loading, Cleaning and Preprocessing
2.1 Loading and preprocessing benchmark data from all runs - data extracted from the system time command output
#run_results <- read.table(pipe ('ssh atlas "cat /home/asiimwe/projects/run_env/alpha_star_wasp_comparison/benchmark_results_all_runs.txt"'), sep="\t", header = FALSE, fill=TRUE, quote = "") #remote call
run_results <- read.table("benchmark_results_all_runs.txt", sep="\t", header = FALSE, fill=TRUE, quote = "")
run_results <- as.data.frame(run_results[-c(1),])
run_results$V4 <- NULL
colnames(run_results) <- c("Sample", "Run", "Thread", "Param","Value")
run_results$Param <- as.character(run_results$Param)
run_results$Param[run_results$Param == "Elapsed (wall clock) time (h"] <- "Wall_Clock"
run_results$Param[run_results$Param == "Maximum resident set size (kbytes)"] <- "Memory"
run_results$Value <- as.character(run_results$Value)
run_results$Value <- gsub("mm:ss or m:ss): ", "", run_results$Value)
run_results$Value <- gsub(" ", "", run_results$Value)
#run_results$Value
unique(run_results$Run)
run_results$Run <- gsub("_Runs", "", run_results$Run)
unique(run_results$Run)
run_results_STAR <- run_results %>% filter(Run == "STAR")#STAR base - no variants
run_results_STAR_WASP <- run_results %>% filter(Run == "STAR_WASP") #STAR WASP
run_results_WASP <- run_results %>% filter(Run == "WASP") #WASP - change naming globally
run_results_STAR$Run <- NULL
run_results_STAR_WASP$Run <- NULL
run_results_WASP$Run <- NULL
run_results_STAR <- spread(run_results_STAR, key = Thread, value = Value)
colnames(run_results_STAR) <- c("Sample", "Param", "16 Threads-STAR", "32 Threads-STAR", "8 Threads-STAR")
run_results_STAR_WASP <- spread(run_results_STAR_WASP, key = Thread, value = Value)
colnames(run_results_STAR_WASP) <- c("Sample", "Param", "16 Threads-STAR_WASP", "32 Threads-STAR_WASP", "8 Threads-STAR_WASP")
run_results_WASP <- spread(run_results_WASP, key = Thread, value = Value)
colnames(run_results_WASP) <- c("Sample", "Param", "16 Threads-WASP", "32 Threads-WASP", "8 Threads-WASP")
data <- merge(run_results_STAR, run_results_STAR_WASP, by = c("Sample", "Param"))
data <- merge(data, run_results_WASP, by = c("Sample", "Param"))
#write.csv(data, file = "/home/asiimwe/projects/run_env/alpha_star_wasp_comparison/benchmark_results_all_runs_pivoted.csv")
#Plotting speeds and memory
data_melted <- melt(data, id.vars=c("Sample", "Param"))
data_melted <- cSplit(data_melted,"variable", "-", direction="wide")
colnames(data_melted) <- c("Sample", "Param", "Value", "Threads", "Run")
data_melted$Param <- as.character(data_melted$Param)
#data_melted_subset_clock <- data_melted_subset %>% filter(Param == "Wall_Clock")
#data_melted_subset_clock$Value <- gsub("\\.", ":", data_melted_subset_clock$Value)
#data_melted_subset_clock$Value
#data_melted_subset_clock$Value <- chron(times. = data_melted_subset_clock$Value)#rejecting format - expects h:m:s but the third argument exceeds 60 in some instances
###########------------------------------------------------------------------------------
# #compressed dataset - removing parameters that are not needed
# #parameters to remove:
# param_remove <- c(
# "Average resident set size (kbytes)",
# "Average shared text size (kbytes)",
# "Average stack size (kbytes)",
# "Average total size (kbytes)",
# "Average unshared data size (kbytes)",
# "Exit status",
# "Major (requiring I/O) page faults",
# "Signals delivered",
# "Socket messages received",
# "Socket messages sent",
# "Swaps"
# )
#
# dataset_subset <- data %>% filter(Param %nin% param_remove)
# write.csv(dataset_subset, file = "/home/asiimwe/projects/run_env/alpha_star_wasp_comparison/run_results_subset_pivoted.csv")
###########------------------------------------------------------------------------------
#Benchmark plots:
#Parameters plotted: Elapsed (wall clock) time (h:mm:ss or m:ss) and Maximum resident set size (kbytes)
2.2 Loading Log.final.out results for all runs
# Loading Log.final.out results to extract required parameters
final.log.results <- read.table("final_log_results_all_runs.txt", sep="|", header = FALSE, fill=TRUE, quote = "")
#final.log.results <- read.table(pipe('ssh atlas "cat /home/asiimwe/projects/run_env/alpha_star_wasp_comparison/final_log_results_all_runs.txt"'), sep="|", header = FALSE, fill=TRUE, quote = "")
colnames(final.log.results) <- unlist(final.log.results[c(1),])
final.log.results <- as.data.frame(final.log.results[-c(1),])
final.log.results$Sample <- as.character(final.log.results$Sample)
final.log.results$Run <- as.character(final.log.results$Run)
final.log.results$Thread <- as.character(final.log.results$Thread)
final.log.results$Param <- as.character(final.log.results$Param)
colnames(final.log.results)
final.log.results$Param <- str_trim(final.log.results$Param)
final.log.results_filt <- final.log.results %>% filter(Value != "")
final.log.results_filt$Run <- as.character(final.log.results_filt$Run)
unique(final.log.results_filt$Run)
final.log.results_filt$Run[final.log.results_filt$Run == "STAR_Runs"] <- "STAR"
final.log.results_filt$Run[final.log.results_filt$Run == "STAR_WASP_Runs"] <- "STAR_WASP"
final.log.results_filt$Run[final.log.results_filt$Run == "WASP_Runs"] <- "WASP"
colnames(final.log.results_filt)[3] <- "Threads"
final.log.results_filt$Threads <- as.character(final.log.results_filt$Threads)
final.log.results_filt$Threads[final.log.results_filt$Threads == "16threads"] <- "16 Threads"
final.log.results_filt$Threads[final.log.results_filt$Threads == "32threads"] <- "32 Threads"
final.log.results_filt$Threads[final.log.results_filt$Threads == "8threads"] <- "8 Threads"
#Extracting number of reads
final.log.results_filt_input_reads <- final.log.results_filt %>% filter(Param == "Number of input reads")
final.log.results_filt_mapping_speed <- final.log.results_filt %>% filter(Param == "Mapping speed, Million of reads per hour")
final.log.results_filt_average_input_read_length <- final.log.results_filt %>% filter(Param == "Average input read length")
unique(data_melted$Sample)
unique(data_melted$Run)
unique(data_melted$Threads)
unique(final.log.results_filt_input_reads$Sample)
unique(final.log.results_filt_input_reads$Run)
unique(final.log.results_filt_input_reads$Threads)
data_melted_join_pass1 <- inner_join(data_melted, final.log.results_filt_input_reads, by = c("Sample", "Run", "Threads"))
data_melted_join_pass2 <- inner_join(data_melted_join_pass1, final.log.results_filt_mapping_speed, by = c("Sample", "Run", "Threads"))
data_melted_beta <- inner_join(data_melted_join_pass2, final.log.results_filt_average_input_read_length, by = c("Sample", "Run", "Threads"))
colnames(data_melted_beta)
colnames(data_melted_beta) <- c( "Sample", "Param", "Value", "Threads", "Run", "Param.y", "Number_of_input_reads", "Param.x.x", "Mapping_speed_Million_of_reads_per_hour", "Param.y.y", "Average_input_read_length")
data_melted_beta$Param.y <- NULL
data_melted_beta$Param.x.x <- NULL
data_melted_beta$Param.y.y <- NULL
nrow(data_melted)
nrow(final.log.results_filt_input_reads)
nrow(final.log.results_filt_mapping_speed)
nrow(final.log.results_filt_average_input_read_length)
nrow(data_melted_beta)
#Removing samples not considered for our analyses
data_melted_beta <- data_melted_beta %>% filter(Sample != "HG00514" & Sample != "NA12878_Small" & Sample != "NA12878_RAMPAGE" & Sample != "NA12878_RAMPAGE_Rep")
#write.csv(data_melted_beta, file = "/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/data_melted_beta.csv")
Preparing data for Manuscript summary table
data_melted_beta <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/data_melted_beta.csv")
data_melted_beta$X <- NULL
## Preparing data for LaTeX tables
# Installing LaTeX packages required for the kableExtra/LaTeX tables (run once if any are missing):
# missing_sty <- c("ae.sty", "fullpage.sty", "moresize.sty", "colortbl.sty", "soul.sty",
#                  "adjustbox.sty", "collectbox.sty", "pdflscape.sty", "grfext.sty",
#                  "tabu.sty", "threeparttable.sty")
# for (sty in missing_sty) {
#   tinytex::parse_install(text = paste0("! LaTeX Error: File `", sty, "' not found."))
# }
#data_melted_beta <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_comparison/data_melted_beta.csv")
unique(data_melted_beta$Run)
data_melted_beta$Threads <- ordered(data_melted_beta$Threads , levels = c("8 Threads", "16 Threads", "32 Threads"))
unique(data_melted_beta$Param)
data_melted_beta_subset <- data_melted_beta %>% filter(Param == "Wall_Clock" | Param == "Memory")
data_melted_beta_subset$Average_input_read_length <- NULL
#data_melted_beta_subset$threads_run <- paste0(data_melted_beta_subset$Threads, "_", data_melted_beta_subset$Run)
#data_melted_beta_subset$Param <- NULL
#data_melted_beta_subset$Threads <- NULL
#data_melted_beta_subset$Run <- NULL
#-----------------------STAR-----------------------
data_melted_beta_subset_STAR <- data_melted_beta_subset %>% filter(Run == "STAR")
data_melted_beta_subset_STAR_Param <- data_melted_beta_subset_STAR[,c(1:4)] #check environment differences
data_melted_beta_subset_STAR_reads <- unique(data_melted_beta_subset_STAR[,c(1,6)])
data_melted_beta_subset_STAR_speed <- unique(data_melted_beta_subset_STAR[,c(1,4, 7)])
data_melted_beta_subset_STAR_Param$comb <- paste0(data_melted_beta_subset_STAR_Param$Param, "_", data_melted_beta_subset_STAR_Param$Threads, "_", "STAR")
data_melted_beta_subset_STAR_Param$Param <- NULL
data_melted_beta_subset_STAR_Param$Threads <- NULL
data_melted_beta_subset_STAR_Param <- as.data.frame(data_melted_beta_subset_STAR_Param)
data_melted_beta_subset_STAR_Param_spread <- spread(data_melted_beta_subset_STAR_Param, key = comb, value = Value)
data_melted_beta_subset_STAR_Param_spread <- data_melted_beta_subset_STAR_Param_spread %>% dplyr::group_by(Sample) %>%
dplyr::summarise_all(purrr::discard, is.na)
data_melted_beta_subset_STAR_Param_spread <- as.data.frame(data_melted_beta_subset_STAR_Param_spread)
data_melted_beta_subset_STAR_speed_spread <- spread(data_melted_beta_subset_STAR_speed, key = Threads, value = Mapping_speed_Million_of_reads_per_hour)
colnames(data_melted_beta_subset_STAR_speed_spread)
colnames(data_melted_beta_subset_STAR_speed_spread) <- c("Sample", "Mapping_Speed_8 Threads_STAR", "Mapping_Speed_16 Threads_STAR", "Mapping_Speed_32 Threads_STAR")
STAR_tab_subset <- inner_join(data_melted_beta_subset_STAR_reads, data_melted_beta_subset_STAR_Param_spread, by = "Sample")
STAR_tab_subset <- inner_join(STAR_tab_subset, data_melted_beta_subset_STAR_speed_spread, by = "Sample")
STAR_tab_subset <- STAR_tab_subset[,c(1,2,3,4,5,9,10,11, 6,7,8)]
#----------------------STAR_WASP-----------------------
data_melted_beta_subset_STAR_WASP <- data_melted_beta_subset %>% filter(Run == "STAR_WASP")
data_melted_beta_subset_STAR_WASP_Param <- data_melted_beta_subset_STAR_WASP[,c(1:4)]
data_melted_beta_subset_STAR_WASP_reads <- unique(data_melted_beta_subset_STAR_WASP[,c(1,6)])
data_melted_beta_subset_STAR_WASP_speed <- unique(data_melted_beta_subset_STAR_WASP[,c(1,4, 7)])
data_melted_beta_subset_STAR_WASP_Param$comb <- paste0(data_melted_beta_subset_STAR_WASP_Param$Param, "_", data_melted_beta_subset_STAR_WASP_Param$Threads, "_", "STAR_WASP")
data_melted_beta_subset_STAR_WASP_Param$Param <- NULL
data_melted_beta_subset_STAR_WASP_Param$Threads <- NULL
data_melted_beta_subset_STAR_WASP_Param <- as.data.frame(data_melted_beta_subset_STAR_WASP_Param)
#rownames(data_melted_beta_subset_STAR_WASP_Param) <- 1:nrow(data_melted_beta_subset_STAR_WASP_Param)
#data_melted_beta_subset_STAR_WASP_Param$id <- 1:nrow(data_melted_beta_subset_STAR_WASP_Param)
data_melted_beta_subset_STAR_WASP_Param_spread <- spread(data_melted_beta_subset_STAR_WASP_Param, key = comb, value = Value)
data_melted_beta_subset_STAR_WASP_Param_spread$id <- NULL
data_melted_beta_subset_STAR_WASP_Param_spread <- data_melted_beta_subset_STAR_WASP_Param_spread %>% dplyr::group_by(Sample) %>%
dplyr::summarise_all(purrr::discard, is.na)
data_melted_beta_subset_STAR_WASP_Param_spread <- as.data.frame(data_melted_beta_subset_STAR_WASP_Param_spread)
data_melted_beta_subset_STAR_WASP_speed_spread <- spread(data_melted_beta_subset_STAR_WASP_speed, key = Threads, value = Mapping_speed_Million_of_reads_per_hour)
colnames(data_melted_beta_subset_STAR_WASP_speed_spread)
colnames(data_melted_beta_subset_STAR_WASP_speed_spread) <- c("Sample", "Mapping_Speed_8 Threads_STAR_WASP", "Mapping_Speed_16 Threads_STAR_WASP", "Mapping_Speed_32 Threads_STAR_WASP")
STAR_WASP_tab_subset <- inner_join(data_melted_beta_subset_STAR_WASP_Param_spread, data_melted_beta_subset_STAR_WASP_speed_spread, by = "Sample")
STAR_WASP_tab_subset <- STAR_WASP_tab_subset[,c(1,2,3,4,8,9,10, 5,6,7)]
#-------------------WASP-------------------------------
data_melted_beta_subset_WASP <- data_melted_beta_subset %>% filter(Run == "WASP")
data_melted_beta_subset_WASP_Param <- data_melted_beta_subset_WASP[,c(1:4)]
data_melted_beta_subset_WASP_reads <- unique(data_melted_beta_subset_WASP[,c(1,6)])
data_melted_beta_subset_WASP_speed <- unique(data_melted_beta_subset_WASP[,c(1,4, 7)])
data_melted_beta_subset_WASP_Param$comb <- paste0(data_melted_beta_subset_WASP_Param$Param, "_", data_melted_beta_subset_WASP_Param$Threads, "_", "WASP")
data_melted_beta_subset_WASP_Param$Param <- NULL
data_melted_beta_subset_WASP_Param$Threads <- NULL
data_melted_beta_subset_WASP_Param <- as.data.frame(data_melted_beta_subset_WASP_Param)
#rownames(data_melted_beta_subset_WASP_Param) <- 1:nrow(data_melted_beta_subset_WASP_Param)
#data_melted_beta_subset_WASP_Param$id <- 1:nrow(data_melted_beta_subset_WASP_Param)
data_melted_beta_subset_WASP_Param_spread <- spread(data_melted_beta_subset_WASP_Param, key = comb, value = Value)
data_melted_beta_subset_WASP_Param_spread$id <- NULL
data_melted_beta_subset_WASP_Param_spread <- data_melted_beta_subset_WASP_Param_spread %>% dplyr::group_by(Sample) %>%
dplyr::summarise_all(purrr::discard, is.na)
data_melted_beta_subset_WASP_Param_spread <- as.data.frame(data_melted_beta_subset_WASP_Param_spread)
data_melted_beta_subset_WASP_speed_spread <- spread(data_melted_beta_subset_WASP_speed, key = Threads, value = Mapping_speed_Million_of_reads_per_hour)
colnames(data_melted_beta_subset_WASP_speed_spread)
colnames(data_melted_beta_subset_WASP_speed_spread) <- c("Sample", "Mapping_Speed_8 Threads_WASP", "Mapping_Speed_16 Threads_WASP", "Mapping_Speed_32 Threads_WASP")
WASP_tab_subset <- inner_join(data_melted_beta_subset_WASP_Param_spread, data_melted_beta_subset_WASP_speed_spread, by = "Sample")
WASP_tab_subset <- WASP_tab_subset[,c(1,2,3,4,8,9,10, 5,6,7)]
STAR_tab_subset <- as.data.frame(STAR_tab_subset)
STAR_WASP_tab_subset <- as.data.frame(STAR_WASP_tab_subset)
WASP_tab_subset <- as.data.frame(WASP_tab_subset)
n_way_merger0 <- merge(STAR_tab_subset, STAR_WASP_tab_subset, by = "Sample")
n_way_merger <- merge(n_way_merger0, WASP_tab_subset, by = "Sample")
#n_way_merger$Size_holder1 <- ""
#n_way_merger$Size_holder2 <- ""
colnames(n_way_merger)
n_way_merger <- n_way_merger[, c("Sample", "Number_of_input_reads", #"Sample", "Size_holder1","Size_holder2", "Number_of_input_reads",
"Memory_8 Threads_STAR","Memory_16 Threads_STAR", "Memory_32 Threads_STAR",
"Mapping_Speed_8 Threads_STAR","Mapping_Speed_16 Threads_STAR", "Mapping_Speed_32 Threads_STAR",
"Wall_Clock_8 Threads_STAR","Wall_Clock_16 Threads_STAR", "Wall_Clock_32 Threads_STAR",
"Memory_8 Threads_STAR_WASP", "Memory_16 Threads_STAR_WASP", "Memory_32 Threads_STAR_WASP",
"Mapping_Speed_8 Threads_STAR_WASP", "Mapping_Speed_16 Threads_STAR_WASP", "Mapping_Speed_32 Threads_STAR_WASP",
"Wall_Clock_8 Threads_STAR_WASP", "Wall_Clock_16 Threads_STAR_WASP", "Wall_Clock_32 Threads_STAR_WASP",
"Memory_8 Threads_WASP", "Memory_16 Threads_WASP", "Memory_32 Threads_WASP",
"Mapping_Speed_8 Threads_WASP", "Mapping_Speed_16 Threads_WASP", "Mapping_Speed_32 Threads_WASP",
"Wall_Clock_8 Threads_WASP", "Wall_Clock_16 Threads_WASP", "Wall_Clock_32 Threads_WASP"
)]
#knitr::kable(n_way_merger, "latex")
# Smoothing plot function: X-spline geom/stat wrappers around graphics::xspline() for drawing smoothed lines
geom_xspline <- function(mapping = NULL, data = NULL, stat = "xspline",
position = "identity", show.legend = NA,
inherit.aes = TRUE, na.rm = TRUE,
spline_shape=-0.25, open=TRUE, rep_ends=TRUE, ...) {
layer(
geom = GeomXspline,
mapping = mapping,
data = data,
stat = stat,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(spline_shape=spline_shape,
open=open,
rep_ends=rep_ends,
...)
)
}
GeomXspline <- ggproto("GeomXspline", GeomLine,
required_aes = c("x", "y"),
default_aes = aes(colour = "black", size = 0.5, linetype = 1, alpha = NA)
)
stat_xspline <- function(mapping = NULL, data = NULL, geom = "line",
position = "identity", show.legend = NA, inherit.aes = TRUE,
spline_shape=-0.25, open=TRUE, rep_ends=TRUE, ...) {
layer(
stat = StatXspline,
data = data,
mapping = mapping,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(spline_shape=spline_shape,
open=open,
rep_ends=rep_ends,
...
)
)
}
StatXspline <- ggproto("StatXspline", Stat,
required_aes = c("x", "y"),
compute_group = function(self, data, scales, params,
spline_shape=-0.25, open=TRUE, rep_ends=TRUE) {
tf <- tempfile(fileext=".png")
png(tf)
plot.new()
tmp <- xspline(data$x, data$y, spline_shape, open, rep_ends, draw=FALSE, NA, NA)
invisible(dev.off())
unlink(tf)
data.frame(x=tmp$x, y=tmp$y)
}
)
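For reference, a minimal usage sketch of the X-spline smoother defined above (the toy data frame is illustrative only and not part of the benchmarking data):
#Quick check that geom_xspline() draws a smoothed line through a small series
set.seed(1)
demo_df <- data.frame(x = 1:10, y = cumsum(rnorm(10)))
ggplot(demo_df, aes(x, y)) +
  geom_point() +
  geom_xspline(spline_shape = -0.4) + #negative spline_shape gives an interpolating curve through the points
  theme_bw()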
3.0 Data Visualization
3.1 Memory
data_melted_beta <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/data_melted_beta.csv")
#data_melted_beta <- read.csv(pipe('ssh atlas "cat /home/asiimwe/projects/run_env/alpha_star_wasp_comparison/data_melted_beta.csv"'))
data_melted_beta$X <- NULL
colnames(data_melted_beta)
## [1] "Sample"
## [2] "Param"
## [3] "Value"
## [4] "Threads"
## [5] "Run"
## [6] "Number_of_input_reads"
## [7] "Mapping_speed_Million_of_reads_per_hour"
## [8] "Average_input_read_length"
#"Sample" "Param" "Value" "Threads" "Run" "Mapping_speed_Million_of_reads_per_hour" "Average_input_read_length"
## Cleaning up Sample names
unique(data_melted_beta$Sample)
## [1] HG00512 HG00513
## [3] HG00731 HG00732
## [5] HG00733 NA12878_Nucleus_nonPolyA
## [7] NA12878_Nucleus_nonPolyA_Rep NA12878_Nucleus_PolyA
## [9] NA12878_Nucleus_PolyA_Rep NA12878_PolyA
## [11] NA12878_PolyA_Rep NA12878_Total
## [13] NA12878_Total_Rep NA19238
## [15] NA19239 NA19240
## 16 Levels: HG00512 HG00513 HG00731 HG00732 HG00733 ... NA19240
#Removing the NA12878_ prefix from the names of NA12878-derived samples
data_melted_beta$Sample <- gsub("NA12878_", "", data_melted_beta$Sample)
data_melted_beta$Sample <- gsub("Rep", "R2", data_melted_beta$Sample)
data_melted_beta$Sample[data_melted_beta$Sample == "Total"] <- "Total_R1"
data_melted_beta$Sample[data_melted_beta$Sample == "PolyA"] <- "PolyA_R1"
data_melted_beta$Sample[data_melted_beta$Sample == "Nucleus_PolyA"] <- "Nucleus_PolyA_R1"
data_melted_beta$Sample[data_melted_beta$Sample == "Nucleus_nonPolyA"] <- "Nucleus_nonPolyA_R1"
#Sample names after cleanup and filtering:
unique(data_melted_beta$Sample)
## [1] "HG00512" "HG00513" "HG00731"
## [4] "HG00732" "HG00733" "Nucleus_nonPolyA_R1"
## [7] "Nucleus_nonPolyA_R2" "Nucleus_PolyA_R1" "Nucleus_PolyA_R2"
## [10] "PolyA_R1" "PolyA_R2" "Total_R1"
## [13] "Total_R2" "NA19238" "NA19239"
## [16] "NA19240"
data_melted_beta$Threads <- as.character(data_melted_beta$Threads )
data_melted_beta$Threads[data_melted_beta$Threads == "8threads" ] <- "8 Threads"
data_melted_beta$Threads[data_melted_beta$Threads == "16threads" ] <- "16 Threads"
data_melted_beta$Threads[data_melted_beta$Threads == "32threads" ] <- "32 Threads"
unique(data_melted_beta$Threads)
## [1] "16 Threads" "32 Threads" "8 Threads"
#Data cleanup
str(data_melted_beta)
## 'data.frame': 3168 obs. of 8 variables:
## $ Sample : chr "HG00512" "HG00512" "HG00512" "HG00512" ...
## $ Param : Factor w/ 22 levels "Average resident set size (kbytes)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Value : Factor w/ 1387 levels "0","1","1:01:58",..: 1 1 1 1 1 1 258 1230 531 1 ...
## $ Threads : chr "16 Threads" "16 Threads" "16 Threads" "16 Threads" ...
## $ Run : Factor w/ 3 levels "STAR","STAR_WASP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Number_of_input_reads : int 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 ...
## $ Mapping_speed_Million_of_reads_per_hour: num 1226 1226 1226 1226 1226 ...
## $ Average_input_read_length : int 102 102 102 102 102 102 102 102 102 102 ...
data_melted_beta$Sample <- as.character(data_melted_beta$Sample)
data_melted_beta$Param <- as.character(data_melted_beta$Param)
data_melted_beta$Value <- gsub("%", "", data_melted_beta$Value)
#Reordering threads to appear in the correct order (8, 16, 32)
data_melted_beta$Threads <- as.factor(data_melted_beta$Threads)
data_melted_beta$Threads <- ordered(data_melted_beta$Threads , levels = c("8 Threads", "16 Threads", "32 Threads"))
data_melted_beta_memory <- as.data.frame(data_melted_beta %>% filter(Param == "Memory"))
unique(data_melted_beta_memory$Param)
## [1] "Memory"
data_melted_beta_memory$Threads <- ordered(data_melted_beta_memory$Threads , levels = c("8 Threads", "16 Threads", "32 Threads"))
data_melted_beta_memory$Value <- as.numeric(data_melted_beta_memory$Value)
data_melted_beta_memory$Sample <- as.factor(data_melted_beta_memory$Sample)
data_melted_beta_memory$Run <- as.factor(data_melted_beta_memory$Run)
data_melted_beta_memory <- data_melted_beta_memory[order(data_melted_beta_memory$Sample, data_melted_beta_memory$Value),]
#range(data_melted_beta_memory$Value) #30649908 38445368
data_melted_beta_memory <- data_melted_beta_memory %>%
group_by(Sample, Threads) %>%
dplyr::mutate(mem_order = sum(Value)) %>%
arrange(mem_order)
#class(data_melted_beta_memory) - "grouped_df" "tbl_df" "tbl" "data.frame"
data_melted_beta_memory <- as.data.frame(data_melted_beta_memory)
#Memory
# data_melted_beta_memory %>% filter(Sample != "NA12878_Small") %>%
# #mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
# ggplot(aes(x = reorder(Sample, (Value)/1000000), y = (Value)/1000000, group=Run)) + #Dividing by 1000000 to convert kbytes to GB
# geom_line(aes(color = Run, linetype = Run)) +
# scale_color_manual(values = c("black", "gray10", "gray50")) +
# facet_wrap(~Threads) +
# labs(y = "Memory (Gigabytes)", x="") + scale_y_continuous() +
# theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
# theme(strip.background =element_rect(fill="white", colour = "white"))+
# theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
# #theme(strip.text.x = element_blank()) +
# theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) #+
# #geom_text(x = 6, y = max(data_melted_beta_memory$Value), aes(label = label),
# # data = dat_text, check_overlap = TRUE, inherit.aes = FALSE)
global_colors <- c("darkorange3", "dodgerblue4", "firebrick4")
#global_colors <- c('#999999','#E69F00', '#56B4E9')
#Overall we shall use the same sample order across the board, based on Average read length, Number of Input Reads and Sample pairs, for better comparison
STAR_subset <- data_melted_beta %>% filter(Run == "STAR")
df_orders <- STAR_subset[order(STAR_subset$Sample, STAR_subset$Average_input_read_length, STAR_subset$Number_of_input_reads),]
order_df <- as.character(rev(unique(df_orders$Sample)))
#data_melted_beta_memory <-data_melted_beta_memory[
# order( data_melted_beta_memory[,6], data_melted_beta_memory[,8] ),
#]
# ggplot(aes(x= factor(Sample, levels=order_df), y=Average_input_read_length))
data_melted_beta_memory %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
#ggplot(aes(x = reorder(Sample, (Value)/1000000), y = (Value)/1000000, group=Run,color=factor(Run))) + #note that memory is in kb, dividing by 1000000 to convert it to gb (Maximum resident set size (kbytes))
ggplot(aes(x= factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) +
geom_point(aes(colour=factor(Run)), alpha = 0.9, size=1) + #geom_smooth(aes(colour=factor(Run)), fill = "white", alpha = 0.1, size = 0.1) +
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = "Memory (Gigabytes)", x="") + scale_y_continuous(limits = c(0,40)) + #rescaling to range from 0-40 GB
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Without re-scaling
data_melted_beta_memory %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
#ggplot(aes(x = reorder(Sample, (Value)/1000000), y = (Value)/1000000, group=Run,color=factor(Run))) + #note that memory is in kb, dividing by 1000000 to convert it to gb (Maximum resident set size (kbytes))
# ggplot(aes(x= fct_reorder2(Sample, Average_input_read_length, Value, .desc = FALSE), y = (Value)/1000000, group=Run,color=factor(Run))) +
ggplot(aes(x= factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) +
geom_point(aes(colour=factor(Run)), size=0.7) + #geom_smooth(aes(colour=factor(Run)), fill = "white", size = 0.1) +
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = "Memory (Gigabytes)", x="") + scale_y_continuous() + #rescaling to range from 0-40 GB
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 30-40 scale
data_melted_beta_memory %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) + #note that memory is in kb, dividing by 1000000 to convert it to gb (Maximum resident set size (kbytes))
geom_point(aes(colour=factor(Run)), size=0.7) + #geom_smooth(aes(colour=factor(Run)), fill = "white", size = 0.1) +
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = "Memory (Gigabytes)", x="") + scale_y_continuous(limits = c(30,40)) + #rescaling to range from 0-40 GB
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Representation 2
data_melted_beta_memory %>%
#mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run)) +
geom_point(aes(color=factor(Threads),shape=factor(Run)), size=3, alpha=0.9)+
#geom_line()+
scale_shape_manual(values=c("STAR"=3, "WASP"=1, "STAR_WASP"=17))+ #WASP 16 #"STAR"=4, "WASP"=1, "STAR_WASP"=17
#scale_color_manual(values=c('#999999','#E69F00', '#56B4E9'))+
scale_color_manual(values = global_colors) +
# facet_wrap(~Threads) +
#labs(y = "Mapping Speed (Million of reads/hour", x="") +
labs(y = "Memory (Gigabytes)", x="")+ scale_y_continuous(limits = c(30,40)) +
theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
#theme(strip.text.x = element_blank()) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) #+
#geom_text(x = 6, y = max(data_melted_beta_memory$Value), aes(label = label),
# data = dat_text, check_overlap = TRUE, inherit.aes = FALSE)
#Plot 1 - Splitting up plots by thread
# # 8 Threads
# data_melted_beta_memory %>% filter(Threads == "8 Threads") %>%
# ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run)) +
# geom_point(aes(color=factor(Run), fill=factor(Run)), size=2, shape = 3)+
# scale_color_manual(values = global_colors) +
# labs(y = "Memory (Gigabytes)", x="")+ scale_y_continuous(limits = c(30,40)) +
# theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
# theme(strip.background =element_rect(fill="white", colour = "white"))+
# theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
# theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
#
#
#
#
# # 16 Threads
# data_melted_beta_memory %>% filter(Threads == "16 Threads") %>%
# ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run)) +
# geom_point(aes(color=factor(Run), fill=factor(Run)), size=2, shape = 17)+
# scale_color_manual(values = global_colors) +
# labs(y = "Memory (Gigabytes)", x="")+ scale_y_continuous(limits = c(30,40)) +
# theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
# theme(strip.background =element_rect(fill="white", colour = "white"))+
# theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
# theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
#
#
# # 32 threads
# data_melted_beta_memory %>% filter(Threads == "32 Threads") %>%
# ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run)) +
# geom_point(aes(color=factor(Run), fill=factor(Run)), size=2, shape = 16)+
# scale_color_manual(values = global_colors) +
# labs(y = "Memory (Gigabytes)", x="")+ scale_y_continuous(limits = c(30,40)) +
# theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
# theme(strip.background =element_rect(fill="white", colour = "white"))+
# theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
# theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 8 Threads
data_melted_beta_memory %>% filter(Threads == "8 Threads") %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+
scale_shape_manual(values=c(3, 17, 16))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Memory - Gigabytes", "\n", "(8 Threads)"), x="") + scale_y_continuous(limits = c()) +
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 16 Threads
data_melted_beta_memory %>% filter(Threads == "16 Threads") %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+
scale_shape_manual(values=c(3, 17, 16))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Memory - Gigabytes", "\n", "(16 Threads)"), x="") + scale_y_continuous(limits = c()) +
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 32 threads
data_melted_beta_memory %>% filter(Threads == "32 Threads") %>%
# mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = (Value)/1000000, group=Run,color=factor(Run))) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+
scale_shape_manual(values=c(3, 17, 16))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Memory - Gigabytes", "\n", "(32 Threads)"), x="") + scale_y_continuous(limits = c()) +
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
#extracting max memory used per run and thread as a gauge for qsub memory allocation
data_melted_beta_memory %>% filter(Run == "STAR" & Threads == "8 Threads") %>% dplyr::select(Value) %>% max() #31041464 run2-31041356
## [1] 31041356
data_melted_beta_memory %>% filter(Run == "STAR" & Threads == "16 Threads") %>% dplyr::select(Value) %>% max() #32991776 run2-32991760
## [1] 32991760
data_melted_beta_memory %>% filter(Run == "STAR" & Threads == "32 Threads") %>% dplyr::select(Value) %>% max() #36892708 run2-36892688
## [1] 36892688
data_melted_beta_memory %>% filter(Run == "WASP" & Threads == "8 Threads") %>% dplyr::select(Value) %>% max() #31041288 run2-31117428
## [1] 31117428
data_melted_beta_memory %>% filter(Run == "WASP" & Threads == "16 Threads") %>% dplyr::select(Value) %>% max() #32991976 run2-33070648
## [1] 33070648
data_melted_beta_memory %>% filter(Run == "WASP" & Threads == "32 Threads") %>% dplyr::select(Value) %>% max() #36892620 run2-36977224
## [1] 36977224
data_melted_beta_memory %>% filter(Run == "STAR_WASP" & Threads == "8 Threads") %>% dplyr::select(Value) %>% max() #31519628 run2-31473172
## [1] 31473172
data_melted_beta_memory %>% filter(Run == "STAR_WASP" & Threads == "16 Threads") %>% dplyr::select(Value) %>% max() #33828116 run2-33781520
## [1] 33781520
data_melted_beta_memory %>% filter(Run == "STAR_WASP" & Threads == "32 Threads") %>% dplyr::select(Value) %>% max() #38445368 run2-38398332
## [1] 38398332
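For reference, the same per-run maxima can be obtained in a single grouped summary (a compact equivalent sketch, not used for the values reported above):
data_melted_beta_memory %>%
  dplyr::group_by(Run, Threads) %>%
  dplyr::summarise(max_memory_kb = max(Value))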
3.3 Speed Based on Wall Clock
Note that each sample should have the same number of input reads at the start of each run. For WASP, however, the Log.final.out file reports the reads that passed filtering (i.e. already-filtered reads), which are fewer than the initial input reads; this also affects the reported mapping speed (see the example below for HG00512). We therefore compute speed from the wall clock, i.e. the overall time taken for the run to complete.
data_melted_beta_speed <- data_melted_beta[order(data_melted_beta$Sample, data_melted_beta$Mapping_speed_Million_of_reads_per_hour),]
data_melted_beta_speed$Threads <- ordered(data_melted_beta_speed$Threads , levels = c("8 Threads", "16 Threads", "32 Threads"))
data_melted_beta_speed$Value <- as.numeric(data_melted_beta_speed$Mapping_speed_Million_of_reads_per_hour)
data_melted_beta_speed$Sample <- as.factor(data_melted_beta_speed$Sample)
data_melted_beta_speed$Run <- as.factor(data_melted_beta_speed$Run)
data_melted_beta_speed <- data_melted_beta_speed[order(data_melted_beta_speed$Sample, data_melted_beta_speed$Value),]
data_melted_beta_speed <- data_melted_beta_speed %>%
group_by(Sample, Threads) %>%
dplyr::mutate(speed_order = sum(Mapping_speed_Million_of_reads_per_hour)) %>%
arrange(speed_order)
data_melted_beta_speed <- as.data.frame(data_melted_beta_speed)
unique(data_melted_beta_speed[,c(1, 4:7)] %>% filter (Sample == "HG00512"))
## Sample Threads Run Number_of_input_reads
## 1 HG00512 8 Threads WASP 1110628
## 23 HG00512 8 Threads STAR_WASP 66055710
## 45 HG00512 8 Threads STAR 66055710
## 67 HG00512 16 Threads WASP 1110628
## 89 HG00512 16 Threads STAR_WASP 66055710
## 111 HG00512 16 Threads STAR 66055710
## 133 HG00512 32 Threads WASP 1110628
## 155 HG00512 32 Threads STAR 66055710
## 177 HG00512 32 Threads STAR_WASP 66055710
## Mapping_speed_Million_of_reads_per_hour
## 1 249.89
## 23 637.54
## 45 691.28
## 67 249.89
## 89 1106.05
## 111 1225.78
## 133 249.89
## 155 1651.39
## 177 1748.53
Note that for this analysis we take the number of input reads per sample as counted prior to each run (the initial read counts from the STAR runs). A minimal sketch of the wall-clock-to-hours conversion is shown below, followed by the full pipeline.
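A minimal, self-contained sketch of the conversion (the helper name wall_clock_to_hours is illustrative only; here the fractional part is kept as fractional seconds, whereas the column-wise pipeline below splits it out as milliseconds):
#Convert a wall-clock time string ("h:mm:ss" or "m:ss.ms") to hours
wall_clock_to_hours <- function(x) {
  parts <- as.numeric(strsplit(x, ":")[[1]])
  if (length(parts) == 3) {
    parts[1] + parts[2]/60 + parts[3]/3600 #h:mm:ss
  } else {
    parts[1]/60 + parts[2]/3600 #m:ss.ms
  }
}
#Example: 66,055,710 initial input reads aligned in a wall-clock time of "3:24.40"
66055710 / wall_clock_to_hours("3:24.40") #~1.16e9 reads/hour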
data_melted_beta_wall_clock <- data_melted_beta %>% filter(Param == "Wall_Clock")
# Extracting number of input reads per sample based on STAR runs - should be same input reads for each sample/fastq files
num_input_reads_per_sample <- unique(data_melted_beta %>% filter(Run == "STAR") %>% dplyr::select("Sample", "Number_of_input_reads"))
colnames(num_input_reads_per_sample)[2] <- "Number_of_input_reads_initial"
data_melted_beta_wall_clock <- inner_join(data_melted_beta_wall_clock, num_input_reads_per_sample, by = "Sample")
### Number of input reads per sample:
num_input_reads_per_sample %>%
ggplot(aes(reorder(x=Sample, -Number_of_input_reads_initial), y=Number_of_input_reads_initial)) +
geom_bar(stat = "identity", fill="gray60") +
theme_light() +
scale_y_continuous(expand=c(0,0), limits=c(0, max(num_input_reads_per_sample$Number_of_input_reads_initial)+1000000))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Number of Input Reads", x = "") +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Plotting by average input read length
input_read_len_per_sample <- unique(data_melted_beta %>% filter(Run == "STAR") %>% dplyr::select("Sample", "Average_input_read_length"))
input_read_len_per_sample %>%
ggplot(aes(reorder(x=Sample, -Average_input_read_length), y=Average_input_read_length)) +
geom_bar(stat = "identity", fill="gray60") +
theme_light() +
scale_y_continuous(expand=c(0,0), limits = c(0, max(input_read_len_per_sample$Average_input_read_length)+10))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Average Input Read Length", x = "") +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
#Stacked and ordered
input_read_len_per_sample <- unique(data_melted_beta %>% filter(Run == "STAR") %>% dplyr::select("Sample", "Average_input_read_length"))
input_read_len_per_sample <- inner_join(num_input_reads_per_sample, input_read_len_per_sample, by = "Sample")
intro_order <- c("Total_R1", "Total_R2", "PolyA_R1", "PolyA_R2", "Nucleus_PolyA_R1","Nucleus_PolyA_R2", "Nucleus_nonPolyA_R1", "Nucleus_nonPolyA_R2", "HG00733", "NA19239","HG00732", "HG00512","NA19238", "NA19240", "HG00513", "HG00731" )
a<-input_read_len_per_sample %>%
ggplot(aes(x=factor(Sample, levels=intro_order), y=Average_input_read_length)) +
geom_bar(stat = "identity", fill="gray60") +
theme_light() +
scale_y_continuous(expand=c(0,0), limits = c(0, max(input_read_len_per_sample$Average_input_read_length)+10))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Average Input Read Length", x = "") +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
b<- num_input_reads_per_sample %>%
ggplot(aes(x=factor(Sample, levels=intro_order), y=Number_of_input_reads_initial)) +
geom_bar(stat = "identity", fill="gray60") +
theme_light() +
scale_y_continuous(expand=c(0,0), limits=c(0, max(num_input_reads_per_sample$Number_of_input_reads_initial)+1000000))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Number of Input Reads", x = "") +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
plot_grid(a, b, nrow=2, labels = "AUTO", rel_heights = c(0.45,0.66), align = 'v')
min(num_input_reads_per_sample$Number_of_input_reads_initial)#56254714
## [1] 56254714
max(num_input_reads_per_sample$Number_of_input_reads_initial)#128402941
## [1] 128402941
mean(num_input_reads_per_sample$Number_of_input_reads_initial)#86852506
## [1] 86852506
median(num_input_reads_per_sample$Number_of_input_reads_initial)#92285172
## [1] 92285172
data_melted_beta_wall_clock$Unique_ID <- 1:nrow(data_melted_beta_wall_clock)
data_melted_beta_wall_clock$Unique_ID <- as.character(data_melted_beta_wall_clock$Unique_ID)
data_melted_beta_wall_clock$Wall_Clock <- data_melted_beta_wall_clock$Value #maintaining a copy of the original variable
#Note that we have hours:minutes:seconds and minutes:seconds.milliseconds in our dataset - need to convert all to hours
data_melted_beta_wall_clock_min_sec_ms <- data_melted_beta_wall_clock[grepl('\\.', data_melted_beta_wall_clock$Value), ] #grep all values that contain a dot (= fractional seconds/milliseconds) - overall these values are in minutes:seconds.milliseconds format
data_melted_beta_wall_clock_min_sec_ms_split1 <- cSplit(data_melted_beta_wall_clock_min_sec_ms, "Value", ":", direction = "wide", type.convert="as.character")
data_melted_beta_wall_clock_min_sec_ms_split <- cSplit(data_melted_beta_wall_clock_min_sec_ms_split1, "Value_2", ".", direction = "wide", type.convert="as.character")
colnames(data_melted_beta_wall_clock_min_sec_ms_split)[11] <- "minutes"
colnames(data_melted_beta_wall_clock_min_sec_ms_split)[12] <- "seconds"
colnames(data_melted_beta_wall_clock_min_sec_ms_split)[13] <- "milliseconds"
data_melted_beta_wall_clock_min_sec_ms_split$minutes <- as.numeric(as.character(data_melted_beta_wall_clock_min_sec_ms_split$minutes))
data_melted_beta_wall_clock_min_sec_ms_split$seconds <- as.numeric(as.character(data_melted_beta_wall_clock_min_sec_ms_split$seconds))
data_melted_beta_wall_clock_min_sec_ms_split$milliseconds <- as.numeric(as.character(data_melted_beta_wall_clock_min_sec_ms_split$milliseconds))
#Next we shall convert the different units to hours and sum them up for total time in hours
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split %>% mutate(min_in_hrs = (minutes/60))
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split %>% mutate(seconds_in_hrs = (seconds/(60*60)))
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split %>% mutate(milliseconds_in_hrs = (milliseconds/(3.6e+6)))
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split %>% mutate(time_hrs = (min_in_hrs + seconds_in_hrs + milliseconds_in_hrs))
#Second Subset - overall, these values have hours:minutes:seconds
data_melted_beta_wall_clock_hr_min_sec <- as.data.frame(data_melted_beta_wall_clock %>% filter(Unique_ID %nin% data_melted_beta_wall_clock_min_sec_ms$Unique_ID)) #values should not have dots/ms unit values in them
data_melted_beta_wall_clock_hr_min_sec_split <- cSplit(data_melted_beta_wall_clock_hr_min_sec, "Value", ":", direction = "wide", type.convert="as.character")
colnames(data_melted_beta_wall_clock_hr_min_sec_split)[11] <- "hrs"
colnames(data_melted_beta_wall_clock_hr_min_sec_split)[12] <- "minutes"
colnames(data_melted_beta_wall_clock_hr_min_sec_split)[13] <- "seconds"
data_melted_beta_wall_clock_hr_min_sec_split$hrs <- as.numeric(as.character(data_melted_beta_wall_clock_hr_min_sec_split$hrs))
data_melted_beta_wall_clock_hr_min_sec_split$minutes <- as.numeric(as.character(data_melted_beta_wall_clock_hr_min_sec_split$minutes))
data_melted_beta_wall_clock_hr_min_sec_split$seconds <- as.numeric(as.character(data_melted_beta_wall_clock_hr_min_sec_split$seconds))
#Next we shall convert the different units to hours and sum them up for total time in hours
data_melted_beta_wall_clock_hr_min_sec_split <- data_melted_beta_wall_clock_hr_min_sec_split %>% mutate(min_in_hrs = (minutes/60))
data_melted_beta_wall_clock_hr_min_sec_split <- data_melted_beta_wall_clock_hr_min_sec_split %>% mutate(seconds_in_hrs = (seconds/(60*60)))
data_melted_beta_wall_clock_hr_min_sec_split <- data_melted_beta_wall_clock_hr_min_sec_split %>% mutate(time_hrs = (hrs + min_in_hrs + seconds_in_hrs))
#Mutating speed based on wall clock (num of input reads/ time in hrs)
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split %>% mutate(reads_per_hour = (Number_of_input_reads_initial/time_hrs))
data_melted_beta_wall_clock_hr_min_sec_split <- data_melted_beta_wall_clock_hr_min_sec_split %>% mutate(reads_per_hour = (Number_of_input_reads_initial/time_hrs))
head(data_melted_beta_wall_clock_min_sec_ms_split)
## Sample Param Threads Run Number_of_input_reads
## 1: HG00512 Wall_Clock 16 Threads STAR 66055710
## 2: HG00513 Wall_Clock 16 Threads STAR 58601893
## 3: HG00731 Wall_Clock 16 Threads STAR 56254714
## 4: HG00732 Wall_Clock 16 Threads STAR 70029452
## 5: HG00733 Wall_Clock 16 Threads STAR 92075712
## 6: Nucleus_nonPolyA_R1 Wall_Clock 16 Threads STAR 110469791
## Mapping_speed_Million_of_reads_per_hour Average_input_read_length
## 1: 1225.78 102
## 2: 1191.90 102
## 3: 1065.88 102
## 4: 840.35 102
## 5: 1029.42 102
## 6: 831.99 152
## Number_of_input_reads_initial Unique_ID Wall_Clock minutes seconds
## 1: 66055710 1 3:24.40 3 24
## 2: 58601893 2 3:06.91 3 6
## 3: 56254714 3 3:20.14 3 20
## 4: 70029452 4 5:10.74 5 10
## 5: 92075712 5 5:33.08 5 33
## 6: 110469791 6 8:10.55 8 10
## milliseconds min_in_hrs seconds_in_hrs milliseconds_in_hrs time_hrs
## 1: 40 0.05000000 0.006666667 1.111111e-05 0.05667778
## 2: 91 0.05000000 0.001666667 2.527778e-05 0.05169194
## 3: 14 0.05000000 0.005555556 3.888889e-06 0.05555944
## 4: 74 0.08333333 0.002777778 2.055556e-05 0.08613167
## 5: 8 0.08333333 0.009166667 2.222222e-06 0.09250222
## 6: 55 0.13333333 0.002777778 1.527778e-05 0.13612639
## reads_per_hour
## 1: 1165460478
## 2: 1133675539
## 3: 1012513976
## 4: 813051166
## 5: 995389189
## 6: 811523702
head(data_melted_beta_wall_clock_hr_min_sec_split)
## Sample Param Threads Run Number_of_input_reads
## 1: Nucleus_nonPolyA_R1 Wall_Clock 16 Threads WASP 8078842
## 2: Nucleus_nonPolyA_R2 Wall_Clock 16 Threads WASP 8737811
## 3: Nucleus_PolyA_R1 Wall_Clock 16 Threads WASP 11671879
## 4: Nucleus_PolyA_R2 Wall_Clock 16 Threads WASP 10382396
## 5: PolyA_R1 Wall_Clock 16 Threads WASP 11045656
## 6: PolyA_R2 Wall_Clock 16 Threads WASP 11009414
## Mapping_speed_Million_of_reads_per_hour Average_input_read_length
## 1: 632.26 152
## 2: 533.15 152
## 3: 466.88 152
## 4: 473.12 152
## 5: 436.97 202
## 6: 430.80 202
## Number_of_input_reads_initial Unique_ID Wall_Clock hrs minutes seconds
## 1: 110469791 102 1:01:58 1 1 58
## 2: 106919251 103 1:05:32 1 5 32
## 3: 128402941 104 1:26:12 1 26 12
## 4: 116517502 105 1:17:05 1 17 5
## 5: 97548052 106 1:13:08 1 13 8
## 6: 93555584 107 1:13:49 1 13 49
## min_in_hrs seconds_in_hrs time_hrs reads_per_hour
## 1: 0.01666667 0.016111111 1.032778 106963757
## 2: 0.08333333 0.008888889 1.092222 97891481
## 3: 0.43333333 0.003333333 1.436667 89375597
## 4: 0.28333333 0.001388889 1.284722 90694704
## 5: 0.21666667 0.002222222 1.218889 80030307
## 6: 0.21666667 0.013611111 1.230278 76044277
# Subsetting datasets to required plotting variables ( Sample Param Threads Run Mapping_speed_Million_of_reads_per_hour time_hrs reads_per_hour)
data_melted_beta_wall_clock_min_sec_ms_split <- data_melted_beta_wall_clock_min_sec_ms_split[,c(1:4, 6,17:18)]
data_melted_beta_wall_clock_hr_min_sec_split <- data_melted_beta_wall_clock_hr_min_sec_split[,c(1:4, 6, 16:17)]
data_melted_beta_wall_clock_converted <- rbind(data_melted_beta_wall_clock_min_sec_ms_split, data_melted_beta_wall_clock_hr_min_sec_split)
data_melted_beta_wall_clock_converted <- inner_join(data_melted_beta_wall_clock_converted, input_read_len_per_sample, by = "Sample")
# Plotting wall clock
data_melted_beta_wall_clock_converted %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=1)+
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = "Million Reads/Hour", x = "")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background = element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 8 threads
data_melted_beta_wall_clock_converted %>% filter(Threads == "8 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=1)+
scale_color_manual(values = global_colors) +
labs(y = paste0("Million Reads/Hour", "\n", "(8 Threads)"), x = "")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 16 threads
data_melted_beta_wall_clock_converted %>% filter(Threads == "16 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=1)+
scale_color_manual(values = global_colors) +
labs(y = paste0("Million Reads/Hour", "\n", "(16 Threads)"), x = "")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# 32 threads
data_melted_beta_wall_clock_converted %>% filter(Threads == "32 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=1)+
scale_color_manual(values = global_colors) +
labs(y = paste0("Million Reads/Hour", "\n", "(32 Threads)"), x = "")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Representation 2 - taken
# Helper to build log10 axis break and label vectors spanning the range of the data
logticks <- function(datavar,type) {
minimum <- 1/10^abs(floor(log10(min(datavar, na.rm=TRUE))))
maximum <- 1*10^abs(floor(log10(max(datavar, na.rm=TRUE)))+1)
multiple <- floor(log10(maximum/minimum))
yourtickvector <- c()
if (type=="breaks") {
yourtickvector <- c(minimum)
for (x in seq(0,multiple)) {
andadd <- seq(minimum*10^x,minimum*10^(x+1),minimum*10^x)[-1]
yourtickvector <- c(yourtickvector,andadd)
}
} else if (type=="labels") {
for (x in seq(0,multiple)) {
andadd <- c(minimum*10^x,rep("",8))
yourtickvector <- c(yourtickvector,andadd)
}
yourtickvector <- c(yourtickvector,minimum*10^multiple)
}
return(yourtickvector)
}
data_melted_beta_wall_clock_converted %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run)) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+#,alpha=.8
scale_shape_manual(values=c(3, 17, 16))+
facet_wrap(~Threads) +
#scale_color_manual(values=c('gray50','firebrick3', '#56B4E9'))+
scale_color_manual(values=global_colors)+
labs(y = "Million Reads/Hour", x="")+
theme_bw() + theme(legend.title = element_blank())+ #scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) +
scale_y_log10(
breaks = logticks(data_melted_beta_wall_clock_converted$reads_per_hour,"breaks"),
labels = logticks(data_melted_beta_wall_clock_converted$reads_per_hour,"labels")
)
#scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))) + annotation_logticks()
data_melted_beta_wall_clock_converted %>% filter(Threads == "8 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run)) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+#,alpha=.8
scale_shape_manual(values=c(3, 17, 16))+
#scale_color_manual(values=c('gray50','firebrick3', '#56B4E9'))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Million Reads/Hour", "\n", "(8 Threads)"), x="")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
data_melted_beta_wall_clock_converted %>% filter(Threads == "16 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run)) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+#,alpha=.8
scale_shape_manual(values=c(3, 17, 16))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Million Reads/Hour", "\n", "(16 Threads)"), x="")+
theme_bw() + theme(legend.title = element_blank())+ scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
data_melted_beta_wall_clock_converted %>% filter(Threads == "32 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour/1000000, group=Run)) +
geom_point(aes(color=factor(Run),shape=factor(Run),fill=factor(Run)), size=2)+#,alpha=.8
scale_shape_manual(values=c(3, 17, 16))+
scale_color_manual(values=global_colors)+
labs(y = paste0("Million Reads/Hour", "\n", "(32 Threads)"), x="")+
theme_bw() + theme(legend.title = element_blank()) + scale_y_continuous(trans='log10') +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0.5, size=12)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# num_input_reads_per_sample %>%
# ggplot(aes(x= factor(Sample, levels=order_df), y=Number_of_input_reads_initial)) +
# geom_bar(stat = "identity", fill="gray60") +
# theme_light() +
# scale_y_continuous(expand=c(0,0))+
# scale_x_discrete(expand=c(0,0)) +
# labs("Number of Input Reads", x = "") +
# theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
a <- num_input_reads_per_sample %>%
# ggplot(aes(reorder(x=Sample, -Number_of_input_reads_initial), y=Number_of_input_reads_initial)) +
ggplot(aes(x= factor(Sample, levels=order_df), y=Number_of_input_reads_initial)) +
geom_bar(stat = "identity", fill="#999999") +
theme_light() +
scale_y_continuous(expand=c(0,0)) +#, limits = c(0, max(num_input_reads_per_sample$Number_of_input_reads_initial)+100))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Number of Input Reads", x = "") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
b <- data_melted_beta_wall_clock_converted %>% filter(Threads == "8 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=2)+#,alpha=.8
scale_color_manual(values=global_colors)+
labs(y = paste0("Reads/Hours", "\n", "(8 Threads)"), x = "")+
theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") +
theme(strip.background =element_rect(fill="white", colour = "white"))+ scale_y_continuous(trans='log10') +
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
plot_grid(a, b, ncol = 1, nrow=2, labels = "AUTO", rel_heights = c(0.5,1), rel_widths = c(1,1), align = 'hv')
## Warning: Graphs cannot be horizontally aligned unless the axis parameter is set.
## Placing graphs unaligned.
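The cowplot warning above can be silenced by telling plot_grid() which axis lines to match up. A minimal sketch only, not used for the figures in this document:
# Align the stacked panels on their left/right axis lines so the plot areas line up
plot_grid(a, b, ncol = 1, nrow = 2, labels = "AUTO",
          rel_heights = c(0.5, 1), align = "v", axis = "lr")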
## By average input read length
c <- input_read_len_per_sample %>%
ggplot(aes(x= factor(Sample, levels=order_df), y=Average_input_read_length)) +
geom_bar(stat = "identity", fill="gray60") +
theme_light() +
scale_y_continuous(expand=c(0,0), limits = c(0, max(input_read_len_per_sample$Average_input_read_length)+10))+
scale_x_discrete(expand=c(0,0)) +
labs(y="Average input read length", x = "") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
d <- data_melted_beta_wall_clock_converted %>% filter(Threads == "8 Threads") %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = reads_per_hour, group=Run, color=Run)) +
geom_point(aes(color=factor(Run)), size=2)+#,alpha=.8
scale_color_manual(values=global_colors)+
labs(y = paste0("Reads/Hours", "\n", "(8 Threads)"), x = "")+
theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") +
theme(strip.background =element_rect(fill="white", colour = "white"))+ scale_y_continuous(trans='log10') +
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
plot_grid(c, d, ncol = 1, nrow=2, labels = "AUTO", rel_heights = c(0.5,1), rel_widths = c(1,1), align = 'hv')
## Warning: Graphs cannot be horizontally aligned unless the axis parameter is set.
## Placing graphs unaligned.
4.0 vW_Tag Distributions for alignments that did not pass WASP filtering
4.2 Sample-specific vW_Tag Distributions for alignments that did not pass WASP filtering
#revisit: expunge TD
samples <- c("HG00512","HG00513","HG00731","HG00732","HG00733","NA12878_Nucleus_nonPolyA","NA12878_Nucleus_nonPolyA_Rep",
"NA12878_Nucleus_PolyA", "NA12878_Nucleus_PolyA_Rep", "NA12878_PolyA","NA12878_PolyA_Rep","NA12878_Total","NA12878_Total_Rep","NA19238", "NA19239","NA19240")
files_list <- list()
for(i in samples){
#path <- paste0("/home/asiimwe/Desktop/TD/pass2_to_keep_", i , ".txt")
path <- paste0("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/WASP/WASP_Runs/", i, "/32threads/pass2_to_keep.txt")
files_list[[i]] <- path
}
files = do.call(rbind, files_list)
# Reading and merging data from all samples
merger <- lapply(files, function(x) {
merger <- read.table(x, comment.char = "", sep = '', header = FALSE, stringsAsFactors = FALSE)
merger$source <- x
return(merger)
}) %>%
bind_rows()
merger <- as.data.frame(merger)
merger %>% head(n=2)
merger$Sample <- merger$source
merger$source <- NULL
merger$Sample <- gsub("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/WASP/WASP_Runs/", "", merger$Sample)
merger$Sample <- gsub("/32threads/pass2_to_keep.txt", "", merger$Sample)
merger %>% head(n=2)
unique(merger$Sample)#validate called samples
merger <- as.data.frame(merger)
colnames(merger) <- c("Read_ID", "vW_Tag", "Reads_to_remap", "Reads_to_keep", "Sample")
merger %>% head(n=2)
merger <- cSplit(merger, "vW_Tag", ":", direction = "wide", type.convert="as.character")
merger$vW_Tag_1 <- NULL
merger$vW_Tag_2 <- NULL
colnames(merger)[5] <- "vW_Tag"
merger %>% head(n=2)
merger$vW_Tag <- as.integer(as.character(merger$vW_Tag))
merger <- as.data.frame(merger)
write.csv(merger, file="/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/merger_reads_to_keep_and_remap.csv")
#Creating a dataframe with per-sample, per-tag percentages for plotting
perclist <- list()
for (i in unique(merger$Sample)){
#print(i)
perc <- merger %>% filter(Sample == i)
perc <- as.data.frame(perc %>%
group_by(vW_Tag) %>%
dplyr::summarise(cnt = n()) %>%
mutate(freq = formattable::percent(cnt / sum(cnt), digits = 5)))
perc$Sample <- i
perclist[[i]] <- perc
}
vW_Tag_percentages = do.call(rbind, perclist)
merger_vW_Tag_percentages <- as.data.frame(vW_Tag_percentages)
orders <- as.data.frame(merger_vW_Tag_percentages %>% filter(vW_Tag != 1) %>%
group_by(Sample) %>%
dplyr::summarise(sum(cnt)))
colnames(orders)[2] <- "order"
vW_Tag_percentages <- inner_join(merger_vW_Tag_percentages, orders, by = "Sample")
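The per-sample loop above can also be written as a single grouped summary. A sketch only (vW_Tag_percentages_alt is an illustrative name); here freq is a plain proportion rather than a formattable percent:
vW_Tag_percentages_alt <- merger %>%
  dplyr::count(Sample, vW_Tag) %>%         # reads per tag within each sample (column `n`)
  group_by(Sample) %>%
  dplyr::mutate(freq = n / sum(n)) %>%     # within-sample proportion per tag
  ungroup()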
# Plotting
unique(vW_Tag_percentages$vW_Tag) #1 3 4 6 7
colourCount = length(unique(vW_Tag_percentages$vW_Tag))
getPalette = colorRampPalette(brewer.pal(8,"RdYlBu"))#RdYlBu Spectral Greys
vW_Tag_percentages$vW_Tag <- as.factor(vW_Tag_percentages$vW_Tag)
vW_Tag_percentages$Freq <- vW_Tag_percentages$freq
vW_Tag_percentages$Freq <- gsub("%", "", vW_Tag_percentages$Freq)
vW_Tag_percentages$Freq <- as.numeric(as.character(vW_Tag_percentages$Freq))
#Sneak peek at the overall distribution of tags
vW_Tag_percentages %>% group_by(vW_Tag) %>% dplyr::summarise(sum(cnt))
# A tibble: 5 × 2
# vW_Tag `sum(cnt)`
# <fct> <int>
#1 1 97368800
#2 3 40555
#3 4 864141
#4 6 571383
#5 7 34311
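From the counts above, the overall share of vW-tagged alignments that did not pass WASP filtering (all tags other than 1) can be computed directly; a quick sketch:
tag_totals <- vW_Tag_percentages %>%
  group_by(vW_Tag) %>%
  dplyr::summarise(total = sum(cnt))
# tags other than 1 over all tags; with the counts printed above this is roughly 1.5%
sum(tag_totals$total[tag_totals$vW_Tag != 1]) / sum(tag_totals$total)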
#https://htmlcolorcodes.com/
#"#D73027" - red var
#"#FA9D59" -orange var
#"#EFE9C4" - light or
#"#9DCDE3" -blue
#"#4575B4" - dark blue
vW_Tag_percentages$vW_Tag_desc <- vW_Tag_percentages$vW_Tag
vW_Tag_percentages$vW_Tag_desc <- as.character(vW_Tag_percentages$vW_Tag_desc)
vW_Tag_percentages$vW_Tag_desc[vW_Tag_percentages$vW_Tag_desc == "1"] <- "Alignment Passed WASP Filtering"
vW_Tag_percentages$vW_Tag_desc[vW_Tag_percentages$vW_Tag_desc == "3"] <- "Variant Base in Read is N/non-ACGT"
vW_Tag_percentages$vW_Tag_desc[vW_Tag_percentages$vW_Tag_desc == "4"] <- "Remapped Read did not Map"
vW_Tag_percentages$vW_Tag_desc[vW_Tag_percentages$vW_Tag_desc == "6"] <- "Remapped Read Maps to Different Locus"
vW_Tag_percentages$vW_Tag_desc[vW_Tag_percentages$vW_Tag_desc == "7"] <- "Read Overlaps too Many Variants"
write.csv(vW_Tag_percentages, file="/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/vW_Tag_percentages.csv")
vW_Tag_percentages <- read.csv(file="/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/vW_Tag_percentages.csv")
## Cleaning up Sample names
unique(vW_Tag_percentages$Sample)
## [1] HG00512 HG00513
## [3] HG00731 HG00732
## [5] HG00733 NA12878_Nucleus_nonPolyA
## [7] NA12878_Nucleus_nonPolyA_Rep NA12878_Nucleus_PolyA
## [9] NA12878_Nucleus_PolyA_Rep NA12878_PolyA
## [11] NA12878_PolyA_Rep NA12878_Total
## [13] NA12878_Total_Rep NA19238
## [15] NA19239 NA19240
## 16 Levels: HG00512 HG00513 HG00731 HG00732 HG00733 ... NA19240
#Removing NA12878 from all NA12878 derived samples
vW_Tag_percentages$Sample <- gsub("NA12878_", "", vW_Tag_percentages$Sample)
vW_Tag_percentages$Sample <- gsub("Rep", "R2", vW_Tag_percentages$Sample)
vW_Tag_percentages$Sample[vW_Tag_percentages$Sample == "Total"] <- "Total_R1"
vW_Tag_percentages$Sample[vW_Tag_percentages$Sample == "PolyA"] <- "PolyA_R1"
vW_Tag_percentages$Sample[vW_Tag_percentages$Sample == "Nucleus_PolyA"] <- "Nucleus_PolyA_R1"
vW_Tag_percentages$Sample[vW_Tag_percentages$Sample == "Nucleus_nonPolyA"] <- "Nucleus_nonPolyA_R1"
plot <- vW_Tag_percentages %>% filter(vW_Tag != 1) %>%
#mutate(Sample = fct_reorder(Sample, order)) %>%
ggplot() +
geom_bar(aes(x = factor(Sample, levels=order_df), Freq,
group = Sample,
fill = vW_Tag_desc),
stat = "identity", width = 0.99, alpha=0.9) +
geom_text(aes(x = Sample,
y =Freq,
label = freq), size = 3,
position = position_stack(vjust = 0.5)
) +
#scale_fill_manual(values = getPalette(colourCount))+
#scale_fill_manual(values = c("#9DCDE3", "#D73027", "#4575B4", "#FA9D59", "#EFE9C4")) +
scale_fill_manual(values =c("Variant Base in Read is N/non-ACGT" = "#D73027",
"Remapped Read did not Map" = "#FA9D59",
"Remapped Read Maps to Different Locus" ="#EFE9C4",
"Read Overlaps too Many Variants"="#9DCDE3"))+
theme_test()+
labs(y=paste0("% Distribution of vW_Tagged", "\n", "Reads that did not pass WASP filtering"), x="") +
theme(legend.title = element_blank(), legend.position="bottom") + #legend.position = c(0.8, 0.65)
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) +
scale_y_continuous(expand = c(0.02,0), limits = c(0, 5)) +
scale_x_discrete(expand = c(0,0))# + coord_flip()
plot_grid(c, plot, ncol = 1, nrow=2, labels = "AUTO", rel_heights = c(0.5,1), rel_widths = c(1,1), align = 'hv')
## Warning: Graphs cannot be horizontally aligned unless the axis parameter is set.
## Placing graphs unaligned.
#ggsave("~/Desktop/merger/plot_nocolor.pdf", plot=plot,width=10, height=15, device="pdf")
#ggsave("~/Desktop/merger/plot_color.pdf", plot=plot,width=10, height=15, device="pdf")#output with color too
d <- c + geom_bar(stat = "identity", fill="gray85") +theme_test() + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) + coord_flip() + geom_text(aes(x = Sample,
y =Average_input_read_length,
label = Average_input_read_length), size = 3,
position = position_stack(vjust = 0.5)
) + labs(y="Average Input Read Length") #+ theme(plot.margin = unit(c(0,0,0,-0.6), "cm"))
vtagd <- vtg +theme_test() + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) + coord_flip() +
theme(legend.position = "none") +scale_y_continuous(expand = c(0,0), limits = c(0, 7.5)) # + theme(plot.margin = unit(c(0,0,0,-0.6), "cm"))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
#Flipped
unique(vW_Tag_percentages$vW_Tag_desc)
## [1] Alignment Passed WASP Filtering Variant Base in Read is N/non-ACGT
## [3] Remapped Read did not Map Remapped Read Maps to Different Locus
## [5] Read Overlaps too Many Variants
## 5 Levels: Alignment Passed WASP Filtering ... Variant Base in Read is N/non-ACGT
plot2 <- vW_Tag_percentages %>% filter(vW_Tag != 1) %>%
ggplot() +
geom_bar(aes(x = factor(Sample, levels=order_df), Freq,
group = Sample,
fill = vW_Tag_desc),
stat = "identity", width = 0.99, alpha = 0.9) +
geom_text(aes(x = Sample,
y =Freq,
label = freq), size = 3,
position = position_stack(vjust = 0.5)
) +
scale_fill_manual(values =c("Variant Base in Read is N/non-ACGT" = "#D73027",
"Remapped Read did not Map" = "#FA9D59",
"Remapped Read Maps to Different Locus" ="#EFE9C4",
"Read Overlaps too Many Variants"="#9DCDE3"))+
theme_test()+
labs(y=paste0("% Distribution of vW_Tags", "\n", "reads that did not pass WASP filtering"), x="") +
theme(legend.title = element_blank(), legend.position = "none") + #, legend.position = "bottom"
theme(axis.text.x = element_text(angle = 0,hjust = 1,vjust = 0.5, size=10)) +
scale_y_continuous(expand = c(0.03,0), limits = c(0, 4.5)) +
scale_x_discrete(expand = c(0,0)) + coord_flip() + theme(plot.margin = unit(c(0,0,0,0), "cm")) #margin(t = 0, r = 0, b = 0, l = 0, unit = "pt")
plot_grid(plot2, vtagd, d, ncol = 3, labels = "AUTO", rel_widths = c(10,2,2), align = 'h')
#plot_grid(c, d, ncol = 1, nrow=2, labels = "AUTO", rel_heights = c(0.5,1), rel_widths = c(1,1), align = 'hv')
Venn Diagrams - Overall
merger <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/merger_reads_to_keep_and_remap.csv")
merger$X <- NULL
## Cleaning up Sample names
unique(merger$Sample)
#Removing NA12878 from all NA12878 derived samples
merger$Sample <- gsub("NA12878_", "", merger$Sample)
merger$Sample <- gsub("Rep", "R2", merger$Sample)
merger$Sample[merger$Sample == "Total"] <- "Total_R1"
merger$Sample[merger$Sample == "PolyA"] <- "PolyA_R1"
merger$Sample[merger$Sample == "Nucleus_PolyA"] <- "Nucleus_PolyA_R1"
merger$Sample[merger$Sample == "Nucleus_nonPolyA"] <- "Nucleus_nonPolyA_R1"
unique(merger$Sample)
head(merger, n=3)
merger$to_keep_recoded <- merger$Reads_to_keep
merger$to_remap_recoded <- merger$Reads_to_remap
sum(is.na(merger$to_remap_recoded))#147190
sum(is.na(merger$to_keep_recoded))#1480843
merger$to_remap_recoded <- as.character(merger$to_remap_recoded)
merger$to_keep_recoded <- as.character(merger$to_keep_recoded)
merger$to_remap_recoded[is.na(merger$to_remap_recoded)] <- 0
merger$to_remap_recoded[merger$to_remap_recoded != 0] <- 1
merger$to_keep_recoded[is.na(merger$to_keep_recoded)] <- 0
merger$to_keep_recoded[merger$to_keep_recoded != 0] <- 1
head(merger)
merger$vW_Tag <- as.integer(merger$vW_Tag)
merger$to_remap_recoded <- as.integer(merger$to_remap_recoded)
merger$to_keep_recoded <- as.integer(merger$to_keep_recoded)
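Since the recoded columns are just NA indicators, they can equivalently be derived in one step from the original columns; a minimal sketch:
# 1 = read ID present, 0 = NA in the source column (same values as the recoding above)
merger$to_remap_recoded <- as.integer(!is.na(merger$Reads_to_remap))
merger$to_keep_recoded  <- as.integer(!is.na(merger$Reads_to_keep))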
#Creating comparative read sets for venn diagram
merger$Reads_to_remap <- as.character(merger$Reads_to_remap)
merger$Reads_to_keep <- as.character(merger$Reads_to_keep)
merger$Reads_to_remap[is.na(merger$Reads_to_remap)] <- "0" #Venn diagrams do not handle NAs well, so replace them with zeros
merger$Reads_to_keep[is.na(merger$Reads_to_keep)] <- "0"
str(merger)
# merger_HG00512 <- merger %>% filter(Sample == "HG00512")
# set1 <- as.character(merger_HG00512$Read_ID)
# set2 <- as.character(merger_HG00512$Reads_to_remap)
# set3 <- as.character(merger_HG00512$Reads_to_keep)
#Overall
set1 <- as.character(merger$Read_ID)
set2 <- as.character(merger$Reads_to_remap)
set3 <- as.character(merger$Reads_to_keep)
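Note that the "0" placeholder substituted for the NAs above also ends up inside set2 and set3 as an element of its own. If only real read IDs should be counted in the overlaps, the placeholder can be dropped when building the sets; a sketch only (the *_ids names are illustrative), while the figures below keep the original sets:
remap_ids <- setdiff(as.character(merger$Reads_to_remap), "0")
keep_ids  <- setdiff(as.character(merger$Reads_to_keep), "0")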
myCol <- brewer.pal(3, "Pastel2")
venn.diagram(
x = list(set1, set2, set3),
category.names = c("vW_tagged_reads" , "Reads_to_remap" , "Reads_to_keep"),
filename = '/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/Venn_Diagrams_Per_Sample/Read_Overlaps_Overall.png',
output = TRUE ,
imagetype="png" ,
height = 600 ,
width = 600 ,
sub = "Overall Read Overlaps",
sub.cex = 0.5,
resolution = 300,
compression = "lzw",
lwd = 0.25,
col=c('#21908dff', '#ffa533', "#2a5c92"),
fill = c(alpha("#21908dff",0.2), alpha('#ffa533',0.2), alpha('#2a5c92',0.2)),
cex = 0.5,
fontfamily = "serif",
cat.cex = 0.4,
cat.default.pos = "outer",
cat.pos = c(-27, 27, 135),
cat.dist = c(0.055, 0.055, 0.085),
cat.fontfamily = "sans",
#cat.col = c("#21908dff", '#ffa533', '#ffa533'),
cat.col = "black",
rotation = 1
)
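venn.diagram() also drops a .log file next to each image via futile.logger. If those files are not wanted (for example for the per-sample loop below), raising the logger threshold should suppress them, assuming a VennDiagram version that logs under "VennDiagramLogger":
# Only log errors from VennDiagram, so no per-call .log files are written
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")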
Overlaps Per Sample
for(i in unique(merger$Sample)){
#print(i)
merger_sample <- merger %>% filter(Sample == i)
set1 <- as.character(merger_sample$Read_ID)
set2 <- as.character(merger_sample$Reads_to_remap)
set3 <- as.character(merger_sample$Reads_to_keep)
myCol <- brewer.pal(3, "Pastel2")
venn.diagram(
x = list(set1, set2, set3),
category.names = c("vW_tagged_reads" , "Reads_to_remap" , "Reads_to_keep"),
filename = paste0("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/Venn_Diagrams_Per_Sample/Read_Overlaps_", i, ".png"),
output = TRUE ,
imagetype="png" ,
height = 600 ,
width = 600 ,
sub = i,
sub.cex = 0.5,
resolution = 300,
compression = "lzw",
lwd = 0.25,
col=c('#21908dff', '#ffa533', "#2a5c92"),
fill = c(alpha("#21908dff",0.2), alpha('#ffa533',0.2), alpha('#2a5c92',0.2)),
cex = 0.5,
fontfamily = "sans",
cat.cex = 0.4,
cat.default.pos = "outer",
cat.pos = c(-27, 27, 135),
cat.dist = c(0.055, 0.055, 0.085),
cat.fontfamily = "sans",
#cat.col = c("#21908dff", '#ffa533', '#ffa533'),
cat.col = "black",
rotation = 1
)
}