# Data cleaning ----
# Each raw record's value is split across V2 and V3 (e.g. date + time tokens)
# -- presumably from whitespace-delimited qacct output; TODO confirm against
# the raw input. Merge them into V2 and drop V3.
elzar_runs$V2 <- paste0(elzar_runs$V2, " ", elzar_runs$V3)
elzar_runs$V3 <- NULL
# Transpose so each job becomes a row. The first row of the transpose carries
# the field names; promote it to column names and drop it.
elzar_runs_transposed <- as.data.frame(t(elzar_runs))
colnames(elzar_runs_transposed) <- unlist(elzar_runs_transposed[c(1),])
elzar_runs_transposed <- elzar_runs_transposed[-c(1),]
# Each job record consists of k = 8 fields laid out side by side, so the frame
# has nc = k * (number of record groups) columns with repeating names.
# Reshape to a 3-D array (rows, k, nc/k), permute so record groups stack
# row-wise, and flatten back to a k-column matrix whose columns are the 8
# unique field names.
k <- 8
nr <- nrow(elzar_runs_transposed)
nc <- ncol(elzar_runs_transposed)
unames <- unique(names(elzar_runs_transposed))
a <- array(as.matrix(elzar_runs_transposed), c(nr, k, nc/k))
m <- matrix(aperm(a, c(1, 3, 2)),, k, dimnames = list(NULL, unames))
elzar_runs_transposed <- as.data.frame(m, stringsAsFactors = FALSE)
#duplicated(elzar_runs_transposed[,1]) #need to filter given some runs were repeated on the same day - filter by time
# start_time / end_time are "MM/DD/YYYY HH:MM:SS.mmm" strings; split each on
# the space into separate date (_1) and time (_2) columns.
head(elzar_runs_transposed$start_time)
## [1] "05/03/2022 23:50:49.025" "05/04/2022 00:12:38.897"
## [3] "05/04/2022 00:10:20.782" "05/04/2022 00:25:46.782"
## [5] "05/04/2022 00:07:48.379" "05/04/2022 00:35:21.462"
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "start_time", " ", direction = "wide", type.convert = "character")
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "end_time", " ", direction = "wide", type.convert = "character")
#elzar_runs_transposed %>% filter(!duplicated(elzar_runs_transposed$jobname))
# #Capture sample, threads, wall clock and memory
# #Wall clock is reported in seconds
# time <- "22:13:59.760"
# time2 <- "22:23:11.209"
# x <- chron(times=time)
# y <- chron(times=time2)
# y-x
# period_to_seconds(hms(y-x)) #551 exact reported wall clock is 551.510
# Keep a copy of the full working-directory path, then split cwd on "/" so
# the path components (run group, run, sample, threads, ...) become columns.
# NOTE(review): type.convert="as.character" here vs "character" above -- the
# spellings should probably be made consistent; verify both behave the same.
elzar_runs_transposed$cwd_base <- elzar_runs_transposed$cwd
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "cwd", "/", direction = "wide", type.convert="as.character")
#Filtering to remove prior runs
# Sanity check: the 7th path component is the run-group directory.
unique(elzar_runs_transposed$cwd_07)
## [1] "STAR_Run2" "STAR_WASP_Run2" "WASP_Run3" "WASP_Run2"
## [5] "STAR_Run3" "STAR_WASP_Run3" "STAR_Run1" "STAR_WASP_Run1"
## [9] "WASP_Run1"
# The nine expected run groups: 3 aligners x 3 replicate runs.
runs <- c("STAR_Run1", "STAR_Run2", "STAR_Run3", "STAR_WASP_Run1", "STAR_WASP_Run2", "STAR_WASP_Run3", "WASP_Run1", "WASP_Run2", "WASP_Run3")
all(elzar_runs_transposed$cwd_07 %in% runs)
## [1] TRUE
#elzar_runs_transposed <- elzar_runs_transposed %>% filter(cwd_07 %in% runs) - prior calls had diff run labels
# Drop columns 11-16 -- presumably the leading cwd path components that carry
# no per-job information. NOTE(review): positional drop; re-check the indices
# if anything upstream changes the column layout.
elzar_runs_transposed[,c(11:16)] <- NULL
dim(elzar_runs_transposed)
## [1] 664 14
# na.omit removes nothing here (664 rows before and after).
dim(elzar_runs_transposed <- na.omit(elzar_runs_transposed))
## [1] 664 14
# Name the split date/time and remaining path-component columns.
colnames(elzar_runs_transposed)[6:14] <- c("start_date", "start_time", "end_date", "end_time", "Path", "Run_Group", "Run", "Sample", "Threads")
# Convert the metric columns to numeric; maxrss carries a "G" unit suffix
# (so the numeric values are in GB) -- strip it before conversion.
elzar_runs_transposed$wallclock <- as.numeric(as.character(elzar_runs_transposed$wallclock))
elzar_runs_transposed$ru_wallclock <- as.numeric(as.character(elzar_runs_transposed$ru_wallclock))
elzar_runs_transposed$maxrss <- gsub("G", "", elzar_runs_transposed$maxrss)
elzar_runs_transposed$maxrss <- as.numeric(as.character(elzar_runs_transposed$maxrss))
elzar_runs_transposed$ru_maxrss <- as.numeric(as.character(elzar_runs_transposed$ru_maxrss))
# Directory names like "STAR_Runs" -> aligner name "STAR".
elzar_runs_transposed$Run <- gsub("_Runs", "", elzar_runs_transposed$Run)
# Ordered factor so thread counts sort 8 < 16 < 32 in summaries and plots.
elzar_runs_transposed$Threads <- ordered(elzar_runs_transposed$Threads , levels = c("8threads", "16threads", "32threads"))
# Drop samples that are not part of this benchmark.
elzar_runs_transposed <- elzar_runs_transposed %>% filter(Sample != "HG00514" & Sample != "NA12878_Small" & Sample != "NA12878_RAMPAGE" & Sample != "NA12878_RAMPAGE_Rep")
# 16 samples remain after the exclusion filter above.
unique(elzar_runs_transposed$Sample)
## [1] "HG00732" "HG00733"
## [3] "HG00512" "HG00513"
## [5] "NA12878_Nucleus_nonPolyA_Rep" "NA19240"
## [7] "NA12878_Nucleus_PolyA" "NA12878_Nucleus_nonPolyA"
## [9] "HG00731" "NA12878_PolyA_Rep"
## [11] "NA12878_Total_Rep" "NA12878_Total"
## [13] "NA19238" "NA12878_PolyA"
## [15] "NA19239" "NA12878_Nucleus_PolyA_Rep"
# Converting date and time fields from character to date and time respectively
# elzar_runs_transposed$start_date <- chron(date=elzar_runs_transposed$start_date)
# elzar_runs_transposed$start_time <- chron(times=elzar_runs_transposed$start_time)
# elzar_runs_transposed$end_date <- chron(date=elzar_runs_transposed$end_date)
# elzar_runs_transposed$end_time <- chron(times=elzar_runs_transposed$end_time)
# Three aligner configurations present.
unique(elzar_runs_transposed$Run)
## [1] "STAR" "STAR_WASP" "WASP"
# Split by aligner. Each run group should eventually hold 16 samples x 3
# thread settings = 48 rows; raw counts are higher because some jobs were
# re-submitted (duplicates are removed below by start date).
dim(STAR <- elzar_runs_transposed %>% filter(Run == "STAR")) #208 we expect 16 * 3 = 48
## [1] 208 14
dim(STAR_WASP <- elzar_runs_transposed %>% filter(Run == "STAR_WASP")) #203 we expect 16 * 3 = 48
## [1] 203 14
dim(WASP <- elzar_runs_transposed %>% filter(Run == "WASP")) #208 we expect 16 * 3 = 48
## [1] 208 14
#Checking call groups:
# STAR_Run1 is already complete: 48 rows, no duplicates.
dim(STAR_Run1 <- as.data.frame(STAR %>% filter(Run_Group == "STAR_Run1"))) #48
## [1] 48 14
STAR_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #checking that for each sample we have 3 runs, 1 for each thread - looks good
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# Exactly one job per Sample x Threads combination -- no duplicates here.
STAR_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# STAR_Run2 has 96 rows = 2x the expected 48: every job ran twice.
dim(STAR_Run2 <- STAR %>% filter(Run_Group == "STAR_Run2"))#96
## [1] 96 14
STAR_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #We have twice the expected number of runs, data called might include a prior run that was conducted on the same day or prior, we shall filter these by date
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 6
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 6
## 10 NA12878_PolyA 6
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 6
## 13 NA12878_Total_Rep 6
## 14 NA19238 6
## 15 NA19239 6
## 16 NA19240 6
# Confirms each Sample x Threads combination appears twice.
STAR_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #duplicated
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 2
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# STAR_Run3: 64 rows -- one extra job per sample beyond the expected 48.
dim(STAR_Run3 <- STAR %>% filter(Run_Group == "STAR_Run3"))#64
## [1] 64 14
STAR_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #We also have an extra run herein, we shall filter by date as well
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 4
## 2 HG00513 4
## 3 HG00731 4
## 4 HG00732 4
## 5 HG00733 4
## 6 NA12878_Nucleus_nonPolyA 4
## 7 NA12878_Nucleus_nonPolyA_Rep 4
## 8 NA12878_Nucleus_PolyA 4
## 9 NA12878_Nucleus_PolyA_Rep 4
## 10 NA12878_PolyA 4
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 4
## 15 NA19239 4
## 16 NA19240 4
# Duplication is uneven -- in the rows shown only the 8-thread jobs repeat.
STAR_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #some are duplicated
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
#STAR_WASP
# STAR_WASP_Run1 is complete as-is: 48 rows.
dim(STAR_WASP_Run1 <- as.data.frame(STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run1"))) #48
## [1] 48 14
# Per-sample counts: 3 each (one per thread setting).
STAR_WASP_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# One job per Sample x Threads -- no duplicates.
STAR_WASP_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())#looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# STAR_WASP_Run2: 96 rows = every job ran twice.
dim(STAR_WASP_Run2 <- STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run2")) #96
## [1] 96 14
STAR_WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 6
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 6
## 10 NA12878_PolyA 6
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 6
## 13 NA12878_Total_Rep 6
## 14 NA19238 6
## 15 NA19239 6
## 16 NA19240 6
# Each Sample x Threads combination appears twice -- to be filtered by date.
STAR_WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 2
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# STAR_WASP_Run3: 59 rows -- some samples have an extra job, others do not.
dim(STAR_WASP_Run3 <- STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run3"))#59
## [1] 59 14
STAR_WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 4
## 2 HG00513 4
## 3 HG00731 3
## 4 HG00732 4
## 5 HG00733 4
## 6 NA12878_Nucleus_nonPolyA 4
## 7 NA12878_Nucleus_nonPolyA_Rep 4
## 8 NA12878_Nucleus_PolyA 4
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 4
# Duplication is uneven across Sample x Threads combinations.
STAR_WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
#WASP
# WASP_Run1 is complete as-is: 48 rows.
dim(WASP_Run1 <- as.data.frame(WASP %>% filter(Run_Group == "WASP_Run1"))) #48
## [1] 48 14
# Per-sample counts: 3 each (one per thread setting).
WASP_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# One job per Sample x Threads -- no duplicates.
WASP_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())#looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# WASP_Run2: 71 rows -- a partial set of repeat submissions.
dim(WASP_Run2 <- WASP %>% filter(Run_Group == "WASP_Run2")) #71
## [1] 71 14
WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 5
## 2 HG00513 5
## 3 HG00731 4
## 4 HG00732 5
## 5 HG00733 5
## 6 NA12878_Nucleus_nonPolyA 5
## 7 NA12878_Nucleus_nonPolyA_Rep 5
## 8 NA12878_Nucleus_PolyA 5
## 9 NA12878_Nucleus_PolyA_Rep 4
## 10 NA12878_PolyA 4
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 4
## 15 NA19239 4
## 16 NA19240 4
# Duplication is uneven across Sample x Threads combinations.
WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
# WASP_Run3: 89 rows -- most jobs repeated, some not.
dim(WASP_Run3 <- WASP %>% filter(Run_Group == "WASP_Run3"))#89
## [1] 89 14
WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 5
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 5
## 10 NA12878_PolyA 5
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 5
## 13 NA12878_Total_Rep 5
## 14 NA19238 5
## 15 NA19239 5
## 16 NA19240 6
# Duplication is uneven across Sample x Threads combinations.
WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# Filtering out double calls: keep only jobs started on/after 2022-05-05.
# start_date is a character column in "%m/%d/%Y" form, so parse it with an
# explicit format. (The original compared the raw string against
# as.Date("05/05/2022"), which forces an ambiguous-format parse of an
# "MM/DD/YYYY" string -- as.Date's default tryFormats only know "%Y-%m-%d"
# and "%Y/%m/%d" -- and only produced the right subset by accident.)
dim(STAR_Run2 <- as.data.frame(STAR_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_Run3 <- as.data.frame(STAR_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
#STAR_WASP
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_WASP_Run2 <- as.data.frame(STAR_WASP_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_WASP_Run3 <- as.data.frame(STAR_WASP_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# WASP
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(WASP_Run2 <- as.data.frame(WASP_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(WASP_Run3 <- as.data.frame(WASP_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Collect the nine de-duplicated run-group data frames for a quick type/size
# audit before merging.
dfs <- list(STAR_Run1, STAR_Run2, STAR_Run3, STAR_WASP_Run1, STAR_WASP_Run2, STAR_WASP_Run3, WASP_Run1, WASP_Run2, WASP_Run3)
# All should be plain data.frames ...
for(i in dfs){print(class(i))}
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
# ... and all should have 48 rows (16 samples x 3 thread settings).
for(i in dfs){print(nrow(i))}
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
#merging dfs
#STAR
# Within each run group, jobname uniquely identifies a job, so it can serve
# as the join key across the three replicate runs.
duplicated(STAR_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# The same 48 jobnames appear in all three runs, so inner joins lose nothing.
all(STAR_Run1$jobname %in% STAR_Run2$jobname)
## [1] TRUE
all(STAR_Run2$jobname %in% STAR_Run3$jobname)
## [1] TRUE
# Suffix every column except the jobname key so the three runs can be joined
# side by side without name clashes.
colnames(STAR_Run1)[2:ncol(STAR_Run1)] <- paste0(colnames(STAR_Run1)[2:ncol(STAR_Run1)], "_Run1")
colnames(STAR_Run2)[2:ncol(STAR_Run2)] <- paste0(colnames(STAR_Run2)[2:ncol(STAR_Run2)], "_Run2")
colnames(STAR_Run3)[2:ncol(STAR_Run3)] <- paste0(colnames(STAR_Run3)[2:ncol(STAR_Run3)], "_Run3")
# 1 key + 13 columns x 3 runs = 40 columns after both joins.
dim(STAR_merged <- inner_join(STAR_Run1, STAR_Run2, by = "jobname"))
## [1] 48 27
dim(STAR_merged <- inner_join(STAR_merged, STAR_Run3, by = "jobname"))
## [1] 48 40
#STAR_WASP
# Same merge procedure as for STAR: jobname is unique within each run group
# and shared across the three runs, so inner joins keep all 48 jobs.
duplicated(STAR_WASP_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_WASP_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_WASP_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
all(STAR_WASP_Run1$jobname %in% STAR_WASP_Run2$jobname)
## [1] TRUE
all(STAR_WASP_Run2$jobname %in% STAR_WASP_Run3$jobname)
## [1] TRUE
# Suffix all non-key columns, then join the three runs side by side.
colnames(STAR_WASP_Run1)[2:ncol(STAR_WASP_Run1)] <- paste0(colnames(STAR_WASP_Run1)[2:ncol(STAR_WASP_Run1)], "_Run1")
colnames(STAR_WASP_Run2)[2:ncol(STAR_WASP_Run2)] <- paste0(colnames(STAR_WASP_Run2)[2:ncol(STAR_WASP_Run2)], "_Run2")
colnames(STAR_WASP_Run3)[2:ncol(STAR_WASP_Run3)] <- paste0(colnames(STAR_WASP_Run3)[2:ncol(STAR_WASP_Run3)], "_Run3")
dim(STAR_WASP_merged <- inner_join(STAR_WASP_Run1, STAR_WASP_Run2, by = "jobname"))
## [1] 48 27
dim(STAR_WASP_merged <- inner_join(STAR_WASP_merged, STAR_WASP_Run3, by = "jobname"))
## [1] 48 40
#WASP
# Same merge procedure as for STAR: jobname is unique within each run group
# and shared across the three runs, so inner joins keep all 48 jobs.
duplicated(WASP_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(WASP_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(WASP_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
all(WASP_Run1$jobname %in% WASP_Run2$jobname)
## [1] TRUE
all(WASP_Run2$jobname %in% WASP_Run3$jobname)
## [1] TRUE
# Suffix all non-key columns, then join the three runs side by side.
colnames(WASP_Run1)[2:ncol(WASP_Run1)] <- paste0(colnames(WASP_Run1)[2:ncol(WASP_Run1)], "_Run1")
colnames(WASP_Run2)[2:ncol(WASP_Run2)] <- paste0(colnames(WASP_Run2)[2:ncol(WASP_Run2)], "_Run2")
colnames(WASP_Run3)[2:ncol(WASP_Run3)] <- paste0(colnames(WASP_Run3)[2:ncol(WASP_Run3)], "_Run3")
dim(WASP_merged <- inner_join(WASP_Run1, WASP_Run2, by = "jobname"))
## [1] 48 27
dim(WASP_merged <- inner_join(WASP_merged, WASP_Run3, by = "jobname"))
## [1] 48 40
# Averaging memory and speed across runs (ru_wallclock_Run1 ru_maxrss_Run1 wallclock_Run1 maxrss_Run1)
# The same four metrics are averaged for each of the three merged frames, so
# the twelve near-identical mutate(rowMeans(...)) statements are factored into
# one helper. Columns are appended in the same order as before
# (average_ru_wallclock, average_ru_maxrss, average_wallclock, average_maxrss)
# so downstream positional selections are unaffected.
#STAR_merged <- STAR_merged %>% mutate(average_ru_wallclock2 = (ru_wallclock_Run1 + ru_wallclock_Run2 + ru_wallclock_Run3)/3)
#all(round(STAR_merged$average_ru_wallclock,2) == round(STAR_merged$average_ru_wallclock2,2)) #TRUE

# Append average_<metric> = row mean of <metric>_Run1..<metric>_Run3.
add_run_averages <- function(df) {
  for (metric in c("ru_wallclock", "ru_maxrss", "wallclock", "maxrss")) {
    run_cols <- paste0(metric, "_Run", 1:3)
    df[[paste0("average_", metric)]] <- rowMeans(df[, run_cols])
  }
  df
}
#STAR
STAR_merged <- add_run_averages(STAR_merged)
#STAR_WASP
STAR_WASP_merged <- add_run_averages(STAR_WASP_merged)
#WASP
WASP_merged <- add_run_averages(WASP_merged)
# Merging datasets for plotting
# Keep the identifying columns (Run/Sample/Threads -- identical across the
# three replicate runs; the Run3 copy is what the original's positional
# 38:ncol() selection picked up) plus the four computed averages. Selecting
# by NAME instead of by position is robust to upstream column-layout changes.
id_cols <- c("Run_Run3", "Sample_Run3", "Threads_Run3")
avg_cols <- c("average_ru_wallclock", "average_ru_maxrss", "average_wallclock", "average_maxrss")
STAR_merged_subset <- STAR_merged[, c(id_cols, avg_cols)]
colnames(STAR_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
STAR_WASP_merged_subset <- STAR_WASP_merged[, c(id_cols, avg_cols)]
colnames(STAR_WASP_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
WASP_merged_subset <- WASP_merged[, c(id_cols, avg_cols)]
colnames(WASP_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
colnames(STAR_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
colnames(STAR_WASP_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
colnames(WASP_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
# Stack the three aligners into one long frame for ggplot.
merged_df <- as.data.frame(rbind(STAR_merged_subset, STAR_WASP_merged_subset, WASP_merged_subset))
# Average Memory Across All Runs ----
## Cleaning up Sample names
unique(merged_df$Sample)
## [1] "NA19240" "HG00512"
## [3] "HG00732" "HG00513"
## [5] "HG00731" "NA12878_Nucleus_PolyA"
## [7] "HG00733" "NA12878_Nucleus_nonPolyA_Rep"
## [9] "NA12878_PolyA_Rep" "NA12878_Nucleus_nonPolyA"
## [11] "NA12878_Total" "NA19238"
## [13] "NA12878_Total_Rep" "NA19239"
## [15] "NA12878_Nucleus_PolyA_Rep" "NA12878_PolyA"
#Removing NA12878 from all NA12878 derived samples
# Shorten labels: strip the NA12878_ prefix, rename replicates "Rep" -> "R2",
# and tag the un-suffixed originals as "_R1" so each pair reads R1/R2.
merged_df$Sample <- gsub("NA12878_", "", merged_df$Sample)
merged_df$Sample <- gsub("Rep", "R2", merged_df$Sample)
merged_df$Sample[merged_df$Sample == "Total"] <- "Total_R1"
merged_df$Sample[merged_df$Sample == "PolyA"] <- "PolyA_R1"
merged_df$Sample[merged_df$Sample == "Nucleus_PolyA"] <- "Nucleus_PolyA_R1"
merged_df$Sample[merged_df$Sample == "Nucleus_nonPolyA"] <- "Nucleus_nonPolyA_R1"
# Fixed x-axis display order (replicate pairs adjacent, trio samples last).
order_df <- c("Total_R2" , "Total_R1" , "PolyA_R2" , "PolyA_R1" , "Nucleus_PolyA_R2", "Nucleus_PolyA_R1" , "Nucleus_nonPolyA_R2", "Nucleus_nonPolyA_R1", "NA19240" , "NA19239" , "NA19238" , "HG00733" , "HG00732" , "HG00731", "HG00513" , "HG00512")
# One color per aligner (STAR, STAR_WASP, WASP).
global_colors <- c("orange", "dodgerblue4", "firebrick4")
# Average maxrss (GB) per sample, colored by aligner, faceted by thread count.
merged_df %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = average_maxrss, group=Run,color=factor(Run))) +
geom_point(aes(colour=factor(Run)), size=1) +
#geom_smooth(se=FALSE, linetype="dashed", size=0.5) +
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = paste0("Average Memory Across Runs", "\n", "(maxrss (GB))"), x="") + scale_y_continuous() +
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black',size=10), strip.text.x = element_markdown(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Same metric in a single panel: shape encodes aligner, color thread count.
merged_df %>%
#mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = average_maxrss, group=Run)) +
geom_point(aes(color=factor(Threads),shape=factor(Run),), size=3, alpha=0.9)+
#geom_line()+
scale_shape_manual(values=c("STAR"=3, "WASP"=1, "STAR_WASP"=17))+ #WASP 16 #"STAR"=4, "WASP"=1, "STAR_WASP"=17
#scale_color_manual(values=c('#999999','#E69F00', '#56B4E9'))+
scale_color_manual(values = global_colors) +
# facet_wrap(~Threads) +
#labs(y = "Mapping Speed (Million of reads/hour", x="") +
labs(y = paste0("Average Memory Across Runs", "\n", "(maxrss (GB))"), x="") + #scale_y_continuous(limits = c(30,40)) +
theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
#theme(strip.text.x = element_blank()) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) #+
# Average memory per sample using ru_maxrss; reported in KB, converted to GB
# for plotting. Faceted by thread count.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = (average_ru_maxrss) / 1000000,
             group = Run,
             color = factor(Run))) +
  geom_point(size = 1) +
  scale_color_manual(values = global_colors) +
  scale_y_continuous() +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Memory Across Runs\n(ru_maxrss (GB))") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
Average wall clock across all runs
# Average wall clock (seconds, per the log format) per sample across runs,
# faceted by thread count
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = average_wallclock,
             group = Run,
             color = factor(Run))) +
  geom_point(size = 1) +
  scale_color_manual(values = global_colors) +
  scale_y_continuous() +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Wall Clock Across Runs\n(wallclock)") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
# Average ru_wallclock per sample across runs, faceted by thread count.
# Fix: facet_wrap(~Threads) was added twice; the second silently replaced the
# first, so the duplicate layer is removed.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = average_ru_wallclock,
             group = Run,
             color = factor(Run))) +
  geom_point(aes(colour = factor(Run)), size = 1) +
  scale_color_manual(values = global_colors) +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Wall Clock Across Runs\n(ru_wallclock)") +
  scale_y_continuous() +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
Million Reads per Hour
#Loading pre-processed data from final log results (all runs)
# NOTE(review): absolute local path — presumably the knit host; the commented
# line below is the remote (ssh) alternative for the same file.
data_melted_beta <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/data_melted_beta.csv")
#data_melted_beta <- read.csv(pipe('ssh atlas "cat /home/asiimwe/projects/run_env/alpha_star_wasp_comparison/data_melted_beta.csv"'))
# Drop the unnamed row-index column ("X") written by write.csv
data_melted_beta$X <- NULL
# Echo the column names for the rendered report
colnames(data_melted_beta)
## [1] "Sample"
## [2] "Param"
## [3] "Value"
## [4] "Threads"
## [5] "Run"
## [6] "Number_of_input_reads"
## [7] "Mapping_speed_Million_of_reads_per_hour"
## [8] "Average_input_read_length"
#"Sample" "Param" "Value" "Threads" "Run" "Mapping_speed_Million_of_reads_per_hour" "Average_input_read_length"
#Filtering samples not needed downstream:
unique(data_melted_beta$Sample)
## [1] HG00512 HG00513
## [3] HG00731 HG00732
## [5] HG00733 NA12878_Nucleus_nonPolyA
## [7] NA12878_Nucleus_nonPolyA_Rep NA12878_Nucleus_PolyA
## [9] NA12878_Nucleus_PolyA_Rep NA12878_PolyA
## [11] NA12878_PolyA_Rep NA12878_Total
## [13] NA12878_Total_Rep NA19238
## [15] NA19239 NA19240
## 16 Levels: HG00512 HG00513 HG00731 HG00732 HG00733 ... NA19240
# Rewrite raw thread tags ("8threads") as display labels ("8 Threads")
data_melted_beta$Threads <- as.character(data_melted_beta$Threads)
thread_labels <- c(
  "8threads"  = "8 Threads",
  "16threads" = "16 Threads",
  "32threads" = "32 Threads"
)
known <- data_melted_beta$Threads %in% names(thread_labels)
data_melted_beta$Threads[known] <- thread_labels[data_melted_beta$Threads[known]]
# Echo the distinct labels for the rendered report
unique(data_melted_beta$Threads)
## [1] "16 Threads" "32 Threads" "8 Threads"
#Data cleanup
str(data_melted_beta)
## 'data.frame': 3168 obs. of 8 variables:
## $ Sample : Factor w/ 16 levels "HG00512","HG00513",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Param : Factor w/ 22 levels "Average resident set size (kbytes)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Value : Factor w/ 1387 levels "0","1","1:01:58",..: 1 1 1 1 1 1 258 1230 531 1 ...
## $ Threads : chr "16 Threads" "16 Threads" "16 Threads" "16 Threads" ...
## $ Run : Factor w/ 3 levels "STAR","STAR_WASP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Number_of_input_reads : int 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 ...
## $ Mapping_speed_Million_of_reads_per_hour: num 1226 1226 1226 1226 1226 ...
## $ Average_input_read_length : int 102 102 102 102 102 102 102 102 102 102 ...
# Coerce factor columns to plain character for the string cleanup below
data_melted_beta$Sample <- as.character(data_melted_beta$Sample)
data_melted_beta$Param <- as.character(data_melted_beta$Param)
# Strip percent signs so numeric-looking values can be converted later
data_melted_beta$Value <- gsub("%", "", data_melted_beta$Value)
# Ordered factor so facets/legends show threads as 8, 16, 32
data_melted_beta$Threads <- factor(
  data_melted_beta$Threads,
  levels = c("8 Threads", "16 Threads", "32 Threads"),
  ordered = TRUE
)
## Cleaning up Sample names
unique(data_melted_beta$Sample)
## [1] "HG00512" "HG00513"
## [3] "HG00731" "HG00732"
## [5] "HG00733" "NA12878_Nucleus_nonPolyA"
## [7] "NA12878_Nucleus_nonPolyA_Rep" "NA12878_Nucleus_PolyA"
## [9] "NA12878_Nucleus_PolyA_Rep" "NA12878_PolyA"
## [11] "NA12878_PolyA_Rep" "NA12878_Total"
## [13] "NA12878_Total_Rep" "NA19238"
## [15] "NA19239" "NA19240"
# Normalize sample names (same scheme used for merged_df): drop the
# "NA12878_" prefix, shorten "Rep" to "R2", and mark the non-replicate
# NA12878 libraries as "_R1".
data_melted_beta$Sample <- gsub("NA12878_", "", data_melted_beta$Sample)
data_melted_beta$Sample <- gsub("Rep", "R2", data_melted_beta$Sample)
beta_r1_map <- c(
  "Total"            = "Total_R1",
  "PolyA"            = "PolyA_R1",
  "Nucleus_PolyA"    = "Nucleus_PolyA_R1",
  "Nucleus_nonPolyA" = "Nucleus_nonPolyA_R1"
)
tag_r1 <- data_melted_beta$Sample %in% names(beta_r1_map)
data_melted_beta$Sample[tag_r1] <- beta_r1_map[data_melted_beta$Sample[tag_r1]]
# Keep only the wall-clock rows for the timing analysis
data_melted_beta_wall_clock <- data_melted_beta %>%
  filter(Param == "Wall_Clock")
# Input reads per sample, taken from the STAR runs — the same fastq files
# feed every aligner, so input read counts should match across runs
star_reads <- data_melted_beta %>%
  filter(Run == "STAR") %>%
  dplyr::select("Sample", "Number_of_input_reads")
num_input_reads_per_sample <- unique(star_reads)
names(num_input_reads_per_sample)[2] <- "Number_of_input_reads_initial"
data_melted_beta_wall_clock <- inner_join(
  data_melted_beta_wall_clock,
  num_input_reads_per_sample,
  by = "Sample"
)
### Number of input reads per sample:
# Bar chart of input read counts per sample; headroom of 1M reads above the max
num_input_reads_per_sample %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = Number_of_input_reads_initial)) +
  geom_col(fill = "gray60") +
  theme_light() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, max(num_input_reads_per_sample$Number_of_input_reads_initial) + 1000000)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(y = "Number of Input Reads", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10))
# Bar chart of average input read length per sample (STAR runs only, since
# read length is a property of the fastq files)
input_read_len_per_sample <- unique(
  data_melted_beta %>%
    filter(Run == "STAR") %>%
    dplyr::select("Sample", "Average_input_read_length")
)
input_read_len_per_sample %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = Average_input_read_length)) +
  geom_col(fill = "gray60") +
  theme_light() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, max(input_read_len_per_sample$Average_input_read_length) + 10)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(y = "Average Input Read Length", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10))
# Summary statistics of input read counts across samples (trailing comments
# record the values observed at knit time; `##` lines are the captured output)
min(num_input_reads_per_sample$Number_of_input_reads_initial)#56254714
## [1] 56254714
max(num_input_reads_per_sample$Number_of_input_reads_initial)#128402941
## [1] 128402941
mean(num_input_reads_per_sample$Number_of_input_reads_initial)#86852506
## [1] 86852506
median(num_input_reads_per_sample$Number_of_input_reads_initial)#92285172
## [1] 92285172
# Per-row identifier (character) and a preserved copy of the raw wall clock.
# Fix: the reads-per-sample extraction used positional indexing
# `[, c(1, 9)]`; selecting by name is robust to column reordering and states
# the intent (columns 1 and 9 are Sample and Number_of_input_reads_initial).
data_melted_beta_wall_clock$Unique_ID <- as.character(seq_len(nrow(data_melted_beta_wall_clock)))
data_melted_beta_wall_clock$Wall_Clock <- data_melted_beta_wall_clock$Value # keep original value
num_input_reads <- unique(
  data_melted_beta_wall_clock[, c("Sample", "Number_of_input_reads_initial")]
)
merged_df <- inner_join(merged_df, num_input_reads, by = "Sample")
# Convert wall clock to hours (reported wall clock is in seconds), then
# derive throughput in reads/hour
merged_df <- merged_df %>% mutate(wall_clock_seconds_in_hrs = average_wallclock / (60 * 60))
merged_df <- merged_df %>% mutate(reads_per_hour = Number_of_input_reads_initial / wall_clock_seconds_in_hrs)
# Plotting wall clock
# Build tick positions ("breaks") or display labels ("labels") for a log10
# axis spanning the data range.
#   datavar: numeric data the axis will display (NAs ignored)
#   type:    "breaks" -> numeric ticks at 1,2,...,9 spacing within each decade
#            "labels" -> one label per decade start; intermediate ticks are ""
# Any other `type` returns NULL. The starting decade is derived from the data
# minimum via abs(floor(log10(.))), mirroring the original implementation.
logticks <- function(datavar, type) {
  lo <- 1 / 10^abs(floor(log10(min(datavar, na.rm = TRUE))))
  hi <- 1 * 10^abs(floor(log10(max(datavar, na.rm = TRUE))) + 1)
  decades <- floor(log10(hi / lo))
  out <- c()
  if (type == "breaks") {
    # lo itself, then the 2..10 multiples of each decade
    steps <- lapply(seq(0, decades), function(p) {
      seq(lo * 10^p, lo * 10^(p + 1), lo * 10^p)[-1]
    })
    out <- c(lo, unlist(steps))
  } else if (type == "labels") {
    # decade value followed by eight blanks, per decade, plus a closing label
    out <- unlist(lapply(seq(0, decades), function(p) c(lo * 10^p, rep("", 8))))
    out <- c(out, lo * 10^decades)
  }
  return(out)
}
# Mapping speed (million reads/hour) per sample on a log10 y axis, faceted by
# thread count.
# NOTE(review): the y values are reads_per_hour / 1e6 but the log-axis
# breaks/labels are computed from the unscaled reads_per_hour — confirm the
# custom ticks land within the plotted range.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = reads_per_hour / 1000000,
             group = Run,
             color = Run)) +
  geom_point(aes(colour = factor(Run)), size = 1) +
  scale_color_manual(values = global_colors) +
  facet_wrap(~Threads) +
  labs(y = "Million Reads/Hour", x = "") +
  scale_y_log10(
    breaks = logticks(merged_df$reads_per_hour, "breaks"),
    labels = logticks(merged_df$reads_per_hour, "labels")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black"),
    strip.text.x = element_markdown(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
# Mapping speed per sample with run encoded by both color and shape, faceted
# by thread count.
# Fix: the chain added both scale_y_continuous(trans = 'log10') and
# scale_y_log10(...), which triggered "Scale for 'y' is already present" and
# silently discarded the first scale — only the log10 scale with custom ticks
# is kept. (fill has no visible effect for shapes 3/17/16 but is retained to
# preserve the legend exactly.)
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = reads_per_hour / 1000000,
             group = Run,
             color = Run)) +
  geom_point(aes(color = factor(Run), shape = factor(Run), fill = factor(Run)), size = 2) +
  scale_shape_manual(values = c(3, 17, 16)) +
  facet_wrap(~Threads) +
  scale_color_manual(values = global_colors) +
  labs(y = "Average Speed Across Runs\n(Million Reads/Hour)", x = "") +
  scale_y_log10(
    breaks = logticks(merged_df$reads_per_hour, "breaks"),
    labels = logticks(merged_df$reads_per_hour, "labels")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black"),
    strip.text.x = element_markdown(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )