# Data cleaning ----
# Each raw record's value is split across V2 and V3 (e.g. date + time tokens)
# -- presumably from whitespace-delimited qacct output; TODO confirm against
# the raw input. Merge them into V2 and drop V3.
elzar_runs$V2 <- paste0(elzar_runs$V2, " ", elzar_runs$V3)
elzar_runs$V3 <- NULL
# Transpose so each job becomes a row. The first row of the transpose carries
# the field names; promote it to column names and drop it.
elzar_runs_transposed <- as.data.frame(t(elzar_runs))
colnames(elzar_runs_transposed) <- unlist(elzar_runs_transposed[c(1),])
elzar_runs_transposed <- elzar_runs_transposed[-c(1),]
# Each job record consists of k = 8 fields laid out side by side, so the frame
# has nc = k * (number of record groups) columns with repeating names.
# Reshape to a 3-D array (rows, k, nc/k), permute so record groups stack
# row-wise, and flatten back to a k-column matrix whose columns are the 8
# unique field names.
k <- 8
nr <- nrow(elzar_runs_transposed)
nc <- ncol(elzar_runs_transposed)
unames <- unique(names(elzar_runs_transposed))
a <- array(as.matrix(elzar_runs_transposed), c(nr, k, nc/k))
m <- matrix(aperm(a, c(1, 3, 2)),, k, dimnames = list(NULL, unames))
elzar_runs_transposed <- as.data.frame(m, stringsAsFactors = FALSE)
#duplicated(elzar_runs_transposed[,1]) #need to filter given some runs were repeated on the same day - filter by time
# start_time / end_time are "MM/DD/YYYY HH:MM:SS.mmm" strings; split each on
# the space into separate date (_1) and time (_2) columns.
head(elzar_runs_transposed$start_time)
## [1] "05/03/2022 23:50:49.025" "05/04/2022 00:12:38.897"
## [3] "05/04/2022 00:10:20.782" "05/04/2022 00:25:46.782"
## [5] "05/04/2022 00:07:48.379" "05/04/2022 00:35:21.462"
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "start_time", " ", direction = "wide", type.convert = "character")
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "end_time", " ", direction = "wide", type.convert = "character")
#elzar_runs_transposed %>% filter(!duplicated(elzar_runs_transposed$jobname))
# #Capture sample, threads, wall clock and memory
# #Wall clock is reported in seconds
# time <- "22:13:59.760"
# time2 <- "22:23:11.209"
# x <- chron(times=time)
# y <- chron(times=time2)
# y-x
# period_to_seconds(hms(y-x)) #551 exact reported wall clock is 551.510
# Keep a copy of the full working-directory path, then split cwd on "/" so
# the path components (run group, run, sample, threads, ...) become columns.
# NOTE(review): type.convert="as.character" here vs "character" above -- the
# spellings should probably be made consistent; verify both behave the same.
elzar_runs_transposed$cwd_base <- elzar_runs_transposed$cwd
elzar_runs_transposed <- cSplit(elzar_runs_transposed, "cwd", "/", direction = "wide", type.convert="as.character")
#Filtering to remove prior runs
# Sanity check: the 7th path component is the run-group directory.
unique(elzar_runs_transposed$cwd_07)
## [1] "STAR_Run2" "STAR_WASP_Run2" "WASP_Run3" "WASP_Run2"
## [5] "STAR_Run3" "STAR_WASP_Run3" "STAR_Run1" "STAR_WASP_Run1"
## [9] "WASP_Run1"
# The nine expected run groups: 3 aligners x 3 replicate runs.
runs <- c("STAR_Run1", "STAR_Run2", "STAR_Run3", "STAR_WASP_Run1", "STAR_WASP_Run2", "STAR_WASP_Run3", "WASP_Run1", "WASP_Run2", "WASP_Run3")
all(elzar_runs_transposed$cwd_07 %in% runs)
## [1] TRUE
#elzar_runs_transposed <- elzar_runs_transposed %>% filter(cwd_07 %in% runs) - prior calls had diff run labels
# Drop columns 11-16 -- presumably the leading cwd path components that carry
# no per-job information. NOTE(review): positional drop; re-check the indices
# if anything upstream changes the column layout.
elzar_runs_transposed[,c(11:16)] <- NULL
dim(elzar_runs_transposed)
## [1] 664 14
# na.omit removes nothing here (664 rows before and after).
dim(elzar_runs_transposed <- na.omit(elzar_runs_transposed))
## [1] 664 14
# Name the split date/time and remaining path-component columns.
colnames(elzar_runs_transposed)[6:14] <- c("start_date", "start_time", "end_date", "end_time", "Path", "Run_Group", "Run", "Sample", "Threads")
# Convert the metric columns to numeric; maxrss carries a "G" unit suffix
# (so the numeric values are in GB) -- strip it before conversion.
elzar_runs_transposed$wallclock <- as.numeric(as.character(elzar_runs_transposed$wallclock))
elzar_runs_transposed$ru_wallclock <- as.numeric(as.character(elzar_runs_transposed$ru_wallclock))
elzar_runs_transposed$maxrss <- gsub("G", "", elzar_runs_transposed$maxrss)
elzar_runs_transposed$maxrss <- as.numeric(as.character(elzar_runs_transposed$maxrss))
elzar_runs_transposed$ru_maxrss <- as.numeric(as.character(elzar_runs_transposed$ru_maxrss))
# Directory names like "STAR_Runs" -> aligner name "STAR".
elzar_runs_transposed$Run <- gsub("_Runs", "", elzar_runs_transposed$Run)
# Ordered factor so thread counts sort 8 < 16 < 32 in summaries and plots.
elzar_runs_transposed$Threads <- ordered(elzar_runs_transposed$Threads , levels = c("8threads", "16threads", "32threads"))
# Drop samples that are not part of this benchmark.
elzar_runs_transposed <- elzar_runs_transposed %>% filter(Sample != "HG00514" & Sample != "NA12878_Small" & Sample != "NA12878_RAMPAGE" & Sample != "NA12878_RAMPAGE_Rep")
# 16 samples remain after the exclusion filter above.
unique(elzar_runs_transposed$Sample)
## [1] "HG00732" "HG00733"
## [3] "HG00512" "HG00513"
## [5] "NA12878_Nucleus_nonPolyA_Rep" "NA19240"
## [7] "NA12878_Nucleus_PolyA" "NA12878_Nucleus_nonPolyA"
## [9] "HG00731" "NA12878_PolyA_Rep"
## [11] "NA12878_Total_Rep" "NA12878_Total"
## [13] "NA19238" "NA12878_PolyA"
## [15] "NA19239" "NA12878_Nucleus_PolyA_Rep"
# Converting date and time fields from character to date and time respectively
# elzar_runs_transposed$start_date <- chron(date=elzar_runs_transposed$start_date)
# elzar_runs_transposed$start_time <- chron(times=elzar_runs_transposed$start_time)
# elzar_runs_transposed$end_date <- chron(date=elzar_runs_transposed$end_date)
# elzar_runs_transposed$end_time <- chron(times=elzar_runs_transposed$end_time)
# Three aligner configurations present.
unique(elzar_runs_transposed$Run)
## [1] "STAR" "STAR_WASP" "WASP"
# Split by aligner. Each run group should eventually hold 16 samples x 3
# thread settings = 48 rows; raw counts are higher because some jobs were
# re-submitted (duplicates are removed below by start date).
dim(STAR <- elzar_runs_transposed %>% filter(Run == "STAR")) #208 we expect 16 * 3 = 48
## [1] 208 14
dim(STAR_WASP <- elzar_runs_transposed %>% filter(Run == "STAR_WASP")) #203 we expect 16 * 3 = 48
## [1] 203 14
dim(WASP <- elzar_runs_transposed %>% filter(Run == "WASP")) #208 we expect 16 * 3 = 48
## [1] 208 14
#Checking call groups:
# STAR_Run1 is already complete: 48 rows, no duplicates.
dim(STAR_Run1 <- as.data.frame(STAR %>% filter(Run_Group == "STAR_Run1"))) #48
## [1] 48 14
STAR_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #checking that for each sample we have 3 runs, 1 for each thread - looks good
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# Exactly one job per Sample x Threads combination -- no duplicates here.
STAR_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# STAR_Run2 has 96 rows = 2x the expected 48: every job ran twice.
dim(STAR_Run2 <- STAR %>% filter(Run_Group == "STAR_Run2"))#96
## [1] 96 14
STAR_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #We have twice the expected number of runs, data called might include a prior run that was conducted on the same day or prior, we shall filter these by date
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 6
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 6
## 10 NA12878_PolyA 6
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 6
## 13 NA12878_Total_Rep 6
## 14 NA19238 6
## 15 NA19239 6
## 16 NA19240 6
# Confirms each Sample x Threads combination appears twice.
STAR_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #duplicated
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 2
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# STAR_Run3: 64 rows -- one extra job per sample beyond the expected 48.
dim(STAR_Run3 <- STAR %>% filter(Run_Group == "STAR_Run3"))#64
## [1] 64 14
STAR_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n()) #We also have an extra run herein, we shall filter by date as well
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 4
## 2 HG00513 4
## 3 HG00731 4
## 4 HG00732 4
## 5 HG00733 4
## 6 NA12878_Nucleus_nonPolyA 4
## 7 NA12878_Nucleus_nonPolyA_Rep 4
## 8 NA12878_Nucleus_PolyA 4
## 9 NA12878_Nucleus_PolyA_Rep 4
## 10 NA12878_PolyA 4
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 4
## 15 NA19239 4
## 16 NA19240 4
# Duplication is uneven -- in the rows shown only the 8-thread jobs repeat.
STAR_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #some are duplicated
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
#STAR_WASP
# STAR_WASP_Run1 is complete as-is: 48 rows.
dim(STAR_WASP_Run1 <- as.data.frame(STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run1"))) #48
## [1] 48 14
# Per-sample counts: 3 each (one per thread setting).
STAR_WASP_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# One job per Sample x Threads -- no duplicates.
STAR_WASP_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())#looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# STAR_WASP_Run2: 96 rows = every job ran twice.
dim(STAR_WASP_Run2 <- STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run2")) #96
## [1] 96 14
STAR_WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 6
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 6
## 10 NA12878_PolyA 6
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 6
## 13 NA12878_Total_Rep 6
## 14 NA19238 6
## 15 NA19239 6
## 16 NA19240 6
# Each Sample x Threads combination appears twice -- to be filtered by date.
STAR_WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 2
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# STAR_WASP_Run3: 59 rows -- some samples have an extra job, others do not.
dim(STAR_WASP_Run3 <- STAR_WASP %>% filter(Run_Group == "STAR_WASP_Run3"))#59
## [1] 59 14
STAR_WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 4
## 2 HG00513 4
## 3 HG00731 3
## 4 HG00732 4
## 5 HG00733 4
## 6 NA12878_Nucleus_nonPolyA 4
## 7 NA12878_Nucleus_nonPolyA_Rep 4
## 8 NA12878_Nucleus_PolyA 4
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 4
# Duplication is uneven across Sample x Threads combinations.
STAR_WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
#WASP
# WASP_Run1 is complete as-is: 48 rows.
dim(WASP_Run1 <- as.data.frame(WASP %>% filter(Run_Group == "WASP_Run1"))) #48
## [1] 48 14
# Per-sample counts: 3 each (one per thread setting).
WASP_Run1 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# One job per Sample x Threads -- no duplicates.
WASP_Run1 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())#looks good
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# WASP_Run2: 71 rows -- a partial set of repeat submissions.
dim(WASP_Run2 <- WASP %>% filter(Run_Group == "WASP_Run2")) #71
## [1] 71 14
WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 5
## 2 HG00513 5
## 3 HG00731 4
## 4 HG00732 5
## 5 HG00733 5
## 6 NA12878_Nucleus_nonPolyA 5
## 7 NA12878_Nucleus_nonPolyA_Rep 5
## 8 NA12878_Nucleus_PolyA 5
## 9 NA12878_Nucleus_PolyA_Rep 4
## 10 NA12878_PolyA 4
## 11 NA12878_PolyA_Rep 4
## 12 NA12878_Total 4
## 13 NA12878_Total_Rep 4
## 14 NA19238 4
## 15 NA19239 4
## 16 NA19240 4
# Duplication is uneven across Sample x Threads combinations.
WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 1
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 1
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 2
## # … with 38 more rows
# WASP_Run3: 89 rows -- most jobs repeated, some not.
dim(WASP_Run3 <- WASP %>% filter(Run_Group == "WASP_Run3"))#89
## [1] 89 14
WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 6
## 2 HG00513 6
## 3 HG00731 5
## 4 HG00732 6
## 5 HG00733 6
## 6 NA12878_Nucleus_nonPolyA 6
## 7 NA12878_Nucleus_nonPolyA_Rep 6
## 8 NA12878_Nucleus_PolyA 6
## 9 NA12878_Nucleus_PolyA_Rep 5
## 10 NA12878_PolyA 5
## 11 NA12878_PolyA_Rep 6
## 12 NA12878_Total 5
## 13 NA12878_Total_Rep 5
## 14 NA19238 5
## 15 NA19239 5
## 16 NA19240 6
# Duplication is uneven across Sample x Threads combinations.
WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n()) #we have duplicates
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 2
## 2 HG00512 16threads 2
## 3 HG00512 32threads 2
## 4 HG00513 8threads 2
## 5 HG00513 16threads 2
## 6 HG00513 32threads 2
## 7 HG00731 8threads 2
## 8 HG00731 16threads 1
## 9 HG00731 32threads 2
## 10 HG00732 8threads 2
## # … with 38 more rows
# Filtering out double calls: keep only jobs started on/after 2022-05-05.
# start_date is a character column in "%m/%d/%Y" form, so parse it with an
# explicit format. (The original compared the raw string against
# as.Date("05/05/2022"), which forces an ambiguous-format parse of an
# "MM/DD/YYYY" string -- as.Date's default tryFormats only know "%Y-%m-%d"
# and "%Y/%m/%d" -- and only produced the right subset by accident.)
dim(STAR_Run2 <- as.data.frame(STAR_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_Run3 <- as.data.frame(STAR_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
#STAR_WASP
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_WASP_Run2 <- as.data.frame(STAR_WASP_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(STAR_WASP_Run3 <- as.data.frame(STAR_WASP_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
STAR_WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
STAR_WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# WASP
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(WASP_Run2 <- as.data.frame(WASP_Run2 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
WASP_Run2 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
WASP_Run2 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Keep only jobs started on/after 2022-05-05; parse the "MM/DD/YYYY" string
# with an explicit format instead of relying on as.Date's ambiguous default
# parse of "05/05/2022".
dim(WASP_Run3 <- as.data.frame(WASP_Run3 %>% filter(as.Date(start_date, format = "%m/%d/%Y") >= as.Date("2022-05-05"))))
## [1] 48 14
WASP_Run3 %>% group_by(Sample) %>% dplyr::summarise(cnt = n())
## # A tibble: 16 × 2
## Sample cnt
## <chr> <int>
## 1 HG00512 3
## 2 HG00513 3
## 3 HG00731 3
## 4 HG00732 3
## 5 HG00733 3
## 6 NA12878_Nucleus_nonPolyA 3
## 7 NA12878_Nucleus_nonPolyA_Rep 3
## 8 NA12878_Nucleus_PolyA 3
## 9 NA12878_Nucleus_PolyA_Rep 3
## 10 NA12878_PolyA 3
## 11 NA12878_PolyA_Rep 3
## 12 NA12878_Total 3
## 13 NA12878_Total_Rep 3
## 14 NA19238 3
## 15 NA19239 3
## 16 NA19240 3
# After date filtering: exactly one job per Sample x Threads.
WASP_Run3 %>% group_by(Sample, Threads) %>% dplyr::summarise(cnt = n())
## `summarise()` has grouped output by 'Sample'. You can override using the
## `.groups` argument.
## # A tibble: 48 × 3
## # Groups: Sample [16]
## Sample Threads cnt
## <chr> <ord> <int>
## 1 HG00512 8threads 1
## 2 HG00512 16threads 1
## 3 HG00512 32threads 1
## 4 HG00513 8threads 1
## 5 HG00513 16threads 1
## 6 HG00513 32threads 1
## 7 HG00731 8threads 1
## 8 HG00731 16threads 1
## 9 HG00731 32threads 1
## 10 HG00732 8threads 1
## # … with 38 more rows
# Collect the nine de-duplicated run-group data frames for a quick type/size
# audit before merging.
dfs <- list(STAR_Run1, STAR_Run2, STAR_Run3, STAR_WASP_Run1, STAR_WASP_Run2, STAR_WASP_Run3, WASP_Run1, WASP_Run2, WASP_Run3)
# All should be plain data.frames ...
for(i in dfs){print(class(i))}
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
# ... and all should have 48 rows (16 samples x 3 thread settings).
for(i in dfs){print(nrow(i))}
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
## [1] 48
#merging dfs
#STAR
# Within each run group, jobname uniquely identifies a job, so it can serve
# as the join key across the three replicate runs.
duplicated(STAR_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# The same 48 jobnames appear in all three runs, so inner joins lose nothing.
all(STAR_Run1$jobname %in% STAR_Run2$jobname)
## [1] TRUE
all(STAR_Run2$jobname %in% STAR_Run3$jobname)
## [1] TRUE
# Suffix every column except the jobname key so the three runs can be joined
# side by side without name clashes.
colnames(STAR_Run1)[2:ncol(STAR_Run1)] <- paste0(colnames(STAR_Run1)[2:ncol(STAR_Run1)], "_Run1")
colnames(STAR_Run2)[2:ncol(STAR_Run2)] <- paste0(colnames(STAR_Run2)[2:ncol(STAR_Run2)], "_Run2")
colnames(STAR_Run3)[2:ncol(STAR_Run3)] <- paste0(colnames(STAR_Run3)[2:ncol(STAR_Run3)], "_Run3")
# 1 key + 13 columns x 3 runs = 40 columns after both joins.
dim(STAR_merged <- inner_join(STAR_Run1, STAR_Run2, by = "jobname"))
## [1] 48 27
dim(STAR_merged <- inner_join(STAR_merged, STAR_Run3, by = "jobname"))
## [1] 48 40
#STAR_WASP
# Same merge procedure as for STAR: jobname is unique within each run group
# and shared across the three runs, so inner joins keep all 48 jobs.
duplicated(STAR_WASP_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_WASP_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(STAR_WASP_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
all(STAR_WASP_Run1$jobname %in% STAR_WASP_Run2$jobname)
## [1] TRUE
all(STAR_WASP_Run2$jobname %in% STAR_WASP_Run3$jobname)
## [1] TRUE
# Suffix all non-key columns, then join the three runs side by side.
colnames(STAR_WASP_Run1)[2:ncol(STAR_WASP_Run1)] <- paste0(colnames(STAR_WASP_Run1)[2:ncol(STAR_WASP_Run1)], "_Run1")
colnames(STAR_WASP_Run2)[2:ncol(STAR_WASP_Run2)] <- paste0(colnames(STAR_WASP_Run2)[2:ncol(STAR_WASP_Run2)], "_Run2")
colnames(STAR_WASP_Run3)[2:ncol(STAR_WASP_Run3)] <- paste0(colnames(STAR_WASP_Run3)[2:ncol(STAR_WASP_Run3)], "_Run3")
dim(STAR_WASP_merged <- inner_join(STAR_WASP_Run1, STAR_WASP_Run2, by = "jobname"))
## [1] 48 27
dim(STAR_WASP_merged <- inner_join(STAR_WASP_merged, STAR_WASP_Run3, by = "jobname"))
## [1] 48 40
#WASP
# Same merge procedure as for STAR: jobname is unique within each run group
# and shared across the three runs, so inner joins keep all 48 jobs.
duplicated(WASP_Run1$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(WASP_Run2$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
duplicated(WASP_Run3$jobname)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
all(WASP_Run1$jobname %in% WASP_Run2$jobname)
## [1] TRUE
all(WASP_Run2$jobname %in% WASP_Run3$jobname)
## [1] TRUE
# Suffix all non-key columns, then join the three runs side by side.
colnames(WASP_Run1)[2:ncol(WASP_Run1)] <- paste0(colnames(WASP_Run1)[2:ncol(WASP_Run1)], "_Run1")
colnames(WASP_Run2)[2:ncol(WASP_Run2)] <- paste0(colnames(WASP_Run2)[2:ncol(WASP_Run2)], "_Run2")
colnames(WASP_Run3)[2:ncol(WASP_Run3)] <- paste0(colnames(WASP_Run3)[2:ncol(WASP_Run3)], "_Run3")
dim(WASP_merged <- inner_join(WASP_Run1, WASP_Run2, by = "jobname"))
## [1] 48 27
dim(WASP_merged <- inner_join(WASP_merged, WASP_Run3, by = "jobname"))
## [1] 48 40
# Averaging memory and speed across runs (ru_wallclock_Run1 ru_maxrss_Run1 wallclock_Run1 maxrss_Run1)
# The same four metrics are averaged for each of the three merged frames, so
# the twelve near-identical mutate(rowMeans(...)) statements are factored into
# one helper. Columns are appended in the same order as before
# (average_ru_wallclock, average_ru_maxrss, average_wallclock, average_maxrss)
# so downstream positional selections are unaffected.
#STAR_merged <- STAR_merged %>% mutate(average_ru_wallclock2 = (ru_wallclock_Run1 + ru_wallclock_Run2 + ru_wallclock_Run3)/3)
#all(round(STAR_merged$average_ru_wallclock,2) == round(STAR_merged$average_ru_wallclock2,2)) #TRUE

# Append average_<metric> = row mean of <metric>_Run1..<metric>_Run3.
add_run_averages <- function(df) {
  for (metric in c("ru_wallclock", "ru_maxrss", "wallclock", "maxrss")) {
    run_cols <- paste0(metric, "_Run", 1:3)
    df[[paste0("average_", metric)]] <- rowMeans(df[, run_cols])
  }
  df
}
#STAR
STAR_merged <- add_run_averages(STAR_merged)
#STAR_WASP
STAR_WASP_merged <- add_run_averages(STAR_WASP_merged)
#WASP
WASP_merged <- add_run_averages(WASP_merged)
# Merging datasets for plotting
# Keep the identifying columns (Run/Sample/Threads -- identical across the
# three replicate runs; the Run3 copy is what the original's positional
# 38:ncol() selection picked up) plus the four computed averages. Selecting
# by NAME instead of by position is robust to upstream column-layout changes.
id_cols <- c("Run_Run3", "Sample_Run3", "Threads_Run3")
avg_cols <- c("average_ru_wallclock", "average_ru_maxrss", "average_wallclock", "average_maxrss")
STAR_merged_subset <- STAR_merged[, c(id_cols, avg_cols)]
colnames(STAR_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
STAR_WASP_merged_subset <- STAR_WASP_merged[, c(id_cols, avg_cols)]
colnames(STAR_WASP_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
WASP_merged_subset <- WASP_merged[, c(id_cols, avg_cols)]
colnames(WASP_merged_subset)[1:3] <- c("Run", "Sample", "Threads")
colnames(STAR_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
colnames(STAR_WASP_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
colnames(WASP_merged_subset)
## [1] "Run" "Sample" "Threads"
## [4] "average_ru_wallclock" "average_ru_maxrss" "average_wallclock"
## [7] "average_maxrss"
# Stack the three aligners into one long frame for ggplot.
merged_df <- as.data.frame(rbind(STAR_merged_subset, STAR_WASP_merged_subset, WASP_merged_subset))
# Average Memory Across All Runs ----
## Cleaning up Sample names
unique(merged_df$Sample)
## [1] "NA19240" "HG00512"
## [3] "HG00732" "HG00513"
## [5] "HG00731" "NA12878_Nucleus_PolyA"
## [7] "HG00733" "NA12878_Nucleus_nonPolyA_Rep"
## [9] "NA12878_PolyA_Rep" "NA12878_Nucleus_nonPolyA"
## [11] "NA12878_Total" "NA19238"
## [13] "NA12878_Total_Rep" "NA19239"
## [15] "NA12878_Nucleus_PolyA_Rep" "NA12878_PolyA"
#Removing NA12878 from all NA12878 derived samples
# Shorten labels: strip the NA12878_ prefix, rename replicates "Rep" -> "R2",
# and tag the un-suffixed originals as "_R1" so each pair reads R1/R2.
merged_df$Sample <- gsub("NA12878_", "", merged_df$Sample)
merged_df$Sample <- gsub("Rep", "R2", merged_df$Sample)
merged_df$Sample[merged_df$Sample == "Total"] <- "Total_R1"
merged_df$Sample[merged_df$Sample == "PolyA"] <- "PolyA_R1"
merged_df$Sample[merged_df$Sample == "Nucleus_PolyA"] <- "Nucleus_PolyA_R1"
merged_df$Sample[merged_df$Sample == "Nucleus_nonPolyA"] <- "Nucleus_nonPolyA_R1"
# Fixed x-axis display order (replicate pairs adjacent, trio samples last).
order_df <- c("Total_R2" , "Total_R1" , "PolyA_R2" , "PolyA_R1" , "Nucleus_PolyA_R2", "Nucleus_PolyA_R1" , "Nucleus_nonPolyA_R2", "Nucleus_nonPolyA_R1", "NA19240" , "NA19239" , "NA19238" , "HG00733" , "HG00732" , "HG00731", "HG00513" , "HG00512")
# One color per aligner (STAR, STAR_WASP, WASP).
global_colors <- c("orange", "dodgerblue4", "firebrick4")
# Average maxrss (GB) per sample, colored by aligner, faceted by thread count.
merged_df %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = average_maxrss, group=Run,color=factor(Run))) +
geom_point(aes(colour=factor(Run)), size=1) +
#geom_smooth(se=FALSE, linetype="dashed", size=0.5) +
scale_color_manual(values = global_colors) +
facet_wrap(~Threads) +
labs(y = paste0("Average Memory Across Runs", "\n", "(maxrss (GB))"), x="") + scale_y_continuous() +
theme_bw() + theme(legend.title = element_blank()) +
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black',size=10), strip.text.x = element_markdown(hjust = 0.5)) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10))
# Same metric in a single panel: shape encodes aligner, color thread count.
merged_df %>%
#mutate(Sample = fct_reorder2(Sample, Threads, mem_order, .desc = FALSE)) %>%
ggplot(aes(x = factor(Sample, levels=order_df), y = average_maxrss, group=Run)) +
geom_point(aes(color=factor(Threads),shape=factor(Run),), size=3, alpha=0.9)+
#geom_line()+
scale_shape_manual(values=c("STAR"=3, "WASP"=1, "STAR_WASP"=17))+ #WASP 16 #"STAR"=4, "WASP"=1, "STAR_WASP"=17
#scale_color_manual(values=c('#999999','#E69F00', '#56B4E9'))+
scale_color_manual(values = global_colors) +
# facet_wrap(~Threads) +
#labs(y = "Mapping Speed (Million of reads/hour", x="") +
labs(y = paste0("Average Memory Across Runs", "\n", "(maxrss (GB))"), x="") + #scale_y_continuous(limits = c(30,40)) +
theme_bw() + theme(legend.title = element_blank()) + #Base, bw, excel_new(), few, light, lindraw
theme(strip.background =element_rect(fill="white", colour = "white"))+
theme(strip.text = element_text(colour = 'black'), strip.text.x = element_markdown(hjust = 0)) +
#theme(strip.text.x = element_blank()) +
theme(axis.text.x = element_text(angle = 90,hjust = 1,vjust = 0.5, size=10)) #+
# Average memory per sample using ru_maxrss; reported in KB, converted to GB
# for plotting. Faceted by thread count.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = (average_ru_maxrss) / 1000000,
             group = Run,
             color = factor(Run))) +
  geom_point(size = 1) +
  scale_color_manual(values = global_colors) +
  scale_y_continuous() +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Memory Across Runs\n(ru_maxrss (GB))") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
Average wall clock across all runs
# Average wall clock (seconds, per the log format) per sample across runs,
# faceted by thread count
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = average_wallclock,
             group = Run,
             color = factor(Run))) +
  geom_point(size = 1) +
  scale_color_manual(values = global_colors) +
  scale_y_continuous() +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Wall Clock Across Runs\n(wallclock)") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
# Average ru_wallclock per sample across runs, faceted by thread count.
# Fix: facet_wrap(~Threads) was added twice; the second silently replaced the
# first, so the duplicate layer is removed.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = average_ru_wallclock,
             group = Run,
             color = factor(Run))) +
  geom_point(aes(colour = factor(Run)), size = 1) +
  scale_color_manual(values = global_colors) +
  facet_wrap(~Threads) +
  labs(x = "", y = "Average Wall Clock Across Runs\n(ru_wallclock)") +
  scale_y_continuous() +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black", size = 10),
    strip.text.x = element_markdown(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
Million Reads per Hour
#Loading pre-processed data from final log results (all runs)
# NOTE(review): absolute local path — presumably the knit host; the commented
# line below is the remote (ssh) alternative for the same file.
data_melted_beta <- read.csv("/home/asiimwe/projects/run_env/alpha_star_wasp_benchmarking/Downstream_Analysis/data_melted_beta.csv")
#data_melted_beta <- read.csv(pipe('ssh atlas "cat /home/asiimwe/projects/run_env/alpha_star_wasp_comparison/data_melted_beta.csv"'))
# Drop the unnamed row-index column ("X") written by write.csv
data_melted_beta$X <- NULL
# Echo the column names for the rendered report
colnames(data_melted_beta)
## [1] "Sample"
## [2] "Param"
## [3] "Value"
## [4] "Threads"
## [5] "Run"
## [6] "Number_of_input_reads"
## [7] "Mapping_speed_Million_of_reads_per_hour"
## [8] "Average_input_read_length"
#"Sample" "Param" "Value" "Threads" "Run" "Mapping_speed_Million_of_reads_per_hour" "Average_input_read_length"
#Filtering samples not needed downstream:
unique(data_melted_beta$Sample)
## [1] HG00512 HG00513
## [3] HG00731 HG00732
## [5] HG00733 NA12878_Nucleus_nonPolyA
## [7] NA12878_Nucleus_nonPolyA_Rep NA12878_Nucleus_PolyA
## [9] NA12878_Nucleus_PolyA_Rep NA12878_PolyA
## [11] NA12878_PolyA_Rep NA12878_Total
## [13] NA12878_Total_Rep NA19238
## [15] NA19239 NA19240
## 16 Levels: HG00512 HG00513 HG00731 HG00732 HG00733 ... NA19240
# Rewrite raw thread tags ("8threads") as display labels ("8 Threads")
data_melted_beta$Threads <- as.character(data_melted_beta$Threads)
thread_labels <- c(
  "8threads"  = "8 Threads",
  "16threads" = "16 Threads",
  "32threads" = "32 Threads"
)
known <- data_melted_beta$Threads %in% names(thread_labels)
data_melted_beta$Threads[known] <- thread_labels[data_melted_beta$Threads[known]]
# Echo the distinct labels for the rendered report
unique(data_melted_beta$Threads)
## [1] "16 Threads" "32 Threads" "8 Threads"
#Data cleanup
str(data_melted_beta)
## 'data.frame': 3168 obs. of 8 variables:
## $ Sample : Factor w/ 16 levels "HG00512","HG00513",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Param : Factor w/ 22 levels "Average resident set size (kbytes)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Value : Factor w/ 1387 levels "0","1","1:01:58",..: 1 1 1 1 1 1 258 1230 531 1 ...
## $ Threads : chr "16 Threads" "16 Threads" "16 Threads" "16 Threads" ...
## $ Run : Factor w/ 3 levels "STAR","STAR_WASP",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Number_of_input_reads : int 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 66055710 ...
## $ Mapping_speed_Million_of_reads_per_hour: num 1226 1226 1226 1226 1226 ...
## $ Average_input_read_length : int 102 102 102 102 102 102 102 102 102 102 ...
# Coerce factor columns to plain character for the string cleanup below
data_melted_beta$Sample <- as.character(data_melted_beta$Sample)
data_melted_beta$Param <- as.character(data_melted_beta$Param)
# Strip percent signs so numeric-looking values can be converted later
data_melted_beta$Value <- gsub("%", "", data_melted_beta$Value)
# Ordered factor so facets/legends show threads as 8, 16, 32
data_melted_beta$Threads <- factor(
  data_melted_beta$Threads,
  levels = c("8 Threads", "16 Threads", "32 Threads"),
  ordered = TRUE
)
## Cleaning up Sample names
unique(data_melted_beta$Sample)
## [1] "HG00512" "HG00513"
## [3] "HG00731" "HG00732"
## [5] "HG00733" "NA12878_Nucleus_nonPolyA"
## [7] "NA12878_Nucleus_nonPolyA_Rep" "NA12878_Nucleus_PolyA"
## [9] "NA12878_Nucleus_PolyA_Rep" "NA12878_PolyA"
## [11] "NA12878_PolyA_Rep" "NA12878_Total"
## [13] "NA12878_Total_Rep" "NA19238"
## [15] "NA19239" "NA19240"
# Normalize sample names (same scheme used for merged_df): drop the
# "NA12878_" prefix, shorten "Rep" to "R2", and mark the non-replicate
# NA12878 libraries as "_R1".
data_melted_beta$Sample <- gsub("NA12878_", "", data_melted_beta$Sample)
data_melted_beta$Sample <- gsub("Rep", "R2", data_melted_beta$Sample)
beta_r1_map <- c(
  "Total"            = "Total_R1",
  "PolyA"            = "PolyA_R1",
  "Nucleus_PolyA"    = "Nucleus_PolyA_R1",
  "Nucleus_nonPolyA" = "Nucleus_nonPolyA_R1"
)
tag_r1 <- data_melted_beta$Sample %in% names(beta_r1_map)
data_melted_beta$Sample[tag_r1] <- beta_r1_map[data_melted_beta$Sample[tag_r1]]
# Keep only the wall-clock rows for the timing analysis
data_melted_beta_wall_clock <- data_melted_beta %>%
  filter(Param == "Wall_Clock")
# Input reads per sample, taken from the STAR runs — the same fastq files
# feed every aligner, so input read counts should match across runs
star_reads <- data_melted_beta %>%
  filter(Run == "STAR") %>%
  dplyr::select("Sample", "Number_of_input_reads")
num_input_reads_per_sample <- unique(star_reads)
names(num_input_reads_per_sample)[2] <- "Number_of_input_reads_initial"
data_melted_beta_wall_clock <- inner_join(
  data_melted_beta_wall_clock,
  num_input_reads_per_sample,
  by = "Sample"
)
### Number of input reads per sample:
# Bar chart of input read counts per sample; headroom of 1M reads above the max
num_input_reads_per_sample %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = Number_of_input_reads_initial)) +
  geom_col(fill = "gray60") +
  theme_light() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, max(num_input_reads_per_sample$Number_of_input_reads_initial) + 1000000)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(y = "Number of Input Reads", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10))
# Bar chart of average input read length per sample (STAR runs only, since
# read length is a property of the fastq files)
input_read_len_per_sample <- unique(
  data_melted_beta %>%
    filter(Run == "STAR") %>%
    dplyr::select("Sample", "Average_input_read_length")
)
input_read_len_per_sample %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = Average_input_read_length)) +
  geom_col(fill = "gray60") +
  theme_light() +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, max(input_read_len_per_sample$Average_input_read_length) + 10)
  ) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(y = "Average Input Read Length", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10))
# Summary statistics of input read counts across samples (trailing comments
# record the values observed at knit time; `##` lines are the captured output)
min(num_input_reads_per_sample$Number_of_input_reads_initial)#56254714
## [1] 56254714
max(num_input_reads_per_sample$Number_of_input_reads_initial)#128402941
## [1] 128402941
mean(num_input_reads_per_sample$Number_of_input_reads_initial)#86852506
## [1] 86852506
median(num_input_reads_per_sample$Number_of_input_reads_initial)#92285172
## [1] 92285172
# Per-row identifier (character) and a preserved copy of the raw wall clock.
# Fix: the reads-per-sample extraction used positional indexing
# `[, c(1, 9)]`; selecting by name is robust to column reordering and states
# the intent (columns 1 and 9 are Sample and Number_of_input_reads_initial).
data_melted_beta_wall_clock$Unique_ID <- as.character(seq_len(nrow(data_melted_beta_wall_clock)))
data_melted_beta_wall_clock$Wall_Clock <- data_melted_beta_wall_clock$Value # keep original value
num_input_reads <- unique(
  data_melted_beta_wall_clock[, c("Sample", "Number_of_input_reads_initial")]
)
merged_df <- inner_join(merged_df, num_input_reads, by = "Sample")
# Convert wall clock to hours (reported wall clock is in seconds), then
# derive throughput in reads/hour
merged_df <- merged_df %>% mutate(wall_clock_seconds_in_hrs = average_wallclock / (60 * 60))
merged_df <- merged_df %>% mutate(reads_per_hour = Number_of_input_reads_initial / wall_clock_seconds_in_hrs)
# Plotting wall clock
# Build tick positions ("breaks") or display labels ("labels") for a log10
# axis spanning the data range.
#   datavar: numeric data the axis will display (NAs ignored)
#   type:    "breaks" -> numeric ticks at 1,2,...,9 spacing within each decade
#            "labels" -> one label per decade start; intermediate ticks are ""
# Any other `type` returns NULL. The starting decade is derived from the data
# minimum via abs(floor(log10(.))), mirroring the original implementation.
logticks <- function(datavar, type) {
  lo <- 1 / 10^abs(floor(log10(min(datavar, na.rm = TRUE))))
  hi <- 1 * 10^abs(floor(log10(max(datavar, na.rm = TRUE))) + 1)
  decades <- floor(log10(hi / lo))
  out <- c()
  if (type == "breaks") {
    # lo itself, then the 2..10 multiples of each decade
    steps <- lapply(seq(0, decades), function(p) {
      seq(lo * 10^p, lo * 10^(p + 1), lo * 10^p)[-1]
    })
    out <- c(lo, unlist(steps))
  } else if (type == "labels") {
    # decade value followed by eight blanks, per decade, plus a closing label
    out <- unlist(lapply(seq(0, decades), function(p) c(lo * 10^p, rep("", 8))))
    out <- c(out, lo * 10^decades)
  }
  return(out)
}
# Mapping speed (million reads/hour) per sample on a log10 y axis, faceted by
# thread count.
# NOTE(review): the y values are reads_per_hour / 1e6 but the log-axis
# breaks/labels are computed from the unscaled reads_per_hour — confirm the
# custom ticks land within the plotted range.
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = reads_per_hour / 1000000,
             group = Run,
             color = Run)) +
  geom_point(aes(colour = factor(Run)), size = 1) +
  scale_color_manual(values = global_colors) +
  facet_wrap(~Threads) +
  labs(y = "Million Reads/Hour", x = "") +
  scale_y_log10(
    breaks = logticks(merged_df$reads_per_hour, "breaks"),
    labels = logticks(merged_df$reads_per_hour, "labels")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black"),
    strip.text.x = element_markdown(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )
# Mapping speed per sample with run encoded by both color and shape, faceted
# by thread count.
# Fix: the chain added both scale_y_continuous(trans = 'log10') and
# scale_y_log10(...), which triggered "Scale for 'y' is already present" and
# silently discarded the first scale — only the log10 scale with custom ticks
# is kept. (fill has no visible effect for shapes 3/17/16 but is retained to
# preserve the legend exactly.)
merged_df %>%
  ggplot(aes(x = factor(Sample, levels = order_df),
             y = reads_per_hour / 1000000,
             group = Run,
             color = Run)) +
  geom_point(aes(color = factor(Run), shape = factor(Run), fill = factor(Run)), size = 2) +
  scale_shape_manual(values = c(3, 17, 16)) +
  facet_wrap(~Threads) +
  scale_color_manual(values = global_colors) +
  labs(y = "Average Speed Across Runs\n(Million Reads/Hour)", x = "") +
  scale_y_log10(
    breaks = logticks(merged_df$reads_per_hour, "breaks"),
    labels = logticks(merged_df$reads_per_hour, "labels")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    strip.background = element_rect(fill = "white", colour = "white"),
    strip.text = element_text(colour = "black"),
    strip.text.x = element_markdown(hjust = 0.5, size = 12),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10)
  )