In [None]:
# ..... get pseudobulk matrices for each stage ..... #

In [2]:
library(Seurat)
library(SingleCellExperiment)
library(MetaMarkers)

In [9]:
# Load the integrated object for one stage and attach cell-type annotations.
#
# Args:
#   currstage1: stage prefix used to build both input file names,
#     '<stage>_integrated_cca.rds' and '<stage>_Knight-Schrijver_celltypes.csv'.
#
# Returns:
#   The object read from the .rds file (presumably a Seurat object, since it is
#   accessed through @meta.data -- confirm), with `class`, `celltype` and
#   `celltype2` metadata columns added. Cells without an annotation, cells
#   labelled 'unassigned', and cells whose `celltype2` label has fewer than
#   50 members are removed.
get_seurat_obj <- function(currstage1){

    sc1 <- readRDS(paste0(currstage1, '_integrated_cca.rds'))

    df1 <- read.delim(paste0(currstage1, '_Knight-Schrijver_celltypes.csv'), sep = ',')
    # match() gives NA for cells whose barcode is absent from the table, so
    # `class` and `celltype` are NA for those cells until they are filtered.
    ids <- match(rownames(sc1@meta.data), df1$barcode)
    sc1$class <- df1$class[ids]
    sc1$celltype <- df1$final_celltype[ids]

    # convert only immature_other into subclass labels
    # %in% never returns NA; an `==` mask would contain NA for unannotated
    # cells, and R rejects NA indices in a subscripted assignment.
    is_immature_other <- sc1$class %in% 'Immature_other'
    sc1$celltype2 <- sc1$class
    sc1$celltype2[is_immature_other] <- sc1$celltype[is_immature_other]
#     sc1$cluster = df1$cluster[ids]

    sc1 <- sc1[, !is.na(sc1$celltype) & sc1$celltype != 'unassigned']

    # remove cell types with <50 cells across all samples combined
    rare_types <- names(which(table(sc1$celltype2) < 50))
    sc1 <- sc1[, !(sc1$celltype2 %in% rare_types)]
    return(sc1)
}

# Get pseudo-bulk for CPM data: average the per-cell CPM values within each
# `celltype2` label.
#
# Args:
#   scdata: object exposing `$celltype2` (one label per column), `cpm()`,
#     `dim()` and `rownames()` -- in this notebook a SingleCellExperiment
#     with a "cpm" assay.
#
# Returns:
#   A matrix with one row per row of `scdata` and one column per distinct
#   label (NA and 'unassigned' labels are skipped). Each column holds the
#   mean CPM across the cells carrying that label. When no usable label
#   exists the result has zero columns.
get_cpm_pseudo_bulk <- function(scdata){

    new_labels <- scdata$celltype2
    nbds <- as.character(unique(new_labels))
    nbds <- nbds[!is.na(nbds) & nbds != 'unassigned']

    # Extract the CPM matrix once instead of on every loop iteration.
    cpm_mat <- cpm(scdata)
    sc_sub2 <- matrix(NA_real_, nrow = nrow(scdata), ncol = length(nbds))

    # seq_along() yields an empty sequence when there are no labels,
    # whereas 1:length(nbds) would iterate over c(1, 0).
    for (i in seq_along(nbds)){
        cols <- which(new_labels == nbds[i])
        # drop = FALSE keeps a single matching cell as a one-column matrix,
        # so the same row-sum / count expression covers every group size.
        sc_sub2[, i] <- rowSums(cpm_mat[, cols, drop = FALSE]) / length(cols)
    }
    rownames(sc_sub2) <- rownames(scdata)
    colnames(sc_sub2) <- nbds
    return(sc_sub2)
}

In [19]:
# Choose the stage to process and load its annotated, filtered object
stage1 <- 'stage32'
sc1 <- get_seurat_obj(currstage1 = stage1)

In [20]:
# Build a SingleCellExperiment from the RNA counts and the cell metadata,
# then add a "cpm" assay computed from the counts
sce2 <- SingleCellExperiment(
    list(counts = sc1@assays$RNA@counts),
    colData = DataFrame(sc1@meta.data)
)
assay(sce2, "cpm") <- convert_to_cpm(assay(sce2))
sce2

class: SingleCellExperiment 
dim: 17007 21250 
metadata(0):
assays(2): counts cpm
rownames(17007): SPRY2 PCDH20 ... ENSGALG00010000495 ENSGALG00010000377
rowData names(0):
colnames(21250): stage32_sample5_AAACCCACAAGGATGC-1
  stage32_sample5_AAACCCACATACTGTG-1 ...
  stage32_sample12_TTTGTTGGTAGACGGT-1
  stage32_sample12_TTTGTTGGTGCTCGTG-1
colData names(31): orig.ident nCount_RNA ... celltype celltype2
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [21]:
# Average CPM per cell-type label across all genes
scbulk1 <- get_cpm_pseudo_bulk(scdata = sce2)
# Show the rows for two genes
scbulk1[c('TBX5', 'COL3A1'), ]

Unnamed: 0,Cardiomyocytes,Endothelial_Capillaries,Fibroblasts,Immature_Endothelium,Smooth_Muscle_Cells,Pericytes_Stromal,Immature_FB-like,Pericytes,Endothelial_Venous,Endocardial,Immature_Cardiomyocytes,Endothelial_Arterial,Neuronal_Cells,Epicardium_FB-like
TBX5,614.29784,21.17038,52.59146,29.62259,17.45004,22.03379,30.11545,37.34122,21.57166,26.0915,73.28275,6.416926,5.300204,24.87415
COL3A1,69.83279,105.08081,482.14546,251.72699,431.94044,127.97892,307.41498,369.03323,91.34623,106.3729,216.08583,88.713556,79.505207,466.89381


In [22]:
# Write the pseudo-bulk matrix to '<stage>_pseudobulk_expression_matrices.Rdata'
save(
    list = 'scbulk1',
    file = paste0(stage1, '_pseudobulk_expression_matrices.Rdata')
)