In [None]:
# ..... get pseudobulk matrices for each stage ..... #

In [2]:
library(Seurat)
library(SingleCellExperiment)
library(MetaMarkers)

In [4]:
# Load one stage's integrated object and attach cell-type annotations.
#
# Reads '<currstage1>_integrated_cca.rds' and
# '<currstage1>_Knight-Schrijver_celltypes.csv' from the working directory.
# The CSV must provide the columns `barcode`, `class` and `final_celltype`;
# rows are matched to cells through the row names of the object's meta.data.
#
# Adds three metadata columns:
#   class     - `class` from the CSV
#   celltype  - `final_celltype` from the CSV
#   celltype2 - `class`, except that cells whose class is 'Immature_other'
#               take their `celltype` label instead
#
# Cells without a celltype (barcode not found in the CSV) or labelled
# 'unassigned' are dropped, followed by every celltype2 group with fewer than
# 50 cells in total.
#
# Args:
#   currstage1: file-name prefix identifying the stage (e.g. 'stage16').
# Returns:
#   The filtered object (presumably a Seurat object, given the @meta.data
#   access - confirm against the saved .rds).
get_seurat_obj <- function(currstage1){

    sc1 <- readRDS(paste0(currstage1, '_integrated_cca.rds'))

    df1 <- read.delim(paste0(currstage1, '_Knight-Schrijver_celltypes.csv'), sep = ',')
    ids <- match(rownames(sc1@meta.data), df1$barcode)
    sc1$class <- df1$class[ids]
    sc1$celltype <- df1$final_celltype[ids]

    # convert only immature_other into subclass labels
    # which() discards the NA entries that unmatched barcodes produce in the
    # comparison; a logical mask containing NA is rejected by `[<-` when the
    # replacement has more than one element.
    sc1$celltype2 <- sc1$class
    immature <- which(sc1$class == 'Immature_other')
    sc1$celltype2[immature] <- sc1$celltype[immature]
#     sc1$cluster = df1$cluster[ids]

    sc1 <- sc1[, !is.na(sc1$celltype) & sc1$celltype != 'unassigned']

    # remove cell types with <50 cells across all samples combined
    celltype_counts <- table(sc1$celltype2)
    rare_celltypes <- names(celltype_counts)[celltype_counts < 50]
    sc1 <- sc1[, !(sc1$celltype2 %in% rare_celltypes)]
    sc1
}

# get pseudo-bulk for CPM data
#
# Averages the CPM assay over the cells of each `celltype2` label.
#
# Args:
#   scdata: object exposing `$celltype2` (one label per column), `cpm()`,
#           `dim()` and `rownames()` - in this notebook a SingleCellExperiment
#           with a "cpm" assay.
# Returns:
#   A matrix with one row per feature of `scdata` and one column per distinct
#   label (NA and 'unassigned' excluded), holding the mean CPM of that label's
#   cells. With no valid labels the result has zero columns.
get_cpm_pseudo_bulk <- function(scdata){

    new_labels <- scdata$celltype2
    nbds <- unique(new_labels)
    nbds <- nbds[which(!is.na(nbds) & nbds != 'unassigned')]
    sc_sub2 <- matrix(NA, nrow = dim(scdata)[1], ncol = length(nbds))

    # Fetch the CPM assay once instead of on every iteration.
    cpm_mat <- cpm(scdata)

    # seq_along() yields an empty sequence when there are no labels, whereas
    # 1:length(nbds) would iterate over c(1, 0) and index out of bounds.
    for (i in seq_along(nbds)){
        cols <- which(new_labels == nbds[i])

        # drop = FALSE keeps a single selected cell as a one-column matrix,
        # so one code path covers both the 1-cell and the n-cell case.
        if (length(cols) > 0){
            sc_sub2[, i] <- rowSums(cpm_mat[, cols, drop = FALSE]) / length(cols)
        }
    }
    rownames(sc_sub2) <- rownames(scdata)
    colnames(sc_sub2) <- nbds
    sc_sub2
}

In [13]:
# Stage to process; its name is the prefix of the input and output files.
stage1 <- "stage16"
# Load that stage's object with cell-type labels attached and filtered.
sc1 <- get_seurat_obj(currstage1 = stage1)

In [14]:
# Wrap the RNA counts and per-cell metadata in a SingleCellExperiment, then
# store the output of convert_to_cpm() on the counts as a second assay, "cpm".
sce2 <- SingleCellExperiment(
    assays = list(counts = sc1@assays$RNA@counts),
    colData = DataFrame(sc1@meta.data)
)
assay(sce2, "cpm") <- convert_to_cpm(assay(sce2, "counts"))
sce2

class: SingleCellExperiment 
dim: 22719 28389 
metadata(0):
assays(2): counts cpm
rownames(22719): LOC132761864 LOC132781240 ... LOC132783387
  LOC132783388
rowData names(0):
colnames(28389): stage16_sample1_AAACCCACAGCTGTCG-1
  stage16_sample1_AAACCCACATGTGACT-1 ...
  stage16_sample3_TTTGTTGGTTCCTACC-1 stage16_sample3_TTTGTTGTCAGTGTTG-1
colData names(31): orig.ident nCount_RNA ... celltype celltype2
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

In [15]:
# Build the per-cell-type pseudo-bulk matrix across all genes.
scbulk1 <- get_cpm_pseudo_bulk(scdata = sce2)
# Spot-check two genes across all cell types.
scbulk1[c("TBX5", "COL3A1"), ]

Unnamed: 0,Immature_Endothelium,Fibroblasts,Endothelial_Capillaries,Cardiomyocytes,Myeloid_Immune_Cells,Endocardial,Immature_FB-like,Neuronal_Cells,Immature_Cardiomyocytes,Lymphoid_Immune_Cells,Pericytes,Immature_Neuralcrest,Smooth_Muscle_Cells,Endothelial_Venous,Endothelial_Arterial,Epicardium_FB-like
TBX5,9.972931,81.76718,35.23522,435.69463,9.151697,27.37292,57.85927,0.0,93.82229,10.15188,112.6582,40.92119,51.06938,14.88387,0.0,0.0
COL3A1,385.961438,3028.01151,1095.0687,77.88729,202.376342,1547.33729,1043.48382,316.532,187.80875,284.77283,3120.4315,768.94486,2936.88719,675.27428,1316.485,2311.336


In [16]:
# Write the pseudo-bulk matrix to
# "<stage1>_pseudobulk_expression_matrices.Rdata" in the working directory.
save(
    scbulk1,
    file = paste0(stage1, "_pseudobulk_expression_matrices.Rdata")
)