In [None]:
# ..... annotate cell clusters ..... #

In [2]:
library(Seurat)
library(SingleCellExperiment)
library(MetaMarkers)
library(data.table)
library(dplyr)

In [3]:
# read the lizard/human ortholog table (tab-delimited eggNOG export)
om <- read.delim('lizard_human_orthologs_eggNOG.txt', sep = '\t')
# drop every row that lacks either the lizard gene or the ortholog name
om <- om[!(is.na(om$lizard_gene) | is.na(om$ortholog_name)), ]
# quick look: table dimensions and the first row
dim(om)
om[1, ]

Unnamed: 0_level_0,query,orth_type,species,orthologs,lizard_gene,ortholog_gene,ortholog_name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,XP_060608688.1,one2one,Homo sapiens(9606),ENSP00000340297,ARHGEF10,ENSG00000104728,ARHGEF10


In [4]:
# Add a within-cell-type rank to a marker table.
#
# Note: despite its name, this function does not filter anything; it only adds
# a `rank` column. Callers select the top N afterwards (e.g. rank <= 100).
#
# Arguments:
#   markers - table with a `cell_type` column (data.frame / data.table).
#             Rows are ranked in their current order, so the table is
#             presumably already sorted best-first within each cell type
#             (TODO confirm against the marker files).
#   ctypes  - cell types to rank; rows whose `cell_type` is not listed keep
#             rank NA.
#
# Returns: `markers` with an added integer `rank` column, counting 1, 2, ...
#          in row order within each listed cell type.
get_top_markers <- function(markers, ctypes){
    # build the rank as a full-length integer vector so that an empty marker
    # table is handled (assigning a scalar NA to a 0-row data.frame errors)
    rank <- rep(NA_integer_, nrow(markers))
    for(ct in ctypes){
        id <- which(markers$cell_type == ct)
        # seq_along() is empty when no row matches, unlike 1:length(id)
        rank[id] <- seq_along(id)
    }
    markers$rank <- rank
    return(markers)
}

In [5]:
# ---- class-level markers ----
# read the marker table, replace each gene by the lizard gene found through the
# ortholog table `om` (first match on ortholog_name), drop genes without a match
markers <- fread('~/septation/markers/Kanemaru/Kanemaru_celltype_markers_res2.csv.gz')
markers$gene <- om$lizard_gene[match(markers$gene, om$ortholog_name)]
markers <- markers[which(!is.na(markers$gene)), ]
ctypes <- unique(markers$cell_type)

# rank the remaining markers within each cell type and keep ranks 1..100
# as a tibble of group, cell_type, gene
markers <- get_top_markers(markers, ctypes)
top_markers <- as_tibble(markers[!is.na(markers$rank) & markers$rank <= 100,
                                 c('group', 'cell_type', 'gene')])

# ---- subclass-level markers (stratified by class) ----
# same processing as above, applied to the second marker file
markers2 <- fread('~/septation/markers/Kanemaru/Kanemaru_celltype_markers_res3.csv.gz')
markers2$gene <- om$lizard_gene[match(markers2$gene, om$ortholog_name)]
markers2 <- markers2[which(!is.na(markers2$gene)), ]
ctypes2 <- unique(markers2$cell_type)

markers2 <- get_top_markers(markers2, ctypes2)
top_markers2 <- as_tibble(markers2[!is.na(markers2$rank) & markers2$rank <= 100,
                                   c('group', 'cell_type', 'gene')])

# preview the first two rows of each table
top_markers[1:2,]
top_markers2[1:2,]

group,cell_type,gene
<chr>,<chr>,<chr>
all,adipocyte,GPAM
all,adipocyte,MGST1


group,cell_type,gene
<chr>,<chr>,<chr>
endothelial cell,EC10_CMC-like,ADAMTS9
endothelial cell,EC10_CMC-like,MCF2L


In [6]:
# one entry per dataset: three stages, each with samples 1-3
# stages:  stage9 x3, stage13 x3, stage16 x3
# samples: sample1, sample2, sample3 repeated for every stage
stages <- rep(paste0('stage', c(9, 13, 16)), each = 3)
samples <- rep(paste0('sample', 1:3), times = 3)

In [7]:
# Annotate every sample: predict a cell-type class from `top_markers`, then a
# subclass from `top_markers2` within the predicted class, and write one CSV of
# per-cell predictions per sample.
#
# Reads:  <stage>_<sample>_data.rds (an object with an 'RNA' assay and a
#         'counts' layer, read through LayerData) for each stages[i]/samples[i]
# Writes: annotations/<stage>_<sample>_Kanemaru_celltypes.csv

# subclass labels assigned to fewer cells than this (per sample) are discarded
min_cells <- 20

# create the output directory up front so the write at the end of each
# iteration cannot fail on a missing folder
out_dir <- 'annotations'
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

pb <- txtProgressBar(min = 0, max = length(samples), initial = 0)

# seq_along() yields no iterations for an empty sample list (1:0 would run twice)
for(id1 in seq_along(samples)){
    currstage1 <- stages[id1]
    currsmp1 <- samples[id1]
    sc1 <- readRDS(paste0(currstage1, '_', currsmp1, '_data.rds'))

    # get SCE object: raw counts plus the cell metadata, then a CPM assay
    sc3 <- SingleCellExperiment(list(counts = LayerData(sc1, assay = 'RNA', layer = 'counts')))
    colData(sc3) <- DataFrame(sc1@meta.data)
    assay(sc3, "cpm") <- convert_to_cpm(assay(sc3))

    # log-transformed CPM is the input of both scoring steps; compute it once
    log_cpm <- log1p(cpm(sc3))

    # predict cell type class
    ct_scores <- score_cells(log_cpm, top_markers)
    # NOTE(review): ct_enrichment is not used further in this cell; kept in
    # case later cells inspect it
    ct_enrichment <- compute_marker_enrichment(ct_scores)
    ct_pred <- assign_cells(ct_scores)

    # ct_pred[1:3,]
    # table(ct_pred$predicted)

    # get cell subclass labels, restricted to each cell's predicted class.
    # Cells whose class is "unassigned" have no matching group in the subclass
    # scores and receive NA here (this is the warning printed for each sample).
    sub_scores <- score_cells(log_cpm, top_markers2)
    # NOTE(review): sub_enrichment is likewise unused in this cell
    sub_enrichment <- compute_marker_enrichment(sub_scores, by_group = TRUE)
    sub_pred <- assign_cells(sub_scores, group_assignment = ct_pred$predicted)

    # sub_pred[1:3,]
    # table(sub_pred$predicted)

    # <min_cells cells of a type - make assignment NA
    rm_ids <- names(which(table(sub_pred$predicted) < min_cells))
    sub_pred$predicted[which(sub_pred$predicted %in% rm_ids)] <- NA

    # make combined prediction df
    # (assumes ct_pred and sub_pred list the cells in the same row order -
    # TODO confirm)
    newdf <- data.frame(barcode = rownames(ct_pred), class = ct_pred$predicted, class_score = ct_pred$score,
                        class_enrichment = ct_pred$enrichment, celltype = sub_pred$predicted,
                        celltype_score = sub_pred$score, celltype_enrichment = sub_pred$enrichment)

    # final label: the subclass where available, otherwise fall back to the
    # class. Vectorised, and converted to character so the fallback also works
    # if the predictions are factors and for samples with zero rows.
    final_celltype <- as.character(newdf$celltype)
    no_subclass <- is.na(final_celltype)
    final_celltype[no_subclass] <- as.character(newdf$class)[no_subclass]
    newdf$final_celltype <- final_celltype
    # newdf[1,]

    # save
    write.table(newdf, file = file.path(out_dir, paste0(currstage1, '_', currsmp1, '_Kanemaru_celltypes.csv')),
                sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)

    setTxtProgressBar(pb, id1)

}

# release the progress bar once all samples are processed
close(pb)

“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”




“Some group assignments (unassigned) do not match groups in the score matrix (adipocyte, endothelial cell of lymphatic vessel, endothelial cell, fibroblast, lymphocyte, mast cell, mesothelial cell, mural cell, myeloid cell, neural cell, regular atrial cardiac myocyte, regular ventricular cardiac myocyte) and will result in NA predictions.”


