In [None]:
# Build a candidate-gene list for screening with zebrafish crispants:
# for every potential candidate human gene, add summary data
# from Jin et al. and other sources.

In [2]:
library(ggplot2)
library(viridis)
library(dplyr)

In [110]:
# Load the two case-variant tables, reduce them to a shared set of columns,
# tag each row with its source table and stack them into `tab3`.

# Harmonised column names applied to both tables so they can be row-bound.
cols1 <- c('Cardiac.Category', 'EM', 'NDD', 'chr', 'pos', 'Gene', 'Variant_Class', 'pLI', 'HHE_rank')

# Table 1: rows are labelled 'LoF_het' in the `mutation` column.
tab1 <- read.csv('lof_het_mutations_cases.csv')
tab1 <- setNames(
    tab1[, c('Cardiac.Category', 'EM', 'NDD', 'CHR', 'POS', 'Gene', 'ExonicFunc.refGene',
             'pLI.Score', 'HHE.Rank')],
    cols1
)
tab1$mutation <- 'LoF_het'

# Table 2: rows are labelled 'denovo' in the `mutation` column.
# alternative input: read.csv('denovo_mutations_controls.csv')
tab2 <- read.csv('denovo_mutations_cases.csv')
tab2 <- setNames(
    tab2[, c('Cardiac.Category', 'EM', 'NDD', 'CHROM', 'POS', 'Gene', 'Variant_Class',
             'pLI.score', 'HHE.Rank')],
    cols1
)
tab2$mutation <- 'denovo'

tab3 <- rbind(tab1, tab2)

# Collapse spelling / whitespace variants of the cardiac category labels.
# Each entry maps a canonical label to the raw spellings it replaces; the
# entries are applied in the order listed.
category_aliases <- list(
    CTD     = c('CTD '),
    CTD_TGA = c('CTD (TGA)', ' CTD (TGA)'),
    AVC     = c('other (AVC)', 'Other (AVC)'),
    other   = c('OTHER', 'Other', 'other')
)
for (canonical in names(category_aliases)) {
    is_alias <- tab3$Cardiac.Category %in% category_aliases[[canonical]]
    tab3$Cardiac.Category[is_alias] <- canonical
}
tab3[1, ]

Unnamed: 0_level_0,Cardiac.Category,EM,NDD,chr,pos,Gene,Variant_Class,pLI,HHE_rank,mutation
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,other,No,Yes,12,58152532,MARCH9,frameshift_insertion,0.35,36.2,LoF_het


In [111]:
# get all genes with variants in human CHD-cases
genes <- unique(tab3$Gene)
length(genes)

# Cardiac-phenotype count columns; the dominant category is picked among
# these, with ties resolved in favour of the earlier column.
pheno_cols <- c('LVO', 'HTX', 'AVC', 'CTD', 'CTD_TGA', 'other', 'none')

# Per-gene summary, one row per gene:
#   LoF / DMis    - number of variants from the LoF_het table / with class 'misD'
#   pheno_cols    - cardiac category counts over the LoF + 'misD' variants
#   cardiac       - name of the phenotype column with the highest count
#   extracardiac  - fraction of the gene's variants with EM == 'Yes'
#   neuro         - fraction of the gene's variants with NDD == 'Yes'
newdf <- data.frame(Gene = genes, LoF = 0, DMis = 0, LVO = 0, HTX = 0, AVC = 0, CTD = 0,
                    CTD_TGA = 0, other = 0, none = 0, cardiac = NA_character_,
                    extracardiac = 0, neuro = 0, stringsAsFactors = FALSE)

for (ii in seq_along(genes)) {
    # rows of tab3 that carry a variant in this gene
    ids <- which(tab3$Gene == genes[ii])

    # both masks are aligned with `ids`, not with the rows of tab3
    is_lof <- tab3$mutation[ids] == 'LoF_het'
    is_dmis <- tab3$Variant_Class[ids] == 'misD'
    newdf$LoF[ii] <- sum(is_lof)
    newdf$DMis[ii] <- sum(is_dmis)

    # fraction of variants with other phenotype
    newdf$extracardiac[ii] <- sum(tab3$EM[ids] == 'Yes') / length(ids)
    newdf$neuro[ii] <- sum(tab3$NDD[ids] == 'Yes') / length(ids)

    # add no. of cardiac phenotypes for LoF/D-Mis
    # The hits from which() are positions within `ids`, so they must be mapped
    # back through `ids` before indexing tab3. Indexing tab3 directly would
    # read the categories of rows 1..length(ids) instead of this gene's rows.
    damaging_rows <- ids[which(is_lof | is_dmis)]
    vec1 <- tab3$Cardiac.Category[damaging_rows]
    if (length(vec1) > 0) {
        newdf$LVO[ii] <- sum(vec1 == 'LVO')
        newdf$HTX[ii] <- sum(vec1 == 'HTX')
        newdf$AVC[ii] <- sum(vec1 == 'AVC')

        newdf$CTD[ii] <- sum(vec1 == 'CTD')
        newdf$CTD_TGA[ii] <- sum(vec1 == 'CTD_TGA')
        newdf$other[ii] <- sum(vec1 == 'other')
        # an empty category string is counted as 'none'
        newdf$none[ii] <- sum(vec1 == '')
    } else {
        # no LoF / 'misD' variant for this gene
        newdf$none[ii] <- 1
    }

    # dominant cardiac category; columns are selected by name rather than
    # by position so that reordering newdf cannot silently shift the window
    newdf$cardiac[ii] <- names(which.max(unlist(newdf[ii, pheno_cols])))
}

# gene-level annotations are taken from the first tab3 row of each gene
first_row <- match(newdf$Gene, tab3$Gene)
newdf$pLI <- tab3$pLI[first_row]
newdf$mouse_exp_rank <- tab3$HHE_rank[first_row]
newdf[1, ]

Unnamed: 0_level_0,Gene,LoF,DMis,LVO,HTX,AVC,CTD,CTD_TGA,other,none,cardiac,extracardiac,neuro,pLI,mouse_exp_rank
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,MARCH9,1,0,0,0,0,0,0,1,0,other,0,1,0.35,36.2


In [112]:
# number of genes with both a LoF and a D-Mis variant, then with at least one of the two
with(newdf, sum(LoF > 0 & DMis > 0))
with(newdf, sum(LoF > 0 | DMis > 0))

In [113]:
# known CHD gene or not - from Jin's list of 253 human/mouse CHD genes
jin <- read.csv('Jin_curated_CHD_genes.csv')
dim(jin)
jin[1, ]

# 1 when the gene symbol appears in the curated list, 0 otherwise
newdf$known_CHD <- as.numeric(newdf$Gene %in% jin$Gene)
newdf[1, ]

# genes with a LoF or D-Mis variant that are not in the curated list
with(newdf, sum((LoF > 0 | DMis > 0) & known_CHD == 0))

Unnamed: 0_level_0,Gene,Gene.Set,Inheritance,HHE.Rank,pLI.Score,X..of.Observed.Damaging.RGs.in.Cases,observed.Cardiac.Phenotype,observed.EM,observed.NDD
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<chr>,<chr>,<chr>
1,ABCC9,Human CHD Gene,Monoallelic,90.3,0,0,,,


Unnamed: 0_level_0,Gene,LoF,DMis,LVO,HTX,AVC,CTD,CTD_TGA,other,none,cardiac,extracardiac,neuro,pLI,mouse_exp_rank,known_CHD
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
1,MARCH9,1,0,0,0,0,0,0,1,0,other,0,1,0.35,36.2,0


In [114]:
# tab-separated table with columns Intersect, nGenes and Gene_symbol
# (tab is read.delim's default separator)
deg1 <- read.delim('DEG_shared.txt')
dim(deg1)
deg1[1, ]

Unnamed: 0_level_0,Intersect,nGenes,Gene_symbol
Unnamed: 0_level_1,<chr>,<int>,<chr>
1,DCM HCM HLHS_DN-HLHS HLHS_fail TOF,34,BTG2 RTN4 FOSL2 ZNF189 HEPH AASS PM20D2 FRMD5 TIPARP C14orf132 MID1 ECE1 SOCS2 CEBPD PSMD8 C5orf46 YBX3 ADAMTS9-AS2 WNT9A NEURL1B FKBP5 TLE4 IRS2 PROS1 CEBPB KRBA1 TMTC1 LEPREL1 SCGB3A2 MAFF SLC7A8 PITPNB LPHN3 GADD45B


In [115]:
# get genes in each category
# Expand deg1 to long format: one row per (Intersect label, gene symbol) pair.
# Gene_symbol holds space-separated symbols, so the whole column is split in
# one call and each Intersect label is repeated once per symbol it owns.
# This avoids growing deg2 with rbind() inside a loop and stays correct when
# deg1 has no rows (`1:dim(deg1)[1]` would iterate over c(1, 0) in that case).
# Rows whose Gene_symbol is an empty string contribute no output rows.
gene_lists <- strsplit(deg1$Gene_symbol, ' ')
deg2 <- data.frame(Disease = rep(deg1$Intersect, times = lengths(gene_lists)),
                   Gene = as.character(unlist(gene_lists)))
deg2[1:2,]

Unnamed: 0_level_0,Disease,Gene
Unnamed: 0_level_1,<chr>,<chr>
1,DCM HCM HLHS_DN-HLHS HLHS_fail TOF,BTG2
2,DCM HCM HLHS_DN-HLHS HLHS_fail TOF,RTN4


In [116]:
# CM3 - disease CMs
# load the per-cluster DEG table and keep only the rows of cluster 'CM3'
deg3 <- read.csv('DEG_CM.csv')
is_cm3 <- deg3$cluster == 'CM3'
deg3 <- deg3[is_cm3, ]
deg3[1, ]

Unnamed: 0_level_0,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,cluster,gene
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
562,0,1.385904,1,0.963,0,CM3,PDK4


In [117]:
# add deg and CM3 deg to our list
# rows of newdf whose gene appears in the long-format DEG table
ids <- which(newdf$Gene %in% deg2$Gene)
newdf$CHD_DEG <- replace(numeric(nrow(newdf)), ids, 1)

# Disease label of the first deg2 row matching each gene; NA for genes
# that are not in deg2
disease_of <- deg2$Disease[match(newdf$Gene, deg2$Gene)]
newdf$Disease <- NA
newdf$Disease[ids] <- disease_of[ids]

# 1 when the gene is listed in the CM3 DEG table, 0 otherwise
newdf$CM_DEG <- as.numeric(newdf$Gene %in% deg3$gene)
newdf[1, ]

Unnamed: 0_level_0,Gene,LoF,DMis,LVO,HTX,AVC,CTD,CTD_TGA,other,none,cardiac,extracardiac,neuro,pLI,mouse_exp_rank,known_CHD,CHD_DEG,Disease,CM_DEG
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>
1,MARCH9,1,0,0,0,0,0,0,1,0,other,0,1,0.35,36.2,0,0,,0


In [118]:
# number of genes flagged as CM3 DEGs
sum(newdf[['CM_DEG']])

In [119]:
# add TF or chromatin modifier?
# gene-to-GO table (space-separated); keep only rows annotated with GO:0016569
# NOTE(review): presumably the chromatin-modification GO term, going by the
# column this feeds later - confirm
chrom <- read.delim('/data/CoCoCoNet/gene2go/human_gene2go.csv', sep = ' ')
is_target_term <- chrom$GO_term == 'GO:0016569'
chrom <- chrom[is_target_term, ]

# translate NetworkIDs to gene symbols and drop rows without a symbol
ginfo <- read.csv('/data/CoCoCoNet/geneInfo/human_info.csv')
symbol_row <- match(chrom$NetworkIDs, ginfo$NetworkIDs)
chrom$Gene <- ginfo$GeneSymbol[symbol_row]
chrom <- chrom[!is.na(chrom$Gene), ]
dim(chrom)
chrom[1, ]

# add human TF info
# tab-separated table (read.delim's default separator)
tf <- read.delim('~/septation/Homo_sapiens_TF.txt')
tf[1, ]

Unnamed: 0_level_0,NetworkIDs,GO_term,Gene
Unnamed: 0_level_1,<chr>,<chr>,<chr>
728374,ENSG00000136518,GO:0016569,ACTL6A


Unnamed: 0_level_0,Species,Symbol,Ensembl,Family,Protein,Entrez_ID
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>
1,Homo_sapiens,ATF1,ENSG00000123268,TF_bZIP,ENSP00000262053.3;,466


In [120]:
# 1 when the gene symbol is present in the TF table, 0 otherwise
newdf$TF <- as.numeric(newdf$Gene %in% tf$Symbol)

# 1 when the gene symbol is present in the filtered GO table, 0 otherwise
newdf$chromatin_modifier <- as.numeric(newdf$Gene %in% chrom$Gene)
newdf[1, ]

Unnamed: 0_level_0,Gene,LoF,DMis,LVO,HTX,AVC,CTD,CTD_TGA,other,none,⋯,extracardiac,neuro,pLI,mouse_exp_rank,known_CHD,CHD_DEG,Disease,CM_DEG,TF,chromatin_modifier
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
1,MARCH9,1,0,0,0,0,0,0,1,0,⋯,0,1,0.35,36.2,0,0,,0,0,0


In [121]:
# save
# comma-separated, with a header row, no row names and no quoting of strings
write.table(newdf, 'human_CHD_case_genes_summary.csv', quote = FALSE, sep = ',',
            row.names = FALSE, col.names = TRUE)