pacman::p_load(tidyverse, janitor, CellChat)
setwd(this.path::here())
rm(list = ls())
# File creation
if (! dir.exists("../data/db_utils")) sapply("../data/db_utils", dir.create, recursive = TRUE)
# Function for summarizing overlap between databases
join_summarize <- function(db1, db2){
db1$LR <- paste0( db1$ligand, "|", db1$receptor )
db2$LR <- paste0( db2$ligand, "|", db2$receptor )
comb <- inner_join(db1, db2, by = "LR", suffix = c("_db1", "_db2"))
cat(" # ligand-receptors =", comb %>% distinct(LR) %>% nrow(), "\n",
"# simple ligands =", comb %>% filter(!complex_ligand_db1) %>% distinct(ligand_db1) %>% nrow(), "\n",
"# simple receptors =", comb %>% filter(!complex_receptor_db1) %>% distinct(receptor_db1) %>% nrow(), "\n",
"# complex ligands =", comb %>% filter(complex_ligand_db1) %>% distinct(ligand_db1) %>% nrow(), "\n",
"# complex receptors =", comb %>% filter(complex_receptor_db1) %>% distinct(receptor_db1) %>% nrow(), "\n"
)
return(comb %>% distinct(LR, .keep_all = TRUE))
}Database harmonization by scpeakeR - Part 1
Database harmonization between CellChatDB.human(v2.2.0) and CellPhoneDB (V5.0.1)
Introduction
In this document, we outline the database harmonization steps taken to determine the degree of overlap between CellChat and CellPhoneDB, which are listed as:
- Step 1: Converting gene symbols to Uniprot ID format in CellChatDB
- Step 2: Identifying duplicates, missing or invalid entries in CellChatDB and CellPhoneDB
- Step 3: Convert complexes in CellPhoneDB to Uniprot ID format
- Step 4: Sorting protein subunits of complexes in both databases
- Step 5: Swapping ligand-receptor pairs to identify missed overlaps
- Step 6: Partial mapping of complexes
Additionally, we describe the process to customize CellChatDB and CellPhoneDB with the revised changes implemented.
Load required packages and download CellChatDB and CellPhoneDB databases
Load required packages
NOTE: join_summarize() returns a summary table that includes the number of overlapping interactions across two databases, which we define as a match of the ligand|receptor pair, the number of simple ligands and receptors made up of a single gene, as well as the number of overlapping complexes made up of more than one gene. From CellChat (v2) and CellPhoneDB (v5) onward, both packages included non-protein signaling interactions in their latest databases. As such, interactions involving non-protein signaling may contain only one gene, should the hormone/molecule be produced by only one enzyme in its biosynthesis.
STEP 1: Map Gene Symbol to UniProt in CellChatDB
CellChatDB.human (CCDB) defines ligands and receptors using gene symbols, whereas CellPhoneDB (CPDB) defines them using UniProt IDs. To harmonize the two databases, we used the geneInfo slot to map CCDB gene symbols to their corresponding UniProt IDs. For example, the gene symbol EGF was converted to P01133.
CellChat version 2.2.0 stores the database within R under CellChatDB.human.
pacman::p_version("CellChat")[1] '2.2.0'
Prepare CCDB gene-to-uniprot conversion
# Gene symbol to UniProt mapping
map <- CellChatDB.human$geneInfo %>%
select(Symbol, EntryID.uniprot) %>%
drop_na() %>%
deframe()
map %>% length()[1] 26823
map %>% head() A1BG A1CF A2M A2ML1 A3GALT2 A4GALT
"P04217" "Q9NQ94" "P01023" "A8K2U0" "U3KPV4" "Q9NPC4"
Map complex to UniProt complexes and concatenate
CCDB contains complex interaction partners composed of multiple protien subunits. We convert the gene symbols of these proteins to Uniprot ID format.
complex <- CellChatDB.human$complex %>%
mutate(complex_name = rownames(.))
complex %>% dim()[1] 338 6
complex %>% head() subunit_1 subunit_2 subunit_3 subunit_4 subunit_5 complex_name
Activin AB INHBA INHBB Activin AB
Inhibin A INHA INHBA Inhibin A
Inhibin B INHA INHBB Inhibin B
IL12AB IL12A IL12B IL12AB
IL23 complex IL12B IL23A IL23 complex
IL27 complex IL27 EBI3 IL27 complex
example <- c("12oxoLTB4-PTGR1", "TREM2_TYROBP", "RARA_RXRA_CRABP2",
"Pregnenolone-CYP11A1", "HTR3_complex")Before converting the gene symbols to Uniprot ID format, complex looks like:
| subunit_1 | subunit_2 | subunit_3 | subunit_4 | subunit_5 | complex_name | |
|---|---|---|---|---|---|---|
| 12oxoLTB4-PTGR1 | PTGR1 | 12oxoLTB4-PTGR1 | ||||
| TREM2_TYROBP | TREM2 | TYROBP | TREM2_TYROBP | |||
| RARA_RXRA_CRABP2 | RARA | RXRA | CRABP2 | RARA_RXRA_CRABP2 | ||
| Pregnenolone-CYP11A1 | CYP11A1 | FDX2 | FDX1 | FDXR | Pregnenolone-CYP11A1 | |
| HTR3_complex | HTR3A | HTR3B | HTR3C | HTR3D | HTR3E | HTR3_complex |
# Perfrom gene-to-uniprot conversion
complex$subunit_1 <- map[complex$subunit_1]
complex$subunit_2 <- map[complex$subunit_2]
complex$subunit_3 <- map[complex$subunit_3]
complex$subunit_4 <- map[complex$subunit_4]
complex$subunit_5 <- map[complex$subunit_5]After converting the gene symbols to Uniprot ID format, complex now looks like:
| subunit_1 | subunit_2 | subunit_3 | subunit_4 | subunit_5 | complex_name | |
|---|---|---|---|---|---|---|
| 12oxoLTB4-PTGR1 | Q14914 | NA | NA | NA | NA | 12oxoLTB4-PTGR1 |
| TREM2_TYROBP | Q9NZC2 | O43914 | NA | NA | NA | TREM2_TYROBP |
| RARA_RXRA_CRABP2 | P10276 | P19793 | P29373 | NA | NA | RARA_RXRA_CRABP2 |
| Pregnenolone-CYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | NA | Pregnenolone-CYP11A1 |
| HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | HTR3_complex |
We concatenate the Uniprot IDs of the protein subunits for each complex to form a single character value which represents it.
complex <- complex %>% select(-complex_name) # remove unused "complex_name" column
complex <- apply(complex, 1, function(x) paste(na.omit(x), collapse = "_"))
complex[which(names(complex) %in% example)] 12oxoLTB4-PTGR1 TREM2_TYROBP
"Q14914" "Q9NZC2_O43914"
RARA_RXRA_CRABP2 Pregnenolone-CYP11A1
"P10276_P19793_P29373" "P05108_Q6P4F2_P10109_P22570"
HTR3_complex
"P46098_O95264_Q8WXA8_Q70Z44_A5X5Y0"
Convert all interaction partners in CCDB to their respective Uniprot ID format
CCDB stores all 3233 interactions within R under CellChatDB.human$interaction.
# Include complex (Uniprot) to map
map <- c(map, complex)
CC.int <- CellChatDB.human$interaction
CC.int %>% dim()[1] 3233 28
CC.int %>% glimpse()Rows: 3,233
Columns: 28
$ interaction_name <chr> "TGFB1_TGFBR1_TGFBR2", "TGFB2_TGFBR1_TGFBR2",…
$ pathway_name <chr> "TGFb", "TGFb", "TGFb", "TGFb", "TGFb", "TGFb…
$ ligand <chr> "TGFB1", "TGFB2", "TGFB3", "TGFB1", "TGFB1", …
$ receptor <chr> "TGFbR1_R2", "TGFbR1_R2", "TGFbR1_R2", "ACVR1…
$ agonist <chr> "TGFb agonist", "TGFb agonist", "TGFb agonist…
$ antagonist <chr> "TGFb antagonist", "TGFb antagonist", "TGFb a…
$ co_A_receptor <chr> "", "", "", "", "", "", "", "", "", "", "", "…
$ co_I_receptor <chr> "TGFb inhibition receptor", "TGFb inhibition …
$ annotation <chr> "Secreted Signaling", "Secreted Signaling", "…
$ interaction_name_2 <chr> "TGFB1 - (TGFBR1+TGFBR2)", "TGFB2 - (TGFBR1+T…
$ evidence <chr> "KEGG: hsa04350", "KEGG: hsa04350", "KEGG: hs…
$ is_neurotransmitter <chr> "FALSE", "FALSE", "FALSE", "FALSE", "FALSE", …
$ ligand.symbol <chr> "TGFB1", "TGFB2", "TGFB3", "TGFB1", "TGFB1", …
$ ligand.family <chr> "TGF-beta", "TGF-beta", "TGF-beta", "TGF-beta…
$ ligand.location <chr> "Extracellular matrix, Secreted, Extracellula…
$ ligand.keyword <chr> "Disease variant, Signal, Reference proteome,…
$ ligand.secreted_type <chr> "growth factor", "growth factor", "cytokine;g…
$ ligand.transmembrane <chr> "FALSE", "FALSE", "TRUE", "FALSE", "FALSE", "…
$ receptor.symbol <chr> "TGFBR2, TGFBR1", "TGFBR2, TGFBR1", "TGFBR2, …
$ receptor.family <chr> "Protein kinase superfamily, TKL Ser/Thr prot…
$ receptor.location <chr> "Cell membrane, Secreted, Membrane raft, Cell…
$ receptor.keyword <chr> "Membrane, Secreted, Disulfide bond, Kinase, …
$ receptor.surfaceome_main <chr> "Receptors", "Receptors", "Receptors", "Recep…
$ receptor.surfaceome_sub <chr> "Act.TGFB;Kinase", "Act.TGFB;Kinase", "Act.TG…
$ receptor.adhesome <chr> "", "", "", "", "", "", "", "", "", "", "", "…
$ receptor.secreted_type <chr> "", "", "", "", "", "", "", "", "", "", "", "…
$ receptor.transmembrane <chr> "TRUE", "TRUE", "TRUE", "TRUE", "TRUE", "TRUE…
$ version <chr> "CellChatDB v1", "CellChatDB v1", "CellChatDB…
CC.int$complex_ligand <- CC.int$ligand %in% names(complex)
CC.int$complex_receptor <- CC.int$receptor %in% names(complex)
tmp <- join_summarize(CC.int, CC.int) # ligand-receptors = 3233
# simple ligands = 664
# simple receptors = 576
# complex ligands = 136
# complex receptors = 204
# Keeping gene/complex name of original database
CC.int$ligand_name <- CC.int$ligand
CC.int$receptor_name <- CC.int$receptor
# Map Gene Symbol to UniProt in CCDB
CC.int$ligand <- map[CC.int$ligand]
CC.int$receptor <- map[CC.int$receptor]
CC.int %>%
select(interaction_name, pathway_name, ligand_name, ligand, receptor_name, receptor) %>%
head() interaction_name pathway_name ligand_name ligand
TGFB1_TGFBR1_TGFBR2 TGFB1_TGFBR1_TGFBR2 TGFb TGFB1 P01137
TGFB2_TGFBR1_TGFBR2 TGFB2_TGFBR1_TGFBR2 TGFb TGFB2 P61812
TGFB3_TGFBR1_TGFBR2 TGFB3_TGFBR1_TGFBR2 TGFb TGFB3 P10600
TGFB1_ACVR1B_TGFBR2 TGFB1_ACVR1B_TGFBR2 TGFb TGFB1 P01137
TGFB1_ACVR1C_TGFBR2 TGFB1_ACVR1C_TGFBR2 TGFb TGFB1 P01137
TGFB2_ACVR1B_TGFBR2 TGFB2_ACVR1B_TGFBR2 TGFb TGFB2 P61812
receptor_name receptor
TGFB1_TGFBR1_TGFBR2 TGFbR1_R2 P36897_P37173
TGFB2_TGFBR1_TGFBR2 TGFbR1_R2 P36897_P37173
TGFB3_TGFBR1_TGFBR2 TGFbR1_R2 P36897_P37173
TGFB1_ACVR1B_TGFBR2 ACVR1B_TGFbR2 P36896_P37173
TGFB1_ACVR1C_TGFBR2 ACVR1C_TGFbR2 Q8NER5_P37173
TGFB2_ACVR1B_TGFBR2 ACVR1B_TGFbR2 P36896_P37173
Download CellPhoneDB database
basepath <- "https://raw.github.com/ventolab/cellphonedb-data/refs/heads/master/data"
# Load required files for database customization
CPDB.int <- read_csv(url(file.path(basepath, "interaction_input.csv")))
complex <- read_csv(url(file.path(basepath, "complex_input.csv")))Prepare CPDB for database comparison – Step 1
CPDB.int$ligand <- CPDB.int$partner_a
CPDB.int$receptor <- CPDB.int$partner_b
CPDB.int$complex_ligand <- CPDB.int$ligand %in% complex$complex_name
CPDB.int$complex_receptor <- CPDB.int$receptor %in% complex$complex_name
tmp <- join_summarize(CPDB.int, CPDB.int) # ligand-receptors = 2911
# simple ligands = 611
# simple receptors = 493
# complex ligands = 150
# complex receptors = 216
rm(map, tmp)
# Step 1 join
step1 <- join_summarize(CC.int, CPDB.int) # ligand-receptors = 585
# simple ligands = 327
# simple receptors = 288
# complex ligands = 0
# complex receptors = 0
rm(step1)After Step 1 of Database Harmonization, we observed that there were only 585 overlapping interactions across CCDB and CPDB. As expected, no complex interaction partners across both databases overlapped since CPDB complexes were not converted to Uniprot IDs in this step.
STEP 2: Addressing duplicated or invalid mappings
Finding duplicate LR pairs in CCDB after gene-to-uniprot conversion
CC.int %>% count(ligand, receptor) %>% filter(n > 1) # 16 ligand receptor n
1 P01189 P35372 2
2 P01189 P41143 2
3 P01189 P41145 2
4 P01258 P30988_O60894 2
5 P01258 P30988_O60895 2
6 P01258 P30988_O60896 2
7 P10092 P30988_O60894 2
8 P10092 P30988_O60895 2
9 P10092 P30988_O60896 2
10 P10997 P30988_O60894 2
11 P10997 P30988_O60895 2
12 P10997 P30988_O60896 2
13 P35318 P30988_O60894 2
14 P35318 P30988_O60895 2
15 P35318 P30988_O60896 2
16 <NA> P32246 2
CC.int <- CC.int %>%
mutate(LR = paste(ligand, receptor, sep = "|"))
dups <- CC.int %>%
count(ligand, receptor) %>%
filter(n > 1) %>%
select(-n) %>%
mutate(LR = paste(ligand, receptor, sep = "|"))
left_join( dups, CC.int ) %>% select(ligand_name, ligand, receptor_name, receptor) ligand_name ligand receptor_name receptor
1 POMC P01189 OPRM1 P35372
2 b-Endorphin-POMC P01189 OPRM1 P35372
3 POMC P01189 OPRD1 P41143
4 b-Endorphin-POMC P01189 OPRD1 P41143
5 POMC P01189 OPRK1 P41145
6 b-Endorphin-POMC P01189 OPRK1 P41145
7 CALCA P01258 CALCR_RAMP1 P30988_O60894
8 CALCA P01258 CALCRL_RAMP1 P30988_O60894
9 CALCA P01258 CALCR_RAMP2 P30988_O60895
10 CALCA P01258 CALCRL_RAMP2 P30988_O60895
11 CALCA P01258 CALCR_RAMP3 P30988_O60896
12 CALCA P01258 CALCRL_RAMP3 P30988_O60896
13 CALCB P10092 CALCR_RAMP1 P30988_O60894
14 CALCB P10092 CALCRL_RAMP1 P30988_O60894
15 CALCB P10092 CALCR_RAMP2 P30988_O60895
16 CALCB P10092 CALCRL_RAMP2 P30988_O60895
17 CALCB P10092 CALCR_RAMP3 P30988_O60896
18 CALCB P10092 CALCRL_RAMP3 P30988_O60896
19 IAPP P10997 CALCR_RAMP1 P30988_O60894
20 IAPP P10997 CALCRL_RAMP1 P30988_O60894
21 IAPP P10997 CALCR_RAMP2 P30988_O60895
22 IAPP P10997 CALCRL_RAMP2 P30988_O60895
23 IAPP P10997 CALCR_RAMP3 P30988_O60896
24 IAPP P10997 CALCRL_RAMP3 P30988_O60896
25 ADM P35318 CALCR_RAMP1 P30988_O60894
26 ADM P35318 CALCRL_RAMP1 P30988_O60894
27 ADM P35318 CALCR_RAMP2 P30988_O60895
28 ADM P35318 CALCRL_RAMP2 P30988_O60895
29 ADM P35318 CALCR_RAMP3 P30988_O60896
30 ADM P35318 CALCRL_RAMP3 P30988_O60896
31 CCL3L1 <NA> CCR1 P32246
32 CCL3L3 <NA> CCR1 P32246
rm(dups)Identifying missing/invalid entries of simple genes
CC.int$interaction_name[which(is.na(CC.int$ligand))] [1] "CCL3L1_CCR1" "CCL3L3_CCR1" "CCL3L1_ACKR2"
CC.int$interaction_name[which(is.na(CC.int$receptor))] [1] "RARRES2_GPR1"
We loaded the uniprotkb database of only reviewed Uniprot IDs to identify interaction partners not successfully mapped.
# Load Uniprot Database
uniprotkb <- read_tsv(url("https://rest.uniprot.org/uniprotkb/stream?download=true&fields=accession%2Cid%2Cprotein_name%2Cgene_names%2Ccc_subcellular_location%2Ccc_pathway%2Clit_pubmed_id%2Cxref_kegg&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29+AND+%28model_organism%3A9606%29")) %>%
clean_names() %>%
filter(! is.na(gene_names)) %>%
rowwise() %>%
mutate(n_genes = length(str_split_1(gene_names, pattern = " "))) %>%
ungroup() %>%
separate(gene_names, into = paste0("gene_", 1:max(.$n_genes)), sep = " ") %>%
pivot_longer(starts_with("gene"), values_to = "gene") %>%
filter(! is.na(gene)) %>%
select(-n_genes, -name) %>%
relocate(entry, entry_name, gene)
# Checking for non-uniprot entries
CC.int %>% filter(! complex_ligand, ! ligand %in% uniprotkb$entry) %>% pull(ligand, ligand_name) # ligand: CCL3L1, CCL3L3, IFNA1, IFNA13 invalid CCL3L1 CCL3L3 CCL3L1 IFNA1
NA NA NA "L0N195"
CC.int %>% filter(! complex_receptor, ! receptor %in% uniprotkb$entry) %>% pull(receptor, receptor_name) # receptor: GPR1 missing an entryGPR1
NA
Manually editing problematic entries
tmp <- CC.int %>%
mutate(ligand = case_when(ligand_name == "b-Endorphin-POMC" ~ "P01189betaEndo",
ligand_name == "CCL3L1" ~ "P16619",
ligand_name == "CCL3L3" ~ "P16619byCCL3L3",
ligand_name == "IFNA1" ~ "P01562",
ligand_name == "IFNA13" ~ "P01562byIFNA13",
.default = ligand),
receptor = case_when(grepl("^CALCRL_", receptor_name) ~ sub("P30988", "Q16602", receptor),
interaction_name == "RARRES2_GPR1" ~ "P46091",
.default = receptor),
interaction_name = case_when(ligand_name == "IFNA10" ~ "IFNA10_IFNAR1_IFNAR2",
ligand_name == "IFNA13" ~ "IFNA13_IFNAR1_IFNAR2",
.default = interaction_name),
LR = paste(ligand, receptor, sep = "|")) %>%
rename(interaction_name_new = interaction_name, ligand_new = ligand, receptor_new = receptor)
# Apply changes to CCDB
CC.int <- tmp %>%
rename_with(~ sub("_new", "", .), ends_with("_new"))Finding duplicate LR pairs in CPDB after gene-to-uniprot conversion
CellPhoneDB does not have any duplicated interactions at this stage
CPDB.int %>% count(ligand, receptor) %>% filter(n > 1) # 0 # A tibble: 0 × 3
# ℹ 3 variables: ligand <chr>, receptor <chr>, n <int>
Checking for invalid entries
CPDB.int %>% filter(! complex_ligand, ! ligand %in% uniprotkb$entry) %>% pull(ligand) # ligand: HLAA, HLAB, HLAC, IFNA1 not uniprot ID format [1] "HLAA" "HLAB" "HLAC" "HLAC" "IFNA1"
CPDB.int %>% filter(! complex_receptor, ! receptor %in% uniprotkb$entry) %>% pull(receptor) # receptor: no invalid uniprot IDscharacter(0)
rm(uniprotkb)
CPDB.int <- CPDB.int %>%
mutate(ligand = case_when(ligand == "HLAA" ~ "P04439",
ligand == "HLAB" ~ "P01889",
ligand == "HLAC" ~ "P10321",
ligand == "IFNA1" ~ "P01562",
.default = ligand))# Step 2 join
step2 <- join_summarize(CC.int, CPDB.int) # ligand-receptors = 591
# simple ligands = 331
# simple receptors = 291
# complex ligands = 0
# complex receptors = 0
rm(diff, tmp, step2)After Step 2 of Database Harmonization, we observed that there was a slight increase in the number of overlapping interactions (585 to 591) across CCDB and CPDB.
STEP 3: Change nomenclature for CellPhoneDB
In Step 3, we convert the CPDB native complex names to their respective Uniprot ID concatenated character value, mirroring Step 1 for CCDB.
example <- c("12oxoLeukotrieneB4_byPTGR1", "TREM2_receptor", "RAreceptor_RARA_RXRA",
"Pregnenolone_byCYP11A1", "5HTR3_complex")Before concatenating the Uniprot IDs of the protein subunits, complex looks like:
| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 |
|---|---|---|---|---|---|
| 12oxoLeukotrieneB4_byPTGR1 | Q14914 | NA | NA | NA | NA |
| Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | NA |
| RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | NA | NA |
| TREM2_receptor | Q9NZC2 | O43914 | NA | NA | NA |
| 5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 |
We convert the complex names to their respective single character value in Uniprot ID format.
complex <- complex %>%
column_to_rownames("complex_name") %>%
select(starts_with("uniprot"))
# Construct CPDB complex map
complex <- apply(complex, 1, function(x) paste(na.omit(x), collapse = "_"))
complex[which(names(complex) %in% example)] 12oxoLeukotrieneB4_byPTGR1 Pregnenolone_byCYP11A1
"Q14914" "P05108_Q6P4F2_P10109_P22570"
RAreceptor_RARA_RXRA TREM2_receptor
"P10276_P19793_P29373" "Q9NZC2_O43914"
5HTR3_complex
"P46098_O95264_Q8WXA8_Q70Z44_A5X5Y0"
CPDB.int$ligand <- ifelse(CPDB.int$complex_ligand, complex[CPDB.int$ligand], CPDB.int$ligand)
CPDB.int$receptor <- ifelse(CPDB.int$complex_receptor, complex[CPDB.int$receptor], CPDB.int$receptor)
CPDB.int$ligand <- ifelse(CPDB.int$partner_a == "b-Endorphin_byPOMC", "P01189betaEndo", CPDB.int$ligand) # match changes done for same interaction in CCDB
# Step 3 join
step3 <- join_summarize(CC.int, CPDB.int) # ligand-receptors = 1760
# simple ligands = 411
# simple receptors = 390
# complex ligands = 121
# complex receptors = 122
rm(complex, step3)STEP 4: Sort subunits
We observed that some complexes in CCDB and CPDB contained the same protein subunits, but the order in which they were tabulated differed:
Examples
CC.int %>% filter(ligand_name == "IL27 complex") %>% pull(ligand)[1] "Q8NEV9_Q14213"
CPDB.int %>% filter(partner_a == "IL27") %>% pull(ligand)[1] "Q14213_Q8NEV9"
We sorted the protein subunits making up the complexes in both database
sort_paste <- function(x) paste(sort(x), collapse = "_")
CC.int$ligand <- sapply(strsplit(CC.int$ligand, split = "_" ), sort_paste )
CC.int$receptor <- sapply(strsplit(CC.int$receptor, split = "_" ), sort_paste )
CPDB.int$ligand <- sapply(strsplit(CPDB.int$ligand, split = "_" ), sort_paste )
CPDB.int$receptor <- sapply(strsplit(CPDB.int$receptor, split = "_" ), sort_paste )
# Finding duplicated interactions once again after sorting of subunits
## CellChatDB
tmp <- join_summarize(CC.int, CC.int) # ligand-receptors = 3233
# simple ligands = 664
# simple receptors = 576
# complex ligands = 134
# complex receptors = 203
No duplicated LR pairs were observed in CCDB after sorting subunits of complexes.
## CellPhoneDB
tmp <- join_summarize(CPDB.int, CPDB.int) # ligand-receptors = 2909
# simple ligands = 611
# simple receptors = 493
# complex ligands = 146
# complex receptors = 215
As indicated in the warning above, we observed duplicated complex keys after sorting subunits in CPDB:
dups <- CPDB.int %>%
count(ligand, receptor) %>%
filter(n > 1) %>%
select(-n) %>%
mutate(LR = paste(ligand, receptor, sep = "|"))We manually differentiated these complexes as such:
left_join( dups, CPDB.int)# A tibble: 4 × 24
ligand receptor LR id_cp_interaction partner_a partner_b protein_name_a
<chr> <chr> <chr> <lgl> <chr> <chr> <chr>
1 P09917_P2… Q9Y271 P099… NA Leukotri… Q9Y271 <NA>
2 P09917_P2… Q9Y271 P099… NA LipoxinA… Q9Y271 <NA>
3 Q8IZI9 Q08334_… Q8IZ… NA Q8IZI9 IL28_rec… IFNL3_HUMAN
4 Q8IZI9 Q08334_… Q8IZ… NA Q8IZI9 Type_III… IFNL3_HUMAN
# ℹ 17 more variables: protein_name_b <chr>, annotation_strategy <chr>,
# source <chr>, is_ppi <lgl>, curator <chr>, reactome_complex <chr>,
# reactome_reaction <chr>, reactome_pathway <chr>,
# complexPortal_complex <lgl>, comments <chr>, version <chr>,
# interactors <chr>, classification <chr>, directionality <chr>,
# modulatory_effect <chr>, complex_ligand <lgl>, complex_receptor <lgl>
CPDB.int <- CPDB.int %>% mutate(ligand = ifelse(partner_a == "LeukotrieneC4_byLTC4S", "P20292_P09917_Q16873", ligand),
receptor = ifelse(partner_b == "Type_III_IFNR", "Q8IU57_Q08334", receptor))
CC.int <- CC.int %>% mutate(ligand = ifelse(ligand_name == "LTC4-LTC4S", "P20292_P09917_Q16873", ligand)) # Match name for same complex in CPDB
# After addressing duplicated LR pairs in CellPhoneDB
tmp <- join_summarize(CPDB.int, CPDB.int) # ligand-receptors = 2911
# simple ligands = 611
# simple receptors = 493
# complex ligands = 147
# complex receptors = 216
rm(dups, tmp, sort_paste)
# Step 4 join
step4 <- join_summarize(CC.int, CPDB.int) # ligand-receptors = 2165
# simple ligands = 453
# simple receptors = 390
# complex ligands = 125
# complex receptors = 170
After Step 4, we observed an increase in the number of overlapping ligand-receptor interactions attributed to the sorting process.
STEP 5: Swap partners for unmatched CPDB
Apart from internally standardizing the order of protein subunits within complexes, the order of the interaction partners affected the number of overlapping interactions as well. Since we defined interactions as ligand|receptor pair, we identified cases where interactions in CCDB were arranged as “ligand_A|receptor_B”, while the same interaction was arranged as “receptor_B|ligand_A” in CPDB. This is often the case for cell-contact interactions, where directionality is inconsequential.
We first extracted the interactions which did not overlapped in Step 4, and swapped the interaction partners for those unmatched interactions:
found <- step4 %>%
select(ligand = ligand_db2, receptor = receptor_db2) %>%
mutate(found = 1)
CPDB.int2 <- CPDB.int %>% left_join(found)
# Extracting rows which were previously NOT MATCHED to CellChatDB
to_swap <- which( is.na( CPDB.int2$found ) )
# Performing swap of ligand and receptor for unmatched interactions
for(i in to_swap){
CPDB.int2$ligand[i] <- CPDB.int$receptor[i]
CPDB.int2$receptor[i] <- CPDB.int$ligand[i]
CPDB.int2$partner_a[i] <- CPDB.int$partner_b[i]
CPDB.int2$partner_b[i] <- CPDB.int$partner_a[i]
CPDB.int2$protein_name_a[i] <- CPDB.int$protein_name_b[i]
CPDB.int2$protein_name_b[i] <- CPDB.int$protein_name_a[i]
CPDB.int2$complex_ligand[i] <- CPDB.int$complex_receptor[i]
CPDB.int2$complex_receptor[i] <- CPDB.int$complex_ligand[i]
}
step5 <- join_summarize(CC.int, CPDB.int2) # ligand-receptors = 2213
# simple ligands = 464
# simple receptors = 415
# complex ligands = 128
# complex receptors = 170
After swapping the interaction partners of the unmatched interactions only, we observed 48 more overlapping interactions.
flip <- step5 %>%
filter(LR %in% setdiff(step5$LR, step4$LR)) All swapped interactions were in the “Cell-Cell Contact” category which validates our approach.
flip %>% tabyl(annotation) # all are Cell-Cell Contact annotation n percent
Cell-Cell Contact 48 1
We now implement the swap to the original CPDB
flipped <- flip$interactors # used for customizing CellPhoneDB database later
# Flipping only interactions that mapped after swapping interaction partners
CPDB.int$flip_LR <- paste(CPDB.int$receptor, CPDB.int$ligand, sep = "|")
to_flip <- which(CPDB.int$flip_LR %in% flip$LR)
cols <- c("ligand", "receptor", "partner_a", "partner_b", "protein_name_a",
"protein_name_b", "complex_ligand", "complex_receptor")
for (col in cols) {
CPDB.int[[col]][to_flip] <- CPDB.int2[[col]][to_flip]
}
CPDB.int$flip_LR <- NULL
rm(CPDB.int2, flip, found, to_flip, to_swap, step4, step5)STEP 6: Addressing partial matches
Within the complex database in both tools, we observed instances of complexes having the similar names but differ in their subunits.
CCDB complex
CC.complex <- CellChatDB.human$complex %>%
rownames_to_column("CellChat_complex") %>%
select(CellChat_complex) %>%
left_join(CC.int %>% select(ligand_name, ligand), by = c("CellChat_complex" = "ligand_name")) %>%
left_join(CC.int %>% select(receptor_name, receptor), by = c("CellChat_complex" = "receptor_name")) %>%
distinct() %>%
mutate(CellChat_uniprot = ifelse(is.na(ligand), receptor, ligand)) %>%
select(-ligand, -receptor) %>%
filter(!is.na(CellChat_uniprot)) # some complexes have NAs for subunitsCPDB complex
CPDB.complex <- read_csv(url(file.path(basepath, "complex_input.csv"))) %>%
select(CellPhoneDB_complex = complex_name) %>%
left_join(CPDB.int %>% select(partner_a, ligand), by = c("CellPhoneDB_complex" = "partner_a")) %>%
left_join(CPDB.int %>% select(partner_b, receptor), by = c("CellPhoneDB_complex" = "partner_b")) %>%
distinct() %>%
mutate(CellPhoneDB_uniprot = ifelse(is.na(ligand), receptor, ligand)) %>%
select(-ligand, -receptor)found <- CC.complex %>%
inner_join(CPDB.complex, by = c("CellChat_uniprot" = "CellPhoneDB_uniprot"))
CC.only <- CC.complex %>%
filter(! CellChat_complex %in% found$CellChat_complex)
CPDB.only <- CPDB.complex %>%
filter(! CellPhoneDB_complex %in% found$CellPhoneDB_complex)
rm(CC.complex, CPDB.complex, found)To identify partial matches, we calculated the jaccard index of the complex subunit composition in both databases, defining a partial match as complexes with a jaccard index more than or equal to 0.50.
\[ Jaccard Index = (number of identical subunits) / (total number of shared subunits) \] Suppose we have a complexes made up of subunitA_subunitB in CCDB and another similar complexes made up of subunitA_subunitB_subunitC. The pairwise jaccard index for these two complexes will be 0.667 (3 d.p.) since they both share subunits A and B, and the number of unique subunits is 3 (subunits A, B and C).
Finding potential overlapping subunits in complexes across both databases
find_jaccard_index <- function(x, y){
x <- str_split_1(x, pattern = "_")
y <- str_split_1(y, pattern = "_")
return( length(intersect(x, y)) / length(union(x, y)) )
}
partial.match <- expand.grid(CellChat_uniprot = CC.only$CellChat_uniprot,
CellPhoneDB_uniprot = CPDB.only$CellPhoneDB_uniprot,
stringsAsFactors = FALSE)
partial.match$jaccard_index <- mapply(FUN = find_jaccard_index, partial.match[ , 1], partial.match[ , 2])
partial.match <- partial.match %>%
arrange(desc(jaccard_index)) %>%
filter(jaccard_index >= 0.5) %>%
left_join(CC.only) %>%
left_join(CPDB.only) %>%
relocate(sort(colnames(.))) %>%
distinct(CellChat_complex, .keep_all = TRUE) %>%
filter(CellChat_complex != "ACVR1B_BMPR2") # no common interaction in both CCDB & CPDB; different complex despite high similarity
rm(CC.only, CPDB.only)Proposing a new complex name for the partially matched complexes
partial.match <- partial.match %>% mutate(proposed_name = c("P00352", "P28329_Q96EP9", "P14784_P31785_Q13261",
"O60896_Q16602_Q99527", "P36897_P37173_Q04771", "P07359_P13224_P14770_P40197"))
partial.match CellChat_complex CellChat_uniprot CellPhoneDB_complex
1 RA-ALDH1A1 P00352_P00352 atRetinoicAcid_byALDH1A1
2 Ach-CHAT_SLC10A4 P28329_Q96EP9_Q96EP9 Acetylcholine_byCHAT_and_SLC10A4
3 IL15RA_IL2RB P14784_P31785_Q13261 IL15_receptor
4 CALCRL_RAMP3 O60896_Q16602 RAMP3_complex
5 ACVR1_TGFbR P36897_P37173_Q04771 TGFbeta_receptor2
6 GP complex P07359_P13224_P14770_P40197 Glycoprotein_Ib_complex
CellPhoneDB_uniprot jaccard_index proposed_name
1 P00352 1.0000000 P00352
2 P28329_Q96EP9 1.0000000 P28329_Q96EP9
3 P14784_Q13261 0.6666667 P14784_P31785_Q13261
4 O60896_Q16602_Q99527 0.6666667 O60896_Q16602_Q99527
5 P37173_Q04771 0.6666667 P36897_P37173_Q04771
6 P07359_P13224 0.5000000 P07359_P13224_P14770_P40197
Changing to proposed names in respective databases
CellChatDB
CC.int <- CC.int %>%
left_join(partial.match %>% select(CellChat_uniprot, proposed_name), by = c("ligand" = "CellChat_uniprot")) %>%
mutate(ligand = ifelse(!is.na(proposed_name), proposed_name, ligand)) %>%
select(-proposed_name) %>%
left_join(partial.match %>% select(CellChat_uniprot, proposed_name), by = c("receptor" = "CellChat_uniprot")) %>%
mutate(receptor = ifelse(!is.na(proposed_name), proposed_name, receptor),
LR = paste(ligand, receptor, sep = "|")) %>%
select(-proposed_name) CellPhoneDB
CPDB.int <- CPDB.int %>%
left_join(partial.match %>% select(CellPhoneDB_uniprot, proposed_name), by = c("ligand" = "CellPhoneDB_uniprot")) %>%
mutate(ligand = ifelse(!is.na(proposed_name), proposed_name, ligand)) %>%
select(-proposed_name) %>%
left_join(partial.match %>% select(CellPhoneDB_uniprot, proposed_name), by = c("receptor" = "CellPhoneDB_uniprot")) %>%
mutate(receptor = ifelse(!is.na(proposed_name), proposed_name, receptor),
LR = paste(ligand, receptor, sep = "|")) %>%
select(-proposed_name)
step6 <- join_summarize(CC.int, CPDB.int) # ligand-receptors = 2239
# simple ligands = 466
# simple receptors = 415
# complex ligands = 130
# complex receptors = 174
rm(step6)Save the following objects for customizing CCDB and CPDB in Part 2:
save(flipped, file = "../data/db_utils/flipped.rda")
save(CC.int, file = "../data/db_utils/CC_int.rda")
save(CPDB.int, file = "../data/db_utils/CPDB_int.rda")Finally, we have a overlap of 2,239 interactions across CCDB and CPDB:
ggvenn::ggvenn(list(CellChat = CC.int$LR, CellPhoneDB = CPDB.int$LR))