How to generate gene-set collections from Bioconductor
Description
This script exports gene annotations relating genes to Gene Ontology, the pathway database KEGG and the protein domain resource PFAM. Use the final part of the script to export these in the GMT format, a gene-set file format that is used by the enrichment analysis tool GSEA.
Code
# UPDATE LIBRARIES
source("http://bioconductor.org/biocLite.R")
biocLite ("GO.db")
biocLite ("KEGG.db")
biocLite ("org.Hs.eg.db")
biocLite ("PFAM.db")
# LOAD LIBRARIES
library ("GO.db")
library ("KEGG.db")
library ("PFAM.db")
library ("org.Hs.eg.db")
# PACKAGE GO INTO LIST
# (ID: GO:xxxxx)
GO_Hs_eg.ls <- as.list (org.Hs.egGO2ALLEGS)
GO_Hs_ID2name.chv <- unlist (lapply (as.list (GOTERM), Term))
# PACKAGE KEGG INTO LIST
# (ID: KEGG_hsa:xxxxx)
# Import
KEGG_Hs_eg.ls <- as.list (org.Hs.egPATH2EG)
KEGG_Hs_ID2name.chv <- unlist (as.list (KEGGPATHID2NAME))
# Add Prefix to IDs and Names
ztemp_names.chv <- paste (
"KEGG:",
KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)],
sep = ""
)
ztemp_IDs.chv <- paste (
"KEGG:hsa",
names (KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)]),
sep = ""
)
KEGG_Hs_ID2name.chv <- ztemp_names.chv
names (KEGG_Hs_ID2name.chv) <- ztemp_IDs.chv
names (KEGG_Hs_eg.ls) <- paste (
"KEGG:hsa",
names (KEGG_Hs_eg.ls),
sep = ""
)
# PACKAGE PFAM INTO LIST
# (ID: KEGG_hsa:xxxxx)
# Import
PFAM_eg2pfam.ls <- as.list (org.Hs.egPFAM)
PFAM2names.chv <- unlist (as.list (PFAMDE))
# Change Format
# from EG->PFAM_ID to PFAM_ID->EG
PFAM_eg2pfam.df <- stack (PFAM_eg2pfam.ls)
colnames (PFAM_eg2pfam.df) <- c ("PFAM_ID", "EG")
PFAM_eg.ls <- unstack (PFAM_eg2pfam.df, form = EG ~ PFAM_ID)
# Warning: some PFAM entries don't have name
# some problem with the R package (?)
# UNITE
GS_eg.ls <- c (GO_Hs_eg.ls, KEGG_Hs_eg.ls, PFAM_eg.ls)
GS_ID2name.chv <- c (GO_Hs_ID2name.chv, KEGG_Hs_ID2name.chv, PFAM2names.chv)
# PACKAGE INTO GMT
f.collapse <- function (input.genes)
{paste (input.genes, collapse = "\t")}
genes.chv <- unlist (lapply (GS_eg.ls, f.collapse))
output.chv <- paste (names (GS_eg.ls), GS_ID2name.chv[names (GS_eg.ls)], genes.chv, sep = "\t")
cat (output.chv, sep = "\n", file = "GO_KEGG_PFAM_Hs_eg.GMT")