How to generate gene-set collections from Bioconductor

# UPDATE LIBRARIES

source("http://bioconductor.org/biocLite.R")
biocLite ("GO.db")
biocLite ("KEGG.db")
biocLite ("org.Hs.eg.db")
biocLite ("PFAM.db")

# LOAD LIBRARIES
library ("GO.db")
library ("KEGG.db")
library ("PFAM.db")
library ("org.Hs.eg.db")


# PACKAGE GO INTO LIST
# (ID: GO:xxxxx)

GO_Hs_eg.ls <- as.list (org.Hs.egGO2ALLEGS)
GO_Hs_ID2name.chv <- unlist (lapply (as.list (GOTERM), Term))

# PACKAGE KEGG INTO LIST
# (ID: KEGG_hsa:xxxxx)

# Import

KEGG_Hs_eg.ls <- as.list (org.Hs.egPATH2EG)
KEGG_Hs_ID2name.chv <- unlist (as.list (KEGGPATHID2NAME))

# Add Prefix to IDs and Names

ztemp_names.chv <- paste (
                                                        "KEGG:",
                                                        KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)], 
                                                        sep = ""
                                                        )               
ztemp_IDs.chv   <- paste (      "KEGG:hsa",
                                                        names (KEGG_Hs_ID2name.chv[names (KEGG_Hs_eg.ls)]), 
                                                        sep = ""
                                                        )               
KEGG_Hs_ID2name.chv <- ztemp_names.chv
names (KEGG_Hs_ID2name.chv) <- ztemp_IDs.chv

names (KEGG_Hs_eg.ls) <- paste (
                                                        "KEGG:hsa",
                                                        names (KEGG_Hs_eg.ls), 
                                                        sep = ""
                                                        )

# PACKAGE PFAM INTO LIST
# (ID: KEGG_hsa:xxxxx)

# Import

PFAM_eg2pfam.ls <- as.list (org.Hs.egPFAM)
PFAM2names.chv <- unlist (as.list (PFAMDE))

# Change Format
# from EG->PFAM_ID to PFAM_ID->EG

PFAM_eg2pfam.df <- stack (PFAM_eg2pfam.ls)
colnames (PFAM_eg2pfam.df) <- c ("PFAM_ID", "EG")
PFAM_eg.ls <- unstack (PFAM_eg2pfam.df, form = EG ~ PFAM_ID)

# Warning: some PFAM entries don't have name
# some problem with the R package (?)

# UNITE

GS_eg.ls <- c (GO_Hs_eg.ls, KEGG_Hs_eg.ls, PFAM_eg.ls)
GS_ID2name.chv <- c (GO_Hs_ID2name.chv, KEGG_Hs_ID2name.chv, PFAM2names.chv)

# PACKAGE INTO GMT

f.collapse <- function (input.genes)
        {paste (input.genes, collapse = "\t")}

genes.chv  <- unlist (lapply (GS_eg.ls, f.collapse))
output.chv <- paste (names (GS_eg.ls), GS_ID2name.chv[names (GS_eg.ls)], genes.chv, sep = "\t")

cat (output.chv, sep = "\n", file = "GO_KEGG_PFAM_Hs_eg.GMT")

DanieleMerico/Code/Bioc2GMT (last edited 2010-03-02 00:38:48 by DanieleMerico)

MoinMoin Appliance - Powered by TurnKey Linux