# By: Daniele Merico (PhD), BaderLab, Donnelly CCBR, University of Toronto

# GENERAL RELEASE NOTES
# - Gene Ontology (GO), KEGG and PFAM from Bioconductor libraries
#   each species has an annotation set
# - NCI from web page, in tabular format (http://pid.nci.nih.gov/download.shtml)
#   primary data for Hs, used orthologs for other species
# - Biocarta from whichgenes (http://www.whichgenes.org/)
#   primary data for Hs, used orthologs for other species
#   * due to a quirk in the web server, this is not an updated version
# - HomoloGene from 
#   ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/

# OTHER TECHNICAL NOTES
# - ID prefixes should be always fully capitalized, 
#   as GSEA automatically converts them to capital letters

# PATH

path_root.ch <- "/Users/danielemerico/Documents/Research_Works/GeneSet_DB_2.0/"
path_ws.ch   <- paste (path_root.ch, "R_Works/R_ws/", sep = "")
path_data.ch <- paste (path_root.ch, "Core_Data/20101108", sep = "")
path_res.ch  <- paste (path_root.ch, "Results/v02_20101108/", sep = "")

# COMMON FUNCTIONS

# Convert to orthologs

f.ConvHom <- function (input.eg, taxon_in.id, taxon_out.id, HomGene.df)
	{
	groups.id <- HomGene.df$HomologyGroupID[HomGene.df$TaxonID == taxon_in.id & HomGene.df$EgID %in% input.eg]
	output.eg <- HomGene.df$EgID[HomGene.df$TaxonID == taxon_out.id & HomGene.df$HomologyGroupID %in% groups.id]
	return (output.eg)
	}

# LIBRARIES

# Update
# (suggest running in a separate workspace)

source("http://bioconductor.org/biocLite.R")

biocLite ("GO.db")			# v:2.4.5, packaged: 2010-09-23
biocLite ("KEGG.db")		# v:2.4.5, packaged: 2010-09-23
biocLite ("PFAM.db")		# v:2.4.5, packaged: 2010-09-23

biocLite ("org.Hs.eg.db")	# v:2.4.6, packaged: 2010-10-04
biocLite ("org.Mm.eg.db")	# v:2.4.6, packaged: 2010-10-04
biocLite ("org.Rn.eg.db")	# v:2.4.6, packaged: 2010-10-04

# Load

library ("GO.db")
library ("KEGG.db")
library ("PFAM.db")

library ("org.Hs.eg.db")
library ("org.Mm.eg.db")
library ("org.Rn.eg.db")

# IMPORT GO

GO_id2eg_Hs.ls <- as.list (org.Hs.egGO2ALLEGS)
GO_id2eg_Mm.ls <- as.list (org.Mm.egGO2ALLEGS)
GO_id2eg_Rn.ls <- as.list (org.Rn.egGO2ALLEGS)

GO_id2des.chv  <- unlist (lapply (as.list (GOTERM), Term))

# Prune redundancies

GO_id2eg_Hs.ls <- lapply (GO_id2eg_Hs.ls, unique)
GO_id2eg_Mm.ls <- lapply (GO_id2eg_Mm.ls, unique)
GO_id2eg_Rn.ls <- lapply (GO_id2eg_Rn.ls, unique)

# IMPORT KEGG

KEGG_id2eg_Hs.ls <- as.list (org.Hs.egPATH2EG)
KEGG_id2eg_Mm.ls <- as.list (org.Mm.egPATH2EG)
KEGG_id2eg_Rn.ls <- as.list (org.Rn.egPATH2EG)

KEGG_id2des.chv  <- unlist (as.list (KEGGPATHID2NAME))

# Add ID Prefix

names (KEGG_id2des.chv) <- paste ("KEGG:", names (KEGG_id2des.chv), sep = "")

names (KEGG_id2eg_Hs.ls) <- paste ("KEGG:", names (KEGG_id2eg_Hs.ls), sep = "")
names (KEGG_id2eg_Mm.ls) <- paste ("KEGG:", names (KEGG_id2eg_Mm.ls), sep = "")
names (KEGG_id2eg_Rn.ls) <- paste ("KEGG:", names (KEGG_id2eg_Rn.ls), sep = "")

# Checks
# (output should be always 0)

length (setdiff (names (KEGG_id2eg_Hs.ls), names (KEGG_id2des.chv)))
length (setdiff (names (KEGG_id2eg_Mm.ls), names (KEGG_id2des.chv)))
length (setdiff (names (KEGG_id2eg_Rn.ls), names (KEGG_id2des.chv)))

# Add Description Prefix

temp.names <- names (KEGG_id2des.chv)
KEGG_id2des.chv <- paste ("KEGG: ", KEGG_id2des.chv, sep = "")
names (KEGG_id2des.chv) <- temp.names
rm (temp.names)

# IMPORT PFAM

PFAM_eg2id_Hs.ls <- as.list (org.Hs.egPFAM)
PFAM_eg2id_Mm.ls <- as.list (org.Mm.egPFAM)
PFAM_eg2id_Rn.ls <- as.list (org.Rn.egPFAM)

PFAM_id2des.chv <- unlist (as.list (PFAMDE))

# Reformat eg2id to id2eg

PFAM_eg2id_Hs.ls <- lapply (PFAM_eg2id_Hs.ls, unique)
PFAM_eg2id_Mm.ls <- lapply (PFAM_eg2id_Mm.ls, unique)
PFAM_eg2id_Rn.ls <- lapply (PFAM_eg2id_Rn.ls, unique)

f.reformat <- function (eg2id.ls)
	{
	eg2id.df <- stack (eg2id.ls)
	colnames (eg2id.df) <- c ("Id", "Eg")
	id2eg.ls <- unstack (eg2id.df, form = Eg ~ Id)
	return (id2eg.ls)
	}

PFAM_id2eg_Hs.ls <- lapply (f.reformat (PFAM_eg2id_Hs.ls), unique)
PFAM_id2eg_Mm.ls <- lapply (f.reformat (PFAM_eg2id_Mm.ls), unique)
PFAM_id2eg_Rn.ls <- lapply (f.reformat (PFAM_eg2id_Rn.ls), unique)

rm (PFAM_eg2id_Hs.ls, PFAM_eg2id_Mm.ls, PFAM_eg2id_Rn.ls)

# No PFAM descriptions are missing

length (setdiff (names (PFAM_id2eg_Hs.ls), names (PFAM_id2des.chv)))
# [1] 0
length (setdiff (names (PFAM_id2eg_Mm.ls), names (PFAM_id2des.chv)))
# [1] 0
length (setdiff (names (PFAM_id2eg_Rn.ls), names (PFAM_id2des.chv)))
# [1] 0

# Add Prefix to Descriptions

temp.names <- names (PFAM_id2des.chv)
PFAM_id2des.chv <- paste ("PFAM: ", PFAM_id2des.chv, sep = "")
names (PFAM_id2des.chv) <- temp.names
rm (temp.names)

# IMPORT HOMOLOGY DATA

setwd (path_data.ch)

HomGene.df <- read.table (
				file   = "Homologene_Build65_20100910.txt",
				header = T,
				sep    = "\t",
				quote  = "",
				stringsAsFactors = F
				)

colnames (HomGene.df) <- c ("HomologyGroupID", "TaxonID", "EgID", "Symbol", "ProteinGI", "ProteinAcc")

# Taxa ID
# - Hs: 9606
# - Mm: 10090
# - Rn: 10116

# IMPORT BIOCARTA

# Read GMT

Bioc_raw_Hs.chv <- scan (
        file = "Biocarta_Hs_WhichGenes_20100326.GMT", 
        what = character(), 
        sep = "\n"
        )

Bioc_raw_Hs.ls <- strsplit (Bioc_raw_Hs.chv, "\t")

f.extract_slotID <- function (input.chv)
        {return (input.chv[1])}
f.extract_content <- function (input.chv)
        {return (setdiff (input.chv, input.chv[1]))}

Bioc_id2eg_Hs.ls <- lapply (Bioc_raw_Hs.ls, f.extract_content)
names (Bioc_id2eg_Hs.ls) <- unlist (lapply (Bioc_raw_Hs.ls, f.extract_slotID))
names (Bioc_id2eg_Hs.ls) <- sub (
							names (Bioc_id2eg_Hs.ls),
							pattern = "Bioc_",
							replacement = "BIOC:"
							)
							
Bioc_id2des.chv <- sub (
					names (Bioc_id2eg_Hs.ls),
					pattern = "PATHWAY",
					replacement = " Pathway"
					)
Bioc_id2des.chv <- sub (
					Bioc_id2des.chv,
					pattern = "BIOC:",
					replacement = " BIOC: "
					)					
names (Bioc_id2des.chv) <- names (Bioc_id2eg_Hs.ls)

# Check

length (setdiff (names (Bioc_id2eg_Hs.ls), names (Bioc_id2des.chv)))
# [1] 0

# Convert to orthologs

# Mouse

Bioc_id2eg_Mm.ls <- lapply (
					lapply (
						Bioc_id2eg_Hs.ls,
						f.ConvHom,
						9606,
						10090,
						HomGene.df
						), 
					unique
					)

# Rat 
					
Bioc_id2eg_Rn.ls <- lapply (
					lapply (
						Bioc_id2eg_Hs.ls,
						f.ConvHom,
						9606,
						10116,
						HomGene.df
						), 
					unique
					)
					

# IMPORT NCI (downloaded 08/11/2010)

# Load tabular format:
# - protein ID
# - pathway description
# - pathway ID

setwd (path_data.ch)

# Note: externally replaced
# "na&#xef;ve" --> "naive"

NCI_raw_Hs.df <- read.table (
				file  = "NCI_Hs_20101108.txt",
				sep   = "\t",
				quote = "", comment.char = "",
				header = F,
				stringsAsFactors = F 
				)
colnames (NCI_raw_Hs.df) <- c ("UniProtID", "PathwayName", "PathwayID")

# Reformat to list

NCI_id2up_Hs.ls <- unstack (NCI_raw_Hs.df, form = UniProtID ~ PathwayID)

# Generate descriptions 
# and add id and description suffix

NCI_id2des.chv <- paste ("NCI: ", NCI_raw_Hs.df$PathwayName, sep = "")
names (NCI_id2des.chv) <- paste ("NCI:", NCI_raw_Hs.df$PathwayID, sep = "")

names (NCI_id2up_Hs.ls) <- paste ("NCI:", names (NCI_id2up_Hs.ls), sep = "")

# Check

length (setdiff (names (NCI_id2up_Hs.ls), names (NCI_id2des.chv)))

# Convert UniProt to Eg

Conv_eg2uniprot.df <- stack (as.list (org.Hs.egUNIPROT))
colnames (Conv_eg2uniprot.df) <- c ("UniProt", "Eg")

f.ConvUniprot2Eg <- function (Uniprot.chv)
	{
	matching.ix <- Conv_eg2uniprot.df$UniProt %in% Uniprot.chv
	Eg.chv <- unique (as.character (Conv_eg2uniprot.df$Eg[matching.ix]))
	return (Eg.chv)
	}

NCI_id2eg_Hs.ls <- lapply (NCI_id2up_Hs.ls, f.ConvUniprot2Eg)

# Remove intermediate objects

rm (NCI_raw_Hs.df) 

# Convert to orthologs

# Mouse

NCI_id2eg_Mm.ls <- lapply (
					lapply (
						NCI_id2eg_Hs.ls,
						f.ConvHom,
						9606,
						10090,
						HomGene.df
						), 
					unique
					)

# Rat 
					
NCI_id2eg_Rn.ls <- lapply (
					lapply (
						NCI_id2eg_Hs.ls,
						f.ConvHom,
						9606,
						10116,
						HomGene.df
						), 
					unique
					)

# Capitalize IDs

names (NCI_id2eg_Hs.ls) <- toupper (names (NCI_id2eg_Hs.ls))
names (NCI_id2eg_Mm.ls) <- toupper (names (NCI_id2eg_Mm.ls))
names (NCI_id2eg_Rn.ls) <- toupper (names (NCI_id2eg_Rn.ls))

names (NCI_id2des.chv) <- toupper (names (NCI_id2des.chv))

# Check

length (setdiff (names (NCI_id2eg_Hs.ls), names (NCI_id2des.chv)))

# MERGE

U_id2eg_Hs.ls <- c (GO_id2eg_Hs.ls, KEGG_id2eg_Hs.ls, NCI_id2eg_Hs.ls, Bioc_id2eg_Hs.ls, PFAM_id2eg_Hs.ls)
U_id2eg_Mm.ls <- c (GO_id2eg_Mm.ls, KEGG_id2eg_Mm.ls, NCI_id2eg_Mm.ls, Bioc_id2eg_Mm.ls, PFAM_id2eg_Mm.ls)
U_id2eg_Rn.ls <- c (GO_id2eg_Rn.ls, KEGG_id2eg_Rn.ls, NCI_id2eg_Rn.ls, Bioc_id2eg_Rn.ls, PFAM_id2eg_Rn.ls)

U_id2des.chv  <- c (GO_id2des.chv, KEGG_id2des.chv, NCI_id2des.chv, Bioc_id2des.chv, PFAM_id2des.chv)

# Check

length (setdiff (names (U_id2eg_Hs.ls), names (U_id2des.chv)))
length (setdiff (names (U_id2eg_Mm.ls), names (U_id2des.chv)))
length (setdiff (names (U_id2eg_Rn.ls), names (U_id2des.chv)))

# PACKAGE INTO GMT

f.collapse <- function (input.genes)
	{paste (input.genes, collapse = "\t")}

f.PackGMT <- function (id2eg.ls, id2des.chv, file.name)
	{
	genes.chv  <- unlist (lapply (id2eg.ls, f.collapse))
	output.chv <- paste (names (id2eg.ls), id2des.chv[names (id2eg.ls)], genes.chv, sep = "\t")

	cat (output.chv, sep = "\n", file = file.name)
	}

setwd (path_res.ch)

f.PackGMT (
	id2eg.ls   = U_id2eg_Hs.ls,
	id2des.chv = U_id2des.chv,
	file.name  = "GO_K_NCI_BIOC_PF_Hs_eg.GMT"
	)

f.PackGMT (
	id2eg.ls   = U_id2eg_Mm.ls,
	id2des.chv = U_id2des.chv,
	file.name  = "GO_K_NCI_BIOC_PF_Mm_eg.GMT"
	)

f.PackGMT (
	id2eg.ls   = U_id2eg_Rn.ls,
	id2des.chv = U_id2des.chv,
	file.name  = "GO_K_NCI_BIOC_PF_Rn_eg.GMT"
	)

# FILTER

len_U.nv <- sapply (U_id2eg_Hs.ls, f.ulen)
U_id2eg_Hs_f10_1200.ls <- U_id2eg_Hs.ls[len_U.nv >= 10 & len_U.nv <= 1200]

# SAVE RDATA

setwd (path_ws.ch)

save (U_id2eg_Hs.ls,          U_id2des.chv, file = "Res02_1_U_Hs_nf.RData")
save (U_id2eg_Hs_f10_1200.ls, U_id2des.chv, file = "Res02_2_U_Hs_f10_1200.RData")

