Attachment 'GeneSet_DB_v02_exec20101108.R'
Download 1 # By: Daniele Merico (PhD), BaderLab, Donnelly CCBR, University of Toronto
2
3 # GENERAL RELEASE NOTES
4 # - Gene Ontology (GO), KEGG and PFAM from Bioconductor libraries
5 # each species has an annotation set
6 # - NCI from web page, in tabular format (http://pid.nci.nih.gov/download.shtml)
7 # primary data for Hs, used orthologs for other species
8 # - Biocarta from whichgenes (http://www.whichgenes.org/)
9 # primary data for Hs, used orthologs for other species
10 # * due to a quirk in the web server, this is not an updated version
11 # - HomoloGene from
12 # ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current/
13
14 # OTHER TECHNICAL NOTES
15 # - ID prefixes should be always fully capitalized,
16 # as GSEA automatically converts them to capital letters
17
18 # PATH
19
20 path_root.ch <- "/Users/danielemerico/Documents/Research_Works/GeneSet_DB_2.0/"
21 path_ws.ch <- paste (path_root.ch, "R_Works/R_ws/", sep = "")
22 path_data.ch <- paste (path_root.ch, "Core_Data/20101108", sep = "")
23 path_res.ch <- paste (path_root.ch, "Results/v02_20101108/", sep = "")
24
25 # COMMON FUNCTIONS
26
27 # Convert to orthologs
28
29 f.ConvHom <- function (input.eg, taxon_in.id, taxon_out.id, HomGene.df)
30 {
31 groups.id <- HomGene.df$HomologyGroupID[HomGene.df$TaxonID == taxon_in.id & HomGene.df$EgID %in% input.eg]
32 output.eg <- HomGene.df$EgID[HomGene.df$TaxonID == taxon_out.id & HomGene.df$HomologyGroupID %in% groups.id]
33 return (output.eg)
34 }
35
36 # LIBRARIES
37
38 # Update
39 # (suggest running in a separate workspace)
40
41 source("http://bioconductor.org/biocLite.R")
42
43 biocLite ("GO.db") # v:2.4.5, packaged: 2010-09-23
44 biocLite ("KEGG.db") # v:2.4.5, packaged: 2010-09-23
45 biocLite ("PFAM.db") # v:2.4.5, packaged: 2010-09-23
46
47 biocLite ("org.Hs.eg.db") # v:2.4.6, packaged: 2010-10-04
48 biocLite ("org.Mm.eg.db") # v:2.4.6, packaged: 2010-10-04
49 biocLite ("org.Rn.eg.db") # v:2.4.6, packaged: 2010-10-04
50
51 # Load
52
53 library ("GO.db")
54 library ("KEGG.db")
55 library ("PFAM.db")
56
57 library ("org.Hs.eg.db")
58 library ("org.Mm.eg.db")
59 library ("org.Rn.eg.db")
60
61 # IMPORT GO
62
63 GO_id2eg_Hs.ls <- as.list (org.Hs.egGO2ALLEGS)
64 GO_id2eg_Mm.ls <- as.list (org.Mm.egGO2ALLEGS)
65 GO_id2eg_Rn.ls <- as.list (org.Rn.egGO2ALLEGS)
66
67 GO_id2des.chv <- unlist (lapply (as.list (GOTERM), Term))
68
69 # Prune redundancies
70
71 GO_id2eg_Hs.ls <- lapply (GO_id2eg_Hs.ls, unique)
72 GO_id2eg_Mm.ls <- lapply (GO_id2eg_Mm.ls, unique)
73 GO_id2eg_Rn.ls <- lapply (GO_id2eg_Rn.ls, unique)
74
75 # IMPORT KEGG
76
77 KEGG_id2eg_Hs.ls <- as.list (org.Hs.egPATH2EG)
78 KEGG_id2eg_Mm.ls <- as.list (org.Mm.egPATH2EG)
79 KEGG_id2eg_Rn.ls <- as.list (org.Rn.egPATH2EG)
80
81 KEGG_id2des.chv <- unlist (as.list (KEGGPATHID2NAME))
82
83 # Add ID Prefix
84
85 names (KEGG_id2des.chv) <- paste ("KEGG:", names (KEGG_id2des.chv), sep = "")
86
87 names (KEGG_id2eg_Hs.ls) <- paste ("KEGG:", names (KEGG_id2eg_Hs.ls), sep = "")
88 names (KEGG_id2eg_Mm.ls) <- paste ("KEGG:", names (KEGG_id2eg_Mm.ls), sep = "")
89 names (KEGG_id2eg_Rn.ls) <- paste ("KEGG:", names (KEGG_id2eg_Rn.ls), sep = "")
90
91 # Checks
92 # (output should be always 0)
93
94 length (setdiff (names (KEGG_id2eg_Hs.ls), names (KEGG_id2des.chv)))
95 length (setdiff (names (KEGG_id2eg_Mm.ls), names (KEGG_id2des.chv)))
96 length (setdiff (names (KEGG_id2eg_Rn.ls), names (KEGG_id2des.chv)))
97
98 # Add Description Prefix
99
100 temp.names <- names (KEGG_id2des.chv)
101 KEGG_id2des.chv <- paste ("KEGG: ", KEGG_id2des.chv, sep = "")
102 names (KEGG_id2des.chv) <- temp.names
103 rm (temp.names)
104
105 # IMPORT PFAM
106
107 PFAM_eg2id_Hs.ls <- as.list (org.Hs.egPFAM)
108 PFAM_eg2id_Mm.ls <- as.list (org.Mm.egPFAM)
109 PFAM_eg2id_Rn.ls <- as.list (org.Rn.egPFAM)
110
111 PFAM_id2des.chv <- unlist (as.list (PFAMDE))
112
113 # Reformat eg2id to id2eg
114
115 PFAM_eg2id_Hs.ls <- lapply (PFAM_eg2id_Hs.ls, unique)
116 PFAM_eg2id_Mm.ls <- lapply (PFAM_eg2id_Mm.ls, unique)
117 PFAM_eg2id_Rn.ls <- lapply (PFAM_eg2id_Rn.ls, unique)
118
119 f.reformat <- function (eg2id.ls)
120 {
121 eg2id.df <- stack (eg2id.ls)
122 colnames (eg2id.df) <- c ("Id", "Eg")
123 id2eg.ls <- unstack (eg2id.df, form = Eg ~ Id)
124 return (id2eg.ls)
125 }
126
127 PFAM_id2eg_Hs.ls <- lapply (f.reformat (PFAM_eg2id_Hs.ls), unique)
128 PFAM_id2eg_Mm.ls <- lapply (f.reformat (PFAM_eg2id_Mm.ls), unique)
129 PFAM_id2eg_Rn.ls <- lapply (f.reformat (PFAM_eg2id_Rn.ls), unique)
130
131 rm (PFAM_eg2id_Hs.ls, PFAM_eg2id_Mm.ls, PFAM_eg2id_Rn.ls)
132
133 # No PFAM descriptions are missing
134
135 length (setdiff (names (PFAM_id2eg_Hs.ls), names (PFAM_id2des.chv)))
136 # [1] 0
137 length (setdiff (names (PFAM_id2eg_Mm.ls), names (PFAM_id2des.chv)))
138 # [1] 0
139 length (setdiff (names (PFAM_id2eg_Rn.ls), names (PFAM_id2des.chv)))
140 # [1] 0
141
142 # Add Prefix to Descriptions
143
144 temp.names <- names (PFAM_id2des.chv)
145 PFAM_id2des.chv <- paste ("PFAM: ", PFAM_id2des.chv, sep = "")
146 names (PFAM_id2des.chv) <- temp.names
147 rm (temp.names)
148
149 # IMPORT HOMOLOGY DATA
150
151 setwd (path_data.ch)
152
153 HomGene.df <- read.table (
154 file = "Homologene_Build65_20100910.txt",
155 header = T,
156 sep = "\t",
157 quote = "",
158 stringsAsFactors = F
159 )
160
161 colnames (HomGene.df) <- c ("HomologyGroupID", "TaxonID", "EgID", "Symbol", "ProteinGI", "ProteinAcc")
162
163 # Taxa ID
164 # - Hs: 9606
165 # - Mm: 10090
166 # - Rn: 10116
167
168 # IMPORT BIOCARTA
169
170 # Read GMT
171
172 Bioc_raw_Hs.chv <- scan (
173 file = "Biocarta_Hs_WhichGenes_20100326.GMT",
174 what = character(),
175 sep = "\n"
176 )
177
178 Bioc_raw_Hs.ls <- strsplit (Bioc_raw_Hs.chv, "\t")
179
180 f.extract_slotID <- function (input.chv)
181 {return (input.chv[1])}
182 f.extract_content <- function (input.chv)
183 {return (setdiff (input.chv, input.chv[1]))}
184
185 Bioc_id2eg_Hs.ls <- lapply (Bioc_raw_Hs.ls, f.extract_content)
186 names (Bioc_id2eg_Hs.ls) <- unlist (lapply (Bioc_raw_Hs.ls, f.extract_slotID))
187 names (Bioc_id2eg_Hs.ls) <- sub (
188 names (Bioc_id2eg_Hs.ls),
189 pattern = "Bioc_",
190 replacement = "BIOC:"
191 )
192
193 Bioc_id2des.chv <- sub (
194 names (Bioc_id2eg_Hs.ls),
195 pattern = "PATHWAY",
196 replacement = " Pathway"
197 )
198 Bioc_id2des.chv <- sub (
199 Bioc_id2des.chv,
200 pattern = "BIOC:",
201 replacement = " BIOC: "
202 )
203 names (Bioc_id2des.chv) <- names (Bioc_id2eg_Hs.ls)
204
205 # Check
206
207 length (setdiff (names (Bioc_id2eg_Hs.ls), names (Bioc_id2des.chv)))
208 # [1] 0
209
210 # Convert to orthologs
211
212 # Mouse
213
214 Bioc_id2eg_Mm.ls <- lapply (
215 lapply (
216 Bioc_id2eg_Hs.ls,
217 f.ConvHom,
218 9606,
219 10090,
220 HomGene.df
221 ),
222 unique
223 )
224
225 # Rat
226
227 Bioc_id2eg_Rn.ls <- lapply (
228 lapply (
229 Bioc_id2eg_Hs.ls,
230 f.ConvHom,
231 9606,
232 10116,
233 HomGene.df
234 ),
235 unique
236 )
237
238
239 # IMPORT NCI (downloaded 08/11/2010)
240
241 # Load tabular format:
242 # - protein ID
243 # - pathway description
244 # - pathway ID
245
246 setwd (path_data.ch)
247
248 # Note: externally replaced
249 # "naïve" --> "naive"
250
251 NCI_raw_Hs.df <- read.table (
252 file = "NCI_Hs_20101108.txt",
253 sep = "\t",
254 quote = "", comment.char = "",
255 header = F,
256 stringsAsFactors = F
257 )
258 colnames (NCI_raw_Hs.df) <- c ("UniProtID", "PathwayName", "PathwayID")
259
260 # Reformat to list
261
262 NCI_id2up_Hs.ls <- unstack (NCI_raw_Hs.df, form = UniProtID ~ PathwayID)
263
264 # Generate descriptions
265 # and add id and description suffix
266
267 NCI_id2des.chv <- paste ("NCI: ", NCI_raw_Hs.df$PathwayName, sep = "")
268 names (NCI_id2des.chv) <- paste ("NCI:", NCI_raw_Hs.df$PathwayID, sep = "")
269
270 names (NCI_id2up_Hs.ls) <- paste ("NCI:", names (NCI_id2up_Hs.ls), sep = "")
271
272 # Check
273
274 length (setdiff (names (NCI_id2up_Hs.ls), names (NCI_id2des.chv)))
275
276 # Convert UniProt to Eg
277
278 Conv_eg2uniprot.df <- stack (as.list (org.Hs.egUNIPROT))
279 colnames (Conv_eg2uniprot.df) <- c ("UniProt", "Eg")
280
281 f.ConvUniprot2Eg <- function (Uniprot.chv)
282 {
283 matching.ix <- Conv_eg2uniprot.df$UniProt %in% Uniprot.chv
284 Eg.chv <- unique (as.character (Conv_eg2uniprot.df$Eg[matching.ix]))
285 return (Eg.chv)
286 }
287
288 NCI_id2eg_Hs.ls <- lapply (NCI_id2up_Hs.ls, f.ConvUniprot2Eg)
289
290 # Remove intermediate objects
291
292 rm (NCI_raw_Hs.df)
293
294 # Convert to orthologs
295
296 # Mouse
297
298 NCI_id2eg_Mm.ls <- lapply (
299 lapply (
300 NCI_id2eg_Hs.ls,
301 f.ConvHom,
302 9606,
303 10090,
304 HomGene.df
305 ),
306 unique
307 )
308
309 # Rat
310
311 NCI_id2eg_Rn.ls <- lapply (
312 lapply (
313 NCI_id2eg_Hs.ls,
314 f.ConvHom,
315 9606,
316 10116,
317 HomGene.df
318 ),
319 unique
320 )
321
322 # Capitalize IDs
323
324 names (NCI_id2eg_Hs.ls) <- toupper (names (NCI_id2eg_Hs.ls))
325 names (NCI_id2eg_Mm.ls) <- toupper (names (NCI_id2eg_Mm.ls))
326 names (NCI_id2eg_Rn.ls) <- toupper (names (NCI_id2eg_Rn.ls))
327
328 names (NCI_id2des.chv) <- toupper (names (NCI_id2des.chv))
329
330 # Check
331
332 length (setdiff (names (NCI_id2eg_Hs.ls), names (NCI_id2des.chv)))
333
334 # MERGE
335
336 U_id2eg_Hs.ls <- c (GO_id2eg_Hs.ls, KEGG_id2eg_Hs.ls, NCI_id2eg_Hs.ls, Bioc_id2eg_Hs.ls, PFAM_id2eg_Hs.ls)
337 U_id2eg_Mm.ls <- c (GO_id2eg_Mm.ls, KEGG_id2eg_Mm.ls, NCI_id2eg_Mm.ls, Bioc_id2eg_Mm.ls, PFAM_id2eg_Mm.ls)
338 U_id2eg_Rn.ls <- c (GO_id2eg_Rn.ls, KEGG_id2eg_Rn.ls, NCI_id2eg_Rn.ls, Bioc_id2eg_Rn.ls, PFAM_id2eg_Rn.ls)
339
340 U_id2des.chv <- c (GO_id2des.chv, KEGG_id2des.chv, NCI_id2des.chv, Bioc_id2des.chv, PFAM_id2des.chv)
341
342 # Check
343
344 length (setdiff (names (U_id2eg_Hs.ls), names (U_id2des.chv)))
345 length (setdiff (names (U_id2eg_Mm.ls), names (U_id2des.chv)))
346 length (setdiff (names (U_id2eg_Rn.ls), names (U_id2des.chv)))
347
348 # PACKAGE INTO GMT
349
350 f.collapse <- function (input.genes)
351 {paste (input.genes, collapse = "\t")}
352
353 f.PackGMT <- function (id2eg.ls, id2des.chv, file.name)
354 {
355 genes.chv <- unlist (lapply (id2eg.ls, f.collapse))
356 output.chv <- paste (names (id2eg.ls), id2des.chv[names (id2eg.ls)], genes.chv, sep = "\t")
357
358 cat (output.chv, sep = "\n", file = file.name)
359 }
360
361 setwd (path_res.ch)
362
363 f.PackGMT (
364 id2eg.ls = U_id2eg_Hs.ls,
365 id2des.chv = U_id2des.chv,
366 file.name = "GO_K_NCI_BIOC_PF_Hs_eg.GMT"
367 )
368
369 f.PackGMT (
370 id2eg.ls = U_id2eg_Mm.ls,
371 id2des.chv = U_id2des.chv,
372 file.name = "GO_K_NCI_BIOC_PF_Mm_eg.GMT"
373 )
374
375 f.PackGMT (
376 id2eg.ls = U_id2eg_Rn.ls,
377 id2des.chv = U_id2des.chv,
378 file.name = "GO_K_NCI_BIOC_PF_Rn_eg.GMT"
379 )
380
381 # FILTER
382
383 len_U.nv <- sapply (U_id2eg_Hs.ls, f.ulen)
384 U_id2eg_Hs_f10_1200.ls <- U_id2eg_Hs.ls[len_U.nv >= 10 & len_U.nv <= 1200]
385
386 # SAVE RDATA
387
388 setwd (path_ws.ch)
389
390 save (U_id2eg_Hs.ls, U_id2des.chv, file = "Res02_1_U_Hs_nf.RData")
391 save (U_id2eg_Hs_f10_1200.ls, U_id2des.chv, file = "Res02_2_U_Hs_f10_1200.RData")
392
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.