Attachment 'collapse_ExpressionMatrix.py'
Download 1 #!/usr/bin/python
2 """
3 takes a GSEA expression matrix (*.gct or *.txt format) or ranked gene list (*.rnk format) and
4 - converts the Probe Set ID to a Gene Symbol by using a .chip file (if supplied by -chip option)
5 - collapses multiple probe sets for the same gene symbol using "max_probe" mode.
6
7 use cases:
8 1) replace probeSet IDs of GCT-file by corresponding gene symbols and collapse multiple genes (max_probe):
9 %prog -i input.gct -o output.gct -c Mouse430_2.chip --collapse
10
11 2) replace probeSet IDs of GCT-file by corresponding gene symbols:
12 %prog -i input.gct -o output.gct -c Mouse430_2.chip --no-collapse
13
14 3) collapse multiple Genes (max_probe) in GCT-file without replacing IDs:
15 %prog -i input.gct -o output.gct --collapse
16
17 4) replace probeSet IDs of RNK-file by corresponding gene symbols and collapse multiple genes (max_probe):
18 %prog -i input.rnk -o output.rnk -c Mouse430_2.chip --collapse --rnk
19
20 5) replace probeSet IDs of RNK-file by corresponding gene symbols:
21 %prog -i input.rnk -o output.rnk -c Mouse430_2.chip --no-collapse --rnk
22
23
24 x) fix Cytoscape Session - this is a very special case (that's why the --cys option is suppressed in the help page)
25 Situation: You have created an Enrichment Map with a non-collapsed expression file.
26 Now your expression table in the session has been filtered to only contain genes that are present
27 in one of the significant genesets ("genes_of_interest"), while just throwing out duplicates
28 originating from multiple probe sets per gene.
29
30 Solution: * determine the list of "genes_of_interest" from filtered expression table (-o, from session file)
31 * backup the original filtered expression table (from session file)
32 * collapse the full expression table (-i, GCT or TXT format) by "max_probe" (or other available modes)
33 * filter the collapsed expression table to only contain genes_of_interest
34 * write out the collapsed and filtered expression table to the same path as the original
35 filtered expression table. (in shorter TXT format)
36
37 * optional: replace Probe-Set-ID's by Symbols before collapsing
38
39
40 written 2009/2010 by Oliver Stueker <oliver.stueker@utoronto.ca>
41
42 Copyright 2009-2010 by the Baderlab (Research Group of Gary D. Bader)
43 Donnelly Centre for Cellular and Biomolecular Research
44 University of Toronto
45 Toronto, Ontario
46 CANADA
47 http://baderlab.org
48
49 $Id: collapse_ExpressionMatrix.py 507 2010-04-19 23:50:47Z revilo $
50 """
51 #from __future__ import with_statement
# Module metadata, sliced out of RCS/Subversion-style keyword strings
# (presumably expanded by the version control system on commit -- confirm).
# [9:-2] drops the leading '$Author: ' and the trailing ' $'.
__author__ = '$Author: revilo $'[9:-2]
# [11:-2] drops '$Revision: ' and ' $', leaving the revision number,
# which becomes the minor part of the version string.
__version__ = '0.' + '$Revision: 507 $'[11:-2]
# [7:17] keeps only the ten-character ISO date that follows '$Date: '.
__date__ = '$Date: 2010-04-19 19:50:47 -0400 (Mon, 19 Apr 2010) $'[7:17]
55
56 from Tkinter import *
57 import tkFileDialog, tkSimpleDialog, tkMessageBox
class ReplaceCollapseGui(Frame):
    """
    Tkinter front end for CollapseExpressionMatrix.

    The window offers two modes (radio buttons):
      - "ExpressionMatrix or Ranked List": a single input file is translated
        and/or collapsed (check boxes "Translate IDs" / "Collapse Probesets").
      - "Ranked List with Expression Matrix": a ranked list and an expression
        matrix are collapsed together; a chip file is mandatory in this mode.

    Status messages of the collapser are shown in the message box at the
    bottom of the window (see writeMessage).
    """

    # File-type filters offered by the file dialogs.  The same label is
    # repeated for every pattern, exactly as the dialogs were set up before.
    _EXPR_OR_RANK_FILETYPES = [("Supported Files (GCT, TXT, RNK)", "*.gct"),
                               ("Supported Files (GCT, TXT, RNK)", "*.txt"),
                               ("Supported Files (GCT, TXT, RNK)", "*.rnk"),
                               ("Supported Files (GCT, TXT, RNK)", "*.GCT"),
                               ("Supported Files (GCT, TXT, RNK)", "*.TXT"),
                               ("Supported Files (GCT, TXT, RNK)", "*.RNK")]
    _RANK_FILETYPES = [("Ranked List (RNK)", "*.rnk"),
                       ("Ranked List (RNK)", "*.RNK")]
    _EXPR_FILETYPES = [("Supported Files (GCT, TXT)", "*.gct"),
                       ("Supported Files (GCT, TXT)", "*.txt"),
                       ("Supported Files (GCT, TXT)", "*.GCT"),
                       ("Supported Files (GCT, TXT)", "*.TXT")]
    _CHIP_FILETYPES = [("Chip Annotation file (CHIP)", "*.chip"),
                       ("Chip Annotation file (CHIP)", "*.CHIP")]

    def __init__(self, master=None, version="0.001"):
        """
        Builds the frame inside `master` and creates all widgets.

        @param master:  parent widget (None: use the default root window)
        @param version: version string shown in the title and the footer
        """
        Frame.__init__(self, master)
        self.master.title("collapse Expression Matrix" + " v" + version)
        self.grid(sticky=N + S + E + W)
        self.version = version
        self.createWidgets()

    def createWidgets(self):
        """
        Creates the Tk variables and lays out all widgets of the window.
        """
        self.exprOrRankFileSelectorFrameText = StringVar(value="Expression Matrix or Ranked List:")
        self.inputFileName = StringVar()
        self.outputFileName = StringVar()
        self.chipFileName = StringVar()
        self.doCollapse = IntVar()
        self.doIdReplace = IntVar()
        self.suppress_null = IntVar()
        self.messages = StringVar()
        self.addExprInFileName = StringVar()
        self.addExprOutFileName = StringVar()
        self.collapseRankAndExpr = IntVar()

        self.messages.set("")
        self.debug = False

        top = self.winfo_toplevel()
        top.rowconfigure(0, weight=2)
        top.columnconfigure(0, weight=2)
        # top.minsize(width=400, height=250)

        self.columnconfigure(0, weight=0)
        self.columnconfigure(1, weight=6)
        self.columnconfigure(2, weight=0)
        self.grid(ipadx=2, ipady=1)

        self.config(relief=GROOVE)

        # Basic Mode selectors (RadioButtons)
        currentRow = 0
        radioBoxFrame = LabelFrame(self, text="Mode:")
        radioBoxFrame.columnconfigure(0, weight=0)
        radioBoxFrame.columnconfigure(1, weight=6)
        radioBoxFrame.columnconfigure(2, weight=0)
        radioBoxFrame.grid(row=currentRow, columnspan=3, sticky=E + W, ipadx=2, ipady=1, padx=2, pady=2)

        radioRankOrEpr = Radiobutton(radioBoxFrame, text="ExpressionMatrix or Ranked List", variable=self.collapseRankAndExpr, value=0, command=self.modeRadioButtonPressed)
        radioRankOrEpr.grid(row=0, column=0, sticky=W)
        radioRankAndEpr = Radiobutton(radioBoxFrame, text="Ranked List with Expression Matrix", variable=self.collapseRankAndExpr, value=1, command=self.modeRadioButtonPressed)
        radioRankAndEpr.grid(row=1, column=0, sticky=W)

        # primary File Selectors Frame:
        currentRow += 1
        primaryFileSelectorsFrame = LabelFrame(self, relief=GROOVE, labelwidget=Label(self, textvariable=self.exprOrRankFileSelectorFrameText))
        primaryFileSelectorsFrame.columnconfigure(0, weight=0)
        primaryFileSelectorsFrame.columnconfigure(1, weight=6)
        primaryFileSelectorsFrame.columnconfigure(2, weight=0)
        primaryFileSelectorsFrame.grid(row=currentRow, columnspan=3, sticky=E + W, ipadx=2, ipady=1, padx=2, pady=2)

        # File selector: Input File
        infileLabel = Label(primaryFileSelectorsFrame, text='Input File:')
        infileLabel.grid(row=0, column=0, sticky=W)
        infileBox = Entry(primaryFileSelectorsFrame, textvariable=self.inputFileName, width=40)
        infileBox.grid(row=0, column=1, sticky=E + W)
        infileButton = Button(primaryFileSelectorsFrame, text="Browse", command=self.chooseInputFile)
        infileButton.grid(row=0, column=2, sticky=E)

        # File selector: Output File
        currentRow += 1
        outfileLabel = Label(primaryFileSelectorsFrame, text='Output File:')
        outfileLabel.grid(row=1, column=0, sticky=W)
        outfileBox = Entry(primaryFileSelectorsFrame, textvariable=self.outputFileName, width=40)
        outfileBox.grid(row=1, column=1, sticky=E + W)
        outfileButton = Button(primaryFileSelectorsFrame, text="Browse", command=self.chooseOutputFile)
        outfileButton.grid(row=1, column=2, sticky=E)

        # The Checkbuttons
        currentRow += 1

        checkbuttonFrame = LabelFrame(self, relief=GROOVE, labelanchor='nw', text="Modes for single file:")
        checkbuttonFrame.grid(row=currentRow, column=0, columnspan=2, sticky=E + W, padx=2, pady=2)
        self.doCollapseCheck = Checkbutton(checkbuttonFrame, text='Collapse Probesets', variable=self.doCollapse)
        self.doCollapseCheck.select()
        self.doCollapseCheck.grid(row=0, column=0, sticky=E)

        self.doReplaceCheck = Checkbutton(checkbuttonFrame, text='Translate IDs', variable=self.doIdReplace, command=self.doReplaceButtonPressed)
        self.doReplaceCheck.select()
        self.doReplaceCheck.grid(row=0, column=1, sticky=W)

        optionFrame = LabelFrame(self, relief=GROOVE, labelanchor='nw', text="Options:")
        optionFrame.grid(row=currentRow, column=2, sticky=E + W, padx=2, pady=2)
        self.suppressNullCheck = Checkbutton(optionFrame, text='Suppress Gene "NULL"', variable=self.suppress_null)
        self.suppressNullCheck.grid(row=0, column=0, sticky=W)

        # Additional file selectors:
        currentRow += 1
        secondaryFileSelectorsFrame = LabelFrame(self, relief=GROOVE, labelanchor='nw', text="Expression-file to be collapsed by probesets:")
        secondaryFileSelectorsFrame.columnconfigure(0, weight=0)
        secondaryFileSelectorsFrame.columnconfigure(1, weight=6)
        secondaryFileSelectorsFrame.columnconfigure(2, weight=0)
        secondaryFileSelectorsFrame.grid(row=currentRow, columnspan=3, sticky=E + W, ipadx=2, ipady=1, padx=2, pady=2)

        # File Selector: additional Expression Input File
        addExprInFileLabel = Label(secondaryFileSelectorsFrame, text='Input File:')
        addExprInFileLabel.grid(row=0, column=0, sticky=W)
        self.addExprInFileBox = Entry(secondaryFileSelectorsFrame, textvariable=self.addExprInFileName, width=40, disabledbackground='#CCCCCC', state=DISABLED)
        self.addExprInFileBox.grid(row=0, column=1, sticky=E + W)
        self.addExprInFileButton = Button(secondaryFileSelectorsFrame, text="Browse", state=DISABLED, command=self.chooseAddExprInputFile)
        self.addExprInFileButton.grid(row=0, column=3, sticky=E)

        # File Selector: additional Expression Output File
        addExprOutFileLabel = Label(secondaryFileSelectorsFrame, text='Output File:')
        addExprOutFileLabel.grid(row=1, column=0, sticky=W)
        self.addExprOutFileBox = Entry(secondaryFileSelectorsFrame, textvariable=self.addExprOutFileName, width=40, disabledbackground='#CCCCCC', state=DISABLED)
        self.addExprOutFileBox.grid(row=1, column=1, sticky=E + W)
        self.addExprOutFileButton = Button(secondaryFileSelectorsFrame, text="Browse", state=DISABLED, command=self.chooseAddExprOutputFile)
        self.addExprOutFileButton.grid(row=1, column=3, sticky=E)

        # File selector: Chip-Annotation File
        currentRow += 1
        chipfileLabel = Label(self, text='Chip File:')
        chipfileLabel.grid(row=currentRow, column=0, sticky=W)

        self.chipfileBox = Entry(self, textvariable=self.chipFileName, width=40, disabledbackground='#CCCCCC')
        self.chipfileBox.grid(row=currentRow, column=1, sticky=E + W)

        self.chipfileButton = Button(self, text="Browse", command=self.chooseChipFile, state=DISABLED)
        self.chipfileButton.grid(row=currentRow, column=2, sticky=E)
        # bring the chip-file widgets in line with the "Translate IDs" box
        self.doReplaceButtonPressed()

        # Control Buttons
        currentRow += 1
        controlButtonFrame = Frame(self)
        controlButtonFrame.grid(row=currentRow, columnspan=3, sticky=E + W)
        versionLabel = Label(controlButtonFrame, text="Version: " + self.version)
        versionLabel.grid(row=0, column=0, padx=10, sticky=W)

        self.quitButton = Button(controlButtonFrame, text='Quit', command=self.quit)
        self.quitButton.grid(row=0, column=1, padx=10, sticky=W)

        self.clearButton = Button(controlButtonFrame, text='Clear', command=self.clear)
        self.clearButton.grid(row=0, column=2, padx=10, sticky=E)

        self.runButton = Button(controlButtonFrame, text="Run", command=self.run)
        self.runButton.grid(row=0, column=3, padx=10, sticky=E)

        # Message Box
        currentRow += 1
        self.rowconfigure(currentRow, minsize=200)
        messageBoxBorder = LabelFrame(self, relief=RIDGE, height=150, width=425, text="Messages:")
        messageBoxBorder.columnconfigure(0, weight=0)
        messageBoxBorder.columnconfigure(1, weight=6)
        messageBoxBorder.columnconfigure(2, weight=0)
        messageBoxBorder.grid(row=currentRow, column=0, columnspan=3, sticky=N + S + E + W, padx=5, pady=5)
        self.messageBox = Message(messageBoxBorder, textvariable=self.messages, justify=LEFT, width=415, aspect=150)
        self.messageBox.rowconfigure(0, minsize=200)
        self.messageBox.columnconfigure(0, weight=0)
        self.messageBox.columnconfigure(1, weight=6)
        self.messageBox.columnconfigure(2, weight=0)
        self.messageBox.grid(row=0, columnspan=3, sticky=NW)

    def writeMessage(self, messageText):
        """
        Appends `messageText` to the message box, keeping only the last
        10 lines, and redraws the box immediately.
        """
        textLines = self.messages.get().splitlines()
        textLines.append(messageText)
        textLines = textLines[-10:]

        self.messages.set("\n".join(textLines))
        # force a redraw so messages show up while the collapser is running
        self.messageBox.update()

    def center_window(self, w=450, h=250):
        """
        Resizes the top-level window to w x h pixels and moves it to the
        fixed screen position (50, 50).

        NOTE(review): despite its name this method never centered the
        window -- the centered coordinates were computed and then overridden
        by the fixed offset.  The effective placement is kept unchanged.
        """
        root = self.winfo_toplevel()
        root.geometry('%dx%d+%d+%d' % (w, h, 50, 50))

    def clear(self):
        """
        Empties all file name fields and switches back to single-file mode.
        """
        for field in (self.inputFileName, self.outputFileName, self.chipFileName,
                      self.addExprInFileName, self.addExprOutFileName):
            field.set("")
        self.collapseRankAndExpr.set(0)
        self.modeRadioButtonPressed()

    def _primaryFileTypes(self):
        "Returns the dialog file-type filter matching the selected mode."
        if self.collapseRankAndExpr.get() == 0:
            return self._EXPR_OR_RANK_FILETYPES
        return self._RANK_FILETYPES

    def _suggestOutputName(self, inputPath):
        """
        Derives save-dialog defaults from the path of the input file.

        @return: (initial directory, initial file name, default extension);
                 ("", "", ".TXT") if no input file has been chosen yet
        """
        if not inputPath:
            return ("", "", ".TXT")
        (inputDir, inputName) = os.path.split(inputPath)
        (stem, ext) = os.path.splitext(inputName)
        if ext == "":
            # input without extension: fall back to the default extension
            return (inputDir, stem + "_collapsed", ".TXT")
        return (inputDir, stem + "_collapsed" + ext, ext)

    def _askSaveAs(self, title, filetypes, inputPath):
        "Opens a save dialog pre-filled from `inputPath`; returns the chosen path."
        (initialDir, initialFile, defaultExt) = self._suggestOutputName(inputPath)
        options = {'title': title,
                   'filetypes': filetypes,
                   'defaultextension': defaultExt,
                   'initialfile': initialFile}
        if initialDir:
            # only pass a directory when one is known; previously an unset
            # variable was used here when no input file had been selected
            options['initialdir'] = initialDir
        return tkFileDialog.asksaveasfilename(**options)

    def chooseInputFile(self):
        "Lets the user pick the primary input file."
        filename = tkFileDialog.askopenfilename(title="Choose Input Expression Matrix or Rank file",
                                                filetypes=self._primaryFileTypes())
        self.inputFileName.set(filename)

        if self.debug:
            self.writeMessage("selected Input File Name: %s " % filename)

    def chooseOutputFile(self):
        "Lets the user pick the primary output file."
        filename = self._askSaveAs("Choose Output File",
                                   self._primaryFileTypes(),
                                   self.inputFileName.get())
        self.outputFileName.set(filename)
        if self.debug:
            self.writeMessage("selected Output File Name: %s " % filename)

    def chooseAddExprInputFile(self):
        "Lets the user pick the additional expression matrix (input)."
        filename = tkFileDialog.askopenfilename(title="Choose Additional Input Expression Matrix file",
                                                filetypes=self._EXPR_FILETYPES)
        self.addExprInFileName.set(filename)

        if self.debug:
            self.writeMessage("selected Input Expression File Name: %s " % filename)

    def chooseAddExprOutputFile(self):
        "Lets the user pick the additional expression matrix (output)."
        filename = self._askSaveAs("Choose Output Expression Marix File",
                                   self._EXPR_FILETYPES,
                                   self.addExprInFileName.get())
        self.addExprOutFileName.set(filename)
        if self.debug:
            self.writeMessage("selected Output Expression File Name: %s " % filename)

    def chooseChipFile(self):
        "Lets the user pick the chip annotation file."
        filename = tkFileDialog.askopenfilename(title="Choose Chip Annotation file",
                                                filetypes=self._CHIP_FILETYPES)
        self.chipFileName.set(filename)
        if self.debug:
            self.writeMessage("selected Chip File Name: %s " % filename)

    def doReplaceButtonPressed(self):
        "Enables the chip-file widgets only while 'Translate IDs' is checked."
        if self.doIdReplace.get() == 1:
            newState = NORMAL
        else:
            newState = DISABLED
        self.chipfileBox.configure(state=newState)
        self.chipfileButton.configure(state=newState)

    def modeRadioButtonPressed(self):
        "Adapts labels and widget states to the selected mode."
        extraWidgets = (self.addExprInFileBox, self.addExprInFileButton,
                        self.addExprOutFileBox, self.addExprOutFileButton)
        singleFileChecks = (self.doCollapseCheck, self.doReplaceCheck)

        if self.collapseRankAndExpr.get() == 1:
            # rank + expression mode: extra files and chip file are required,
            # the single-file check boxes do not apply
            self.exprOrRankFileSelectorFrameText.set('Ranked List:')
            for widget in extraWidgets:
                widget.configure(state=NORMAL)
            self.chipfileBox.configure(state=NORMAL)
            self.chipfileButton.configure(state=NORMAL)
            for widget in singleFileChecks:
                widget.configure(state=DISABLED)
        else:
            self.exprOrRankFileSelectorFrameText.set('Expression Matrix or Ranked List:')
            for widget in extraWidgets:
                widget.configure(state=DISABLED)
            for widget in singleFileChecks:
                widget.configure(state=NORMAL)
            # revert chip-file widgets to the state given by 'Translate IDs'
            self.doReplaceButtonPressed()

    def _inputError(self, message):
        "Marks the input as invalid and reports `message` in an error dialog."
        self.inputOK = False
        tkMessageBox.showerror(title="Input Error", message=message, icon=tkMessageBox.ERROR)

    def checkInput(self):
        """
        Validates the file names entered for the selected mode and shows an
        error dialog for every problem found.

        @return: True if the input is complete, False otherwise
        """
        self.inputOK = True
        rankAndExpr = self.collapseRankAndExpr.get() == 1

        if self.inputFileName.get() == "":
            self._inputError("Input-file required")
        elif not os.path.isfile(self.inputFileName.get()):
            self._inputError("Input-file does not exist")

        if self.outputFileName.get() == "":
            self._inputError("Output-file required")

        if self.doIdReplace.get() == 1 or rankAndExpr:
            if self.chipFileName.get() == "":
                self._inputError("Chip-file required")
            elif not os.path.isfile(self.chipFileName.get()):
                self._inputError("Chip-file does not exist")

        if rankAndExpr:
            if self.addExprInFileName.get() == "":
                self._inputError("Expression Matrix Input-file required")
            elif not os.path.isfile(self.addExprInFileName.get()):
                self._inputError("Expression Matrix Input-file does not exist")

            if self.addExprOutFileName.get() == "":
                self._inputError("Expression Matrix Output-file required")

        return self.inputOK

    def run(self):
        """
        Validates the input and, if it is complete, runs the collapser with
        the settings taken from the widgets.
        """
        self.writeMessage("Testing Input...")
        if not self.checkInput():
            return

        self.writeMessage("Starting....")
        if self.debug:
            print("Running...")
            print("Input File: %s" % self.inputFileName.get())
            print("Output File: %s" % self.outputFileName.get())
            print("Chip File: %s" % self.chipFileName.get())
            print("Do Collapse: %i" % self.doCollapse.get())
            print("Do Replace: %i" % self.doIdReplace.get())

        suppressNull = (self.suppress_null.get() == 1)

        if self.collapseRankAndExpr.get() == 0:
            # Only hand over the chip file when 'Translate IDs' is checked;
            # a name left in the (disabled) entry must not trigger a
            # translation the user has switched off.
            if self.doIdReplace.get() == 1:
                chipFileName = self.chipFileName.get()
            else:
                chipFileName = ""
            collapser = CollapseExpressionMatrix(inputFileName=self.inputFileName.get(),
                                                 outputFileName=self.outputFileName.get(),
                                                 chipFileName=chipFileName,
                                                 doCollapse=(self.doCollapse.get() == 1),
                                                 collapseMode='max_probe',
                                                 verbose=True,
                                                 fix_session=False,
                                                 suppress_null=suppressNull,
                                                 gui=self)
        else:
            collapser = CollapseExpressionMatrix(inputFileName=self.inputFileName.get(),
                                                 outputFileName=self.outputFileName.get(),
                                                 chipFileName=self.chipFileName.get(),
                                                 extra_expr_in=self.addExprInFileName.get(),
                                                 extra_expr_out=self.addExprOutFileName.get(),
                                                 doCollapse=True,
                                                 collapseMode='max_probe',
                                                 verbose=True,
                                                 fix_session=False,
                                                 suppress_null=suppressNull,
                                                 gui=self)

        collapser.main()
        tkMessageBox.showinfo(title="Done", message="Done. Check Message-Box for status.")
        # self.quit()
469
470 class CollapseExpressionMatrix:
471 def __init__(self,
472 inputFileName, outputFileName, chipFileName, extra_expr_in="", extra_expr_out="",
473 doCollapse=False, collapseMode="max_probe", verbose=True, fix_session=False, suppress_null=False, gui=None):
474 self.inputFileName = inputFileName
475 self.outputFileName = outputFileName
476 self.chipFileName = chipFileName
477 self.extra_expr_in = extra_expr_in
478 self.extra_expr_out = extra_expr_out
479 self.doCollapse = doCollapse
480 self.collapseMode = collapseMode
481 self.verbose = verbose
482 self.fix_session = fix_session
483 self.suppress_null = suppress_null
484 self.gui = gui
485
def printMessage(self, text):
    """
    Reports a status message.

    The text is written to stdout when no GUI is attached, otherwise it is
    handed to the GUI's writeMessage() method.

    @param text: message text (callers in this file include the trailing newline)
    """
    # identity test: a GUI object must never be mistaken for "no GUI",
    # whatever its own equality semantics are
    if self.gui is None:
        sys.stdout.write(text)
    else:
        self.gui.writeMessage(text)
491
def read_chipfile(self, chipfileName):
    """
    reads a GSEA chip annotation file (CHIP)
    and returns a dict that maps the probeset ID's to their corresponding gene symbols.

    The first line is always treated as header and skipped, as is any later
    line that starts with the standard CHIP header. Blank lines and lines
    with fewer than two tab-separated columns are ignored.

    Format:
    see http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CHIP:_Chip_file_format_.28.2A.chip.29

    @param chipfileName: path of the chip file
    @return idSymbolMap
    @raise IOError: if the file cannot be opened or read
    """
    import sys

    chip_header = "Probe Set ID\tGene Symbol\tGene Title"
    id_symbol_map = {}

    # read chip file into dict (mapping object)
    if self.verbose:
        self.printMessage("reading Chip file...\n")

    # Python 2 needs 'U' for universal newlines; Python 3 translates
    # newlines in text mode by default and no longer accepts 'U'.
    mode = "r"
    if sys.version_info[0] < 3:
        mode = "rU"

    chipfile = open(chipfileName, mode)
    try:
        is_first_line = True
        for line in chipfile:
            if is_first_line or line.startswith(chip_header):
                is_first_line = False
                continue
            # strip the line end so that a symbol in the last column
            # does not carry a trailing newline
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2 or fields[0] == "":
                continue
            id_symbol_map[fields[0]] = fields[1]
    finally:
        chipfile.close()

    return id_symbol_map
528
529
def read_inputFile(self, inputFileName):
    """
    Reads an input file and determines the type.

    Supported file Types:
    - Expression file "GCT" - three header lines, first line is always: "#1.2"
    - Expression file "TXT" - one header line, Header always starts with "NAME\tDESCRIPTION\t"
    - Ranked gene list "RNK" - two column: ID{tab}SCORE, score being numerical,
                               comment lines (starting with #) are ignored.

    @param inputFileName: path of the file to read
    @return: (inputFileLines, type)
    @raise IOError: if the file cannot be read, is empty, or matches none
                    of the supported formats
    """
    import re
    import sys

    # read expression data
    if self.verbose:
        self.printMessage("reading input file...\n")

    # Python 2 needs 'U' for universal newlines; Python 3 translates
    # newlines in text mode by default and no longer accepts 'U'.
    mode = "r"
    if sys.version_info[0] < 3:
        mode = "rU"

    infile = open(inputFileName, mode)
    try:
        inputFileLines = infile.readlines()
    finally:
        infile.close()

    if not inputFileLines:
        error_text = "Invalid Input File: '%s' \n" % inputFileName
        error_text += "\tThe file is empty\n"
        raise IOError(1, error_text)

    ## Guess the type of file:
    first_line = inputFileLines[0]
    if re.match(r"#1\.2", first_line):
        file_type = "GCT"
    elif re.match("NAME\tDESCRIPTION\t", first_line, re.IGNORECASE):
        file_type = "TXT"
    else:
        # a ranked list: every line is either a comment or ID{tab}SCORE
        re_ranks = re.compile(r"^[^\t]+\t-?\d*\.?\d+")
        for line_nr, line in enumerate(inputFileLines):
            if line.startswith("#") or re_ranks.search(line):
                continue
            error_text = "Error in line %i\n" % (line_nr + 1)
            error_text += "Invalid Input File: '%s' \n" % inputFileName
            error_text += "\tIt seems it's neither an expression file (GCT or TXT) or Ranked Gene list\n"
            error_text += "\tRefer to http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats for specifications\n"
            raise IOError(1, error_text)
        file_type = "RNK"

    if self.verbose:
        self.printMessage("...think it's %s\n" % file_type)

    return inputFileLines, file_type
584
585
def replace_IDs(self, dataLines, id_symbol_map):
    """
    replaces the IDs in the first column by symbols (based on idSymbolMap)

    Lines whose first column is not a key of the map (header lines,
    unknown IDs) are left untouched. The list is modified in place.

    @param dataLines:     list of tab-separated lines
    @param id_symbol_map: dict mapping probeset IDs to gene symbols
    @return: dataLines
    """
    # replace Probeset IDs with Gene Symbols
    if self.verbose:
        self.printMessage("replacing Probeset IDs with Gene Symbols...\n")

    for line_nr, line in enumerate(dataLines):
        tokens = line.split("\t", 1)
        # look the ID up in the dict itself (hash lookup) rather than in
        # a list of its keys, which was a linear scan for every line
        if tokens[0] in id_symbol_map:
            tokens[0] = id_symbol_map[tokens[0]]
            dataLines[line_nr] = "\t".join(tokens)
    return dataLines
609
610
def collapse_data(self, data_lines, type, mode='max_probe'):
    """
    collapses multiple expression values (or scores) per gene symbol

    Modes:
    - max_probe (default): for each sample, use the maximum expression value for the probe set.
      For example:

            Probeset_A        10    20    15    200
            Probeset_B       100   105   110     95
            ------------------------------------------
            gene_symbol_AB   100   105   110    200

      For ranked lists the score with the highest magnitude is kept.

    Values are compared numerically. Genes are written in the order of
    their first appearance in the input. The descriptions of collapsed
    expression rows are joined by a blank.

    @param data_lines: complete file content as list of lines (incl. header)
    @param type:       "GCT", "TXT" or "RNK" (as returned by read_inputFile)
    @param mode:       collapse mode; currently not evaluated, the
                       behaviour is always "max_probe"
    @return: data_lines - list with complete expression table or ranked gene list
    """
    def as_number(token):
        "Returns token as float or None if it is not numeric."
        try:
            return float(token)
        except ValueError:
            return None

    if self.verbose:
        self.printMessage("collapsing Probe-Sets...\n")

    # save header
    if type == "GCT":
        data_header = data_lines[:3]
        data_lines = data_lines[3:]
    elif type == "TXT":
        data_header = data_lines[:1]
        data_lines = data_lines[1:]
    else:
        data_header = []

    is_expression = type in ("GCT", "TXT")

    ##### BEGIN FIX SESSION CODE #####
    genes_of_interest = None
    if self.fix_session:
        # set: constant-time membership test for every data line
        genes_of_interest = set(self.genes_of_interest)
    ##### END FIX SESSION CODE #####

    # collapse
    data_map = {}
    gene_order = []     # genes in order of first appearance
    for line in data_lines:
        if type == "RNK" and line.startswith("#"):
            # don't process comment lines in RANK files
            # but rather append them to the header
            data_header.append(line)
            continue

        if line.strip() == "":
            # blank lines carry no data
            continue

        tokens = line.split("\t")
        gene = tokens[0]

        if self.suppress_null and gene.strip().upper() == 'NULL':
            if self.verbose:
                self.printMessage("dropping Gene 'NULL'...\n")
            continue

        ##### BEGIN FIX SESSION CODE #####
        if self.fix_session and gene not in genes_of_interest:
            # restrict to genes_of_interest
            continue
        ##### END FIX SESSION CODE #####

        if gene not in data_map:
            # if we hadn't had this gene before: take it
            data_map[gene] = tokens[1:]
            gene_order.append(gene)
            continue

        kept = data_map[gene]
        if is_expression:
            # case expression Table: take highest value!
            # kept[0] / tokens[1] is the description, values start behind it;
            # only columns present in both rows can be compared
            for i in range(1, min(len(kept), len(tokens) - 1)):
                new_value = as_number(tokens[i + 1])
                old_value = as_number(kept[i])
                if new_value is None:
                    continue
                if old_value is None or new_value > old_value:
                    kept[i] = tokens[i + 1]
            if kept and len(tokens) > 1:
                kept[0] = kept[0] + " " + tokens[1]
        else:
            # case rank file: take value (Score) with highest magnitude
            if abs(float(tokens[1])) > abs(float(kept[0])):
                kept[0] = tokens[1]

    # assemble new output data
    data_lines = data_header

    # restore header
    if type == "GCT":
        # calculate new dimensions of collapsed expression table;
        # strip the column count so that the line ends with exactly one
        # newline, whether or not the original line had trailing tabs
        n_columns = data_lines[1].split("\t")[1].strip()
        data_lines[1] = "\t".join([str(len(gene_order)), n_columns]) + "\n"

    # restore expression table / ranked gene list
    for gene in gene_order:
        newline = gene + "\t" + "\t".join(data_map[gene])
        if not newline.endswith("\n"):
            # e.g. value taken from a last input line without line end
            newline += "\n"
        data_lines.append(newline)

    ##### BEGIN FIX SESSION CODE #####
    if self.fix_session and type == "GCT":
        # expression tables in Session files have only one header line
        # NOTE(review): this removes only the dimensions line, so two
        # header lines remain -- confirm that this is intended.
        del data_lines[1:2]
    ##### END FIX SESSION CODE #####

    return data_lines
700
def collapse_rank_and_expr(self, rank_data, expr_data, expr_type, idSymbolMap, mode="max_probe"):
    """
    collapses a ranked gene list and the matching expression table together

    For every gene symbol the probe set with the highest score magnitude
    in the ranked list is selected. The ranked list gets this score, the
    expression table gets the expression values of the same probe set.
    The description column of the expression table names the selected
    probe set, followed by the discarded ones in parentheses.
    Genes are written in the order of their first appearance in the
    ranked list.

    Probe sets that are missing in the annotation are reported and
    skipped. A gene whose selected probe set is missing in the expression
    table is reported and left out of the expression table.

    @param rank_data:   lines of the ranked list (RNK)
    @param expr_data:   lines of the expression table incl. header
    @param expr_type:   "GCT" or "TXT"
    @param idSymbolMap: dict mapping probeset IDs to gene symbols
    @param mode:        collapse mode; currently not evaluated
    @return: (rank_data_lines, expr_data_lines)
    """
    # 1) First pass though ranks: create map:
    #    ranks_map = { "Symbol_A": [ ("probeID_A1", 'score_A1'),
    #                                ("probeID_A2", 'score_A2') ],
    #                  "Symbol_B": [ ("probeID_B1", 'score_B1') ],
    #                  ...
    #                }
    ranks_map = {}
    symbol_order = []       # symbols in order of first appearance
    seen_probes = set()
    rank_header_lines = []

    if self.verbose:
        self.printMessage('processing rank file...\n')
    for rank_line in rank_data:
        if rank_line.startswith("#"):
            rank_header_lines.append(rank_line)
            continue
        fields = rank_line.split("\t", 1)
        if len(fields) != 2:
            if rank_line.strip() != "":
                self.printMessage("WARNING: skipping malformed line in rank file '%s'\n" % self.inputFileName)
            continue
        (probeID, score) = fields
        if probeID not in idSymbolMap:
            self.printMessage("WARNING: Identifier '%s' not found in annotation file '%s'\n" % (probeID, self.chipFileName))
            continue
        if probeID in seen_probes:
            self.printMessage("WARNING: Duplicate Identifier '%s' in rank file '%s'\n         Check your Input!!!!\n" % (probeID, self.inputFileName))
            continue
        seen_probes.add(probeID)
        symbol = idSymbolMap[probeID]
        if symbol not in ranks_map:
            ranks_map[symbol] = []
            symbol_order.append(symbol)
        ranks_map[symbol].append((probeID, score))

    # 2) make map from expressions:
    #    exprs_map = { "probeID_A1": "rest of line A1",
    #                  "probeID_A2": "rest of line A2",
    #                  ...
    #                }
    exprs_map = {}
    exprs_header_lines = []

    if expr_type == "GCT":
        exprs_header_lines = expr_data[:3]
        expr_data = expr_data[3:]
    elif expr_type == "TXT":
        exprs_header_lines = expr_data[:1]
        expr_data = expr_data[1:]

    if self.verbose:
        self.printMessage('processing expressions file...\n')
    for expr_line in expr_data:
        fields = expr_line.split("\t", 2)
        if len(fields) < 3:
            if expr_line.strip() != "":
                self.printMessage("WARNING: skipping malformed line in expressions file '%s'\n" % self.extra_expr_in)
            continue
        probeID = fields[0]
        if probeID not in idSymbolMap:
            self.printMessage("WARNING: Identifier '%s' not found in annotations file '%s'\n" % (probeID, self.chipFileName))
        elif probeID in exprs_map:
            self.printMessage("WARNING: Duplicate Identifier '%s' in expressions file '%s'\n         Check your Input!!!!\n" % (probeID, self.extra_expr_in))
        else:
            exprs_map[probeID] = fields[2]

    # 3) iterate over all symbols in ranks_map,
    #        pick probeID with highest score magnitude
    #        collect this probeID for rank file (and replace ID to symbol)
    #        collect this probeID for expr file, replace ID to symbol, keep selected ID (and others) in descr. col.
    rank_data_lines = []
    expr_data_lines = []

    for symbol in symbol_order:
        if self.suppress_null and symbol.strip().upper() == 'NULL':
            if self.verbose:
                self.printMessage("dropping Gene 'NULL'...\n")
            continue

        probes = ranks_map[symbol]
        # start from the first probe set so that a gene whose scores are
        # all zero still gets a valid selection
        (best_probeID, best_score) = probes[0]
        for (probeID, score) in probes[1:]:
            if abs(float(score)) > abs(float(best_score)):
                (best_probeID, best_score) = (probeID, score)

        rank_data_lines.append("\t".join([symbol, best_score]))

        if best_probeID not in exprs_map:
            self.printMessage("WARNING: Identifier '%s' not found in expressions file '%s'\n" % (best_probeID, self.extra_expr_in))
            continue

        other_probeIDs = [probeID for (probeID, score) in probes if probeID != best_probeID]
        if other_probeIDs:
            description = best_probeID + " " + "(" + ", ".join(other_probeIDs) + ")"
        else:
            description = best_probeID
        expr_data_lines.append("\t".join([symbol, description, exprs_map[best_probeID]]))

    # restore header
    if expr_type == "GCT":
        # calculate new dimensions of collapsed expression table;
        # strip the column count so that the line ends with exactly one newline
        n_columns = exprs_header_lines[1].split("\t")[1].strip()
        exprs_header_lines[1] = "\t".join([str(len(expr_data_lines)), n_columns]) + '\n'

    return (rank_header_lines + rank_data_lines, exprs_header_lines + expr_data_lines)
809
810
def main(self):
    """Run the conversion configured on this instance and write the result.

    Steps, in order:

    1. If ``self.chipFileName`` is set, load the ID -> symbol map with
       ``self.read_chipfile``.
    2. If ``self.fix_session`` is set, store the first column of every row
       of the existing output file (except rows whose first column is
       ``NAME``) in ``self.genes_of_interest`` and rename that file to
       ``<outputFileName>.BAK``.
    3. If an extra expression input *and* output, a chip file and
       collapsing are all requested, collapse the ranked list together
       with the expression table (``self.collapse_rank_and_expr``) and
       write both files.  Otherwise read the single input file, optionally
       replace its IDs and/or collapse it, and write it to
       ``self.outputFileName``.

    Returns nothing.  Any ``IOError`` is reported through
    ``self.printMessage`` and followed by ``sys.exit(1)``.
    """
    # Imported locally: the only import of these modules visible next to
    # this method sits inside the ``__main__`` guard, so they may be
    # missing when the class is used from another module.
    import os
    import sys

    def write_lines(path, lines):
        # Re-raise open() failures with the path appended so the user can
        # tell which output file could not be created.
        try:
            handle = open(path, "w")
        except IOError as err:
            raise IOError(err.errno, "%s : %s" % (err.strerror, path))
        try:
            handle.writelines(lines)
        finally:
            handle.close()

    try:
        idSymbolMap = None
        if self.chipFileName != "":
            idSymbolMap = self.read_chipfile(self.chipFileName)

        ##### BEGIN FIX SESSION CODE #####
        if self.fix_session:
            # Collect the genes of interest from the already filtered
            # expression table before it gets replaced.
            self.genes_of_interest = []
            # "U" (universal newlines) is required on Python 2 but was
            # removed from open() in Python 3.11, where it is the default.
            read_mode = "r" if sys.version_info[0] >= 3 else "rU"
            expr_file = open(self.outputFileName, read_mode)
            try:
                for line in expr_file:
                    name = line.split("\t", 1)[0]
                    if name != "NAME":
                        self.genes_of_interest.append(name)
            finally:
                expr_file.close()
            # Keep a backup of the original expression file.
            os.rename(self.outputFileName, self.outputFileName + ".BAK")
        ##### END FIX SESSION CODE #####

        collapse_together = (self.extra_expr_in != ''
                             and self.extra_expr_out != ''
                             and self.chipFileName != ''
                             and self.doCollapse)

        if collapse_together:
            # All data is available: collapse a rank and an expression
            # file together.
            (rank_data, rnk_type) = self.read_inputFile(self.inputFileName)
            (expr_data, expr_type) = self.read_inputFile(self.extra_expr_in)

            if rnk_type != 'RNK':
                raise IOError(1, "ERROR: Wrong file type!\nInput file %s needs to be a ranked list (RNK) in this mode." % self.inputFileName)
            if expr_type not in ("GCT", "TXT"):
                raise IOError(1, "ERROR: Wrong file type!\n" +
                              "Additional input Expression-table %s needs to of type GCT or TXT but was identified as '%s'" % (self.extra_expr_in, expr_type))

            (rank_file_lines, expr_file_lines) = self.collapse_rank_and_expr(
                rank_data=rank_data,
                expr_data=expr_data,
                expr_type=expr_type,
                idSymbolMap=idSymbolMap,
                mode=self.collapseMode)

            if self.verbose:
                self.printMessage("writing RNK file...\n")
            write_lines(self.outputFileName, rank_file_lines)

            if self.verbose:
                self.printMessage("writing Expression file...\n")
            write_lines(self.extra_expr_out, expr_file_lines)

        else:
            # Single-file mode: replace IDs and/or collapse one table.
            (expr_file_lines, input_type) = self.read_inputFile(self.inputFileName)

            if self.chipFileName != "":
                expr_file_lines = self.replace_IDs(expr_file_lines, idSymbolMap)

            if self.doCollapse:
                expr_file_lines = self.collapse_data(expr_file_lines, input_type, mode=self.collapseMode)

            # Write the expression data to the output file.
            write_lines(self.outputFileName, expr_file_lines)

        self.printMessage("Done!\n")

    except IOError as err:
        # Do not unpack err.args: an IOError raised with a single message
        # has no (errno, text) pair and unpacking would raise ValueError.
        text = err.strerror if err.strerror is not None else str(err)
        # The option parser is a module global that only exists when the
        # file is executed as a script; skip the usage line otherwise
        # instead of masking the real error with a NameError.
        cli_parser = globals().get("parser")
        if cli_parser is not None:
            print(cli_parser.get_usage())
        self.printMessage(text + '\n')
        self.printMessage("exiting\n")
        sys.exit(1)
898
899
if __name__ == "__main__":
    # Script entry point: parse the command line, then either open the GUI
    # or validate the options and run CollapseExpressionMatrix.main().
    from optparse import OptionParser, SUPPRESS_HELP
    import os
    import sys

    # Configure parser for command line options.
    __usage = "%prog [options] -i input.gct -o output.gct [-c platform.chip] [--collapse]"
    # NOTE(review): main() raises a "Wrong file type" error when the -e/-x
    # mode is used with a non-RNK input, which seems to contradict the
    # sentence "...these options have no effect" below -- confirm wording.
    __description = (
        "This tool can process a gene expression matrix (in GCT or TXT format) or ranked list (RNK format)\n"
        "and either replace the Identifier based on a Chip Annotation file (e.g. AffyID -> Gene Symbol),\n"
        "or collapse the expression values or rank-scores for Genes from more than one probe set.\n"
        "Both can be done in one step by using both '-c platform.chip' and '--collapse' at the same time.\n"
        "If a ranked list is to be collapsed, an additional expression matrix can be supplied by the -e/-x parameters\n"
        "and will be filtered to contain the same probe-sets as selected from the RNK file.\n"
        "If however the file supplied by -i is not recognized as a RNK file, these options have no effect.\n"
        "\n"
        "For detailed descriptions of the file formats, please refer to: http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats \n\n"
        "Call without any parameters to select the files and options with a GUI (Graphical User Interface)"
    )
    # `parser` stays a module-level name: the error handler of
    # CollapseExpressionMatrix.main() prints its usage text.
    parser = OptionParser(usage=__usage,
                          description=__description,
                          version="%prog " + __version__)

    # --- input / output files -------------------------------------------
    parser.add_option("-i", "--input",
                      dest="infile",
                      help="input expression table or ranked list\n",
                      metavar="FILE")
    parser.add_option("-o", "--output",
                      dest="outfile",
                      help="output expression table or ranked list\n",
                      metavar="FILE")
    parser.add_option("-c", "--chip",
                      dest="chipfile",
                      default='',
                      help="Chip File\nThis implies that the Identifiers are to be replaced.",
                      metavar="FILE")
    parser.add_option("-e", "--ei",
                      dest="expr_in",
                      default='',
                      help="(optional) additional input Expression-table, to be restricted to the same probe-sets as the RNK file",
                      metavar="FILE")
    # The help text used to say "-i/--ei"; the matching input option is
    # -e/--ei.
    parser.add_option("-x", "--xo",
                      dest="expr_out",
                      default='',
                      help="(optional) corresponding output file for -e/--ei option",
                      metavar="FILE")

    # --- collapsing behaviour -------------------------------------------
    # --collapse and --no-collapse share one destination; the default
    # (False) is declared on the first of the two.
    parser.add_option("--collapse",
                      dest="collapse",
                      default=False,
                      help="Collapse multiple probe sets for the same gene symbol (max_probe)\n",
                      action="store_true")
    parser.add_option("--no-collapse",
                      dest="collapse",
                      help="Don't collapse multiple probesets\n[default]\n",
                      action="store_false")
    # Hidden from --help.
    parser.add_option("-m", "--collapse-mode",
                      dest="mode",
                      default="max_probe",
                      type="choice",
                      choices=("max_probe", "median_of_probes"),
                      help=SUPPRESS_HELP)
    parser.add_option("--null",
                      dest="suppress_null",
                      default=False,
                      help="suppress Gene with Symbol NULL\n",
                      action="store_true")

    # --- user interface ---------------------------------------------------
    parser.add_option("-g", "--gui",
                      dest="useGui",
                      action="store_true",
                      default=False,
                      help="Open a Window to choose the files and options.")
    parser.add_option("-q", "--quiet",
                      dest="verbose",
                      default=True,
                      help="be quiet\n",
                      action="store_false")
    # Hidden from --help: switches on the "fix session" code path of
    # CollapseExpressionMatrix.main().
    parser.add_option("--cys",
                      dest="fix_session",
                      default=False,
                      help=SUPPRESS_HELP,
                      action="store_true")

    (options, args) = parser.parse_args()

    # Start the GUI when it is requested explicitly or when none of
    # -i, -o, -c, --collapse was given; otherwise use the command line.
    nothing_requested = not (options.infile
                             or options.outfile
                             or options.chipfile
                             or options.collapse)
    useGui = nothing_requested or options.useGui

    if useGui:
        theGui = ReplaceCollapseGui(version=__version__)
        theGui.master.minsize(width=450, height=550)
        theGui.master.lift()
        theGui.center_window(w=450, h=550)

        theGui.mainloop()

    else:
        # Check input.  parser.error() prints the message and exits, so the
        # checks below run strictly one after another.
        if options.verbose:
            print(parser.get_usage())
        if not options.infile:
            parser.error("input-file required")
        if not os.path.isfile(options.infile):
            parser.error("input-file does not exist")
        if not options.outfile:
            parser.error("output-file required")
        if options.chipfile != "" and not os.path.isfile(options.chipfile):
            parser.error("Chip-file does not exist.")
        if options.expr_in != "" and not os.path.isfile(options.expr_in):
            parser.error("Expression-file (--ei) does not exist.")
        if options.expr_in != "" and options.expr_out == "":
            parser.error("If additional expression input file (--ei) is given, a corresponding output file (--xo) is required, too.")
        has_extra_expr = options.expr_in != "" and options.expr_out != ""
        can_filter = options.chipfile != "" and options.collapse
        if has_extra_expr and not can_filter:
            parser.error("Filtering of additional expression table (--ei/--xo) requires specifying both --chip and --collapse")

        collapser = CollapseExpressionMatrix(inputFileName=options.infile,
                                             outputFileName=options.outfile,
                                             chipFileName=options.chipfile,
                                             extra_expr_in=options.expr_in,
                                             extra_expr_out=options.expr_out,
                                             doCollapse=options.collapse,
                                             collapseMode=options.mode,
                                             verbose=options.verbose,
                                             fix_session=options.fix_session,
                                             suppress_null=options.suppress_null)
        collapser.main()
1042
1043
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.