Attachment 'gc_nucleosome_predictor.py'
Download 1 #!/usr/bin/python
2
3 ################################################################################
4 #AUTHOR: Ron Ammar
5 #DATE: February/8/2012
6 #MODIFIED: July/24/2012
7 #LAB: Bader/Giaever/Nislow Labs, Dept of Molecular Genetics
8 #USAGE: gc_nucelosome_predictor.py [-hg] <Genome FASTA File>
9 #EXAMPLE: gc_nucleosome_predictor.py -g 138 chr_all.fasta > output.txt
10 #DESCRIPTION: Uses GC content of a genome to predict nucleosome occupancy.
11 # Outputs the positions to a tab-delimited file with GC-smoothed values based
12 # on GC counts. This is done by binarizing the genome to 1 where C/G and 0
13 # where A/T and then smoothing these binary values.
14 ################################################################################
15
16 import math
17 import optparse
18 import re
19 import sys
20
21
22 ##### Parse the command line options
23 usage= "usage: %prog [-hg] <Genome FASTA File>"
24 parser= optparse.OptionParser(usage)
25 parser.add_option("-g", help="Interval length for Gaussian smoother; in bp) [138]", \
26 action="store", type="int", dest="gaussian_interval", default=138)
27 parser.add_option("--no-secondary-smoothing", \
28 help="Turn off SMA (Simple Moving Average) to smooth afterwards for less jagged graph", \
29 action="store_false", dest="secondary_smooth", default=True)
30 (options, args)= parser.parse_args()
31 if len(args) != 1:
32 parser.error("incorrect number of arguments")
33
34
35 ##### Data Values
36 previousExtremeCoverage= None
37
38
39 ##### Functions ----------------------------------------------------------------
40
41 ##### Find the Gaussian value with a certain period
42 # Returns the value as a floating point number
43 # NOTE: the AUC of a Gaussian = 1, so no need to scale it by 1/period.
44 def findGaussian(middle, coverageList, period, mean):
45 middle= int(middle) # ensure that the centre coordinate is cast as an int
46 smaStart= middle - (period / 2)
47 smaStop= middle + (period / 2)
48
49 # Edge cases (where interval falls off the list); Remove all indexes less
50 # than 0 or greater/equal to length of the list
51 if smaStart < 0:
52 smaStart= 0
53 if smaStop >= len(coverageList):
54 smaStop= len(coverageList) - 1
55
56 interval= range(smaStart, smaStop + 1)
57
58 # Apply the Gaussian
59 convolution= 0.0
60 index= -(period/2)
61 s= float(period/6) # standard deviation; division by 6 is key to encompass the Guassian in 3 std each way
62 for i in interval:
63 # ensure that all numbers are cast as floats, to avoid errors in python's
64 # integer arithmetic
65 coefficient= \
66 (1.0 / (math.sqrt(2.0 * math.pi) * s)) * (math.e ** ((((index - mean)/(2.0 * s)) ** 2.0) * -1.0))
67 convolution += coefficient * coverageList[i]
68
69 index += 1 # should end at index == period/2
70
71 return convolution
72
73
74 ##### Find the simple moving average with a certain period
75 # Returns the mean as a floating point number
76 def findSMA(middle, coverageList, period):
77 middle= int(middle) # ensure that the centre coordinate is cast as an int
78 smaStart= middle - (period / 2)
79 smaStop= middle + (period / 2)
80
81 # If the number is even, the middle coordinate cannot be in the middle, so
82 # offset the smaStop value
83 if period % 2 == 0:
84 smaStop -= 1
85
86 # Edge cases (where interval falls off the list); Remove all indexes less
87 # than 0 or greater/equal to length of the list
88 if smaStart < 0:
89 smaStart= 0
90 if smaStop >= len(coverageList):
91 smaStop= len(coverageList) - 1
92
93 interval= range(smaStart, smaStop + 1)
94
95 # Calculate the mean for this middle coordinate
96 sum= 0.0
97 for i in interval:
98 sum += coverageList[i]
99 mean= sum/len(interval)
100
101 return mean
102
103
104 ##### Iterate along the list and smooth it using simple moving
105 # average smoothing. Store the smoothed data in a list.
106 # Method can be either "SMA" or "Gaussian", defaults to Gaussian
107 # PRECONDITION: The period (aka interval or bandwidth) must be odd
108 def smooth(coverageList, period, method="Gaussian"):
109 smoothedCoverage= []
110
111 if method == "Gaussian":
112 i= 0
113 while i != len(coverageList):
114 smoothedCoverage += [findGaussian(i, coverageList, period, 0.0)]
115 i += 1
116 elif method == "SMA":
117 i= 0
118 while i != len(coverageList):
119 smoothedCoverage += [findSMA(i, coverageList, period)]
120 i += 1
121
122 return smoothedCoverage
123
124
125 ##### Smooth binarized GC data and output for each chromosome
126 def smoothChromosome(chr, gcCalls):
127 # Smooth the GC binary data (at any given position- either G/C or A/T) with
128 # a Gaussian
129 gcCallsSmoothed= smooth(gcCalls, options.gaussian_interval, "Gaussian")
130
131 # Smooth out the little bumps so that extrema identification is not affected
132 # if that is performed subsequently
133 if options.secondary_smooth:
134 gcCallsSmoothed= smooth(gcCallsSmoothed, 15, "SMA")
135
136 # Output; Uses +1 because indeces in python are 0-based
137 for index in range(0, len(gcCalls)):
138 print "\t".join([chr, str(index + 1), '%.6f' % gcCallsSmoothed[index]])
139 #-------------------------------------------------------------------------------
140
141
142 ##### Store the genome sequence
143 genomeFile= file(args[0], "r")
144 currentChromosome= ""
145 gcCalls= None
146 for line in genomeFile:
147 line= line.strip()
148
149 if re.search(r"^>", line): # in FASTA line
150 # If we just completed storing a chromosome, smooth and output that data
151 if gcCalls != None:
152 smoothChromosome(currentChromosome, gcCalls)
153
154 # Store the next chromosome
155 currentChromosome= re.split(r"[>\s]", line)[1]
156 gcCalls= []
157 else:
158 # calculate the GC
159 currentGCString= []
160 for nucleotide in line:
161 if nucleotide == "C" or nucleotide == "c" or nucleotide == "G" or nucleotide == "g":
162 currentGCString += [1]
163 else:
164 currentGCString += [0]
165
166 gcCalls += currentGCString
167
168 genomeFile.close()
169
170 # Output the final chromosomal smoothed data
171 smoothChromosome(currentChromosome, gcCalls)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.