Differences between revisions 2 and 5 (spanning 3 versions)

CNV Enrichment Test Code

Reference

Functional impact of global rare copy number variation in autism spectrum disorders.
Pinto D, Pagnamenta AT, Klei L, Anney R, Merico D, Regan R, Conroy J, Magalhaes TR, Correia C, Abrahams BS, Almeida J, Bacchelli E, Bader GD, Bailey AJ, Baird G, Battaglia A, Berney T, Bolshakova N, Bölte S, Bolton PF, Bourgeron T, Brennan S, Brian J, Bryson SE, Carson AR, Casallo G, Casey J, Chung BH, Cochrane L, Corsello C, Crawford EL, Crossett A, Cytrynbaum C, Dawson G, de Jonge M, Delorme R, Drmic I, Duketis E, Duque F, Estes A, Farrar P, Fernandez BA, Folstein SE, Fombonne E, Freitag CM, Gilbert J, Gillberg C, Glessner JT, Goldberg J, Green A, Green J, Guter SJ, Hakonarson H, Heron EA, Hill M, Holt R, Howe JL, Hughes G, Hus V, Igliozzi R, Kim C, Klauck SM, Kolevzon A, Korvatska O, Kustanovich V, Lajonchere CM, Lamb JA, Laskawiec M, Leboyer M, Le Couteur A, Leventhal BL, Lionel AC, Liu XQ, Lord C, Lotspeich L, Lund SC, Maestrini E, Mahoney W, Mantoulan C, Marshall CR, McConachie H, McDougle CJ, McGrath J, McMahon WM, Merikangas A, Migita O, Minshew NJ, Mirza GK, Munson J, Nelson SF, Noakes C, Noor A, Nygren G, Oliveira G, Papanikolaou K, Parr JR, Parrini B, Paton T, Pickles A, Pilorge M, Piven J, Ponting CP, Posey DJ, Poustka A, Poustka F, Prasad A, Ragoussis J, Renshaw K, Rickaby J, Roberts W, Roeder K, Roge B, Rutter ML, Bierut LJ, Rice JP, Salt J, Sansom K, Sato D, Segurado R, Sequeira AF, Senman L, Shah N, Sheffield VC, Soorya L, Sousa I, Stein O, Sykes N, Stoppioni V, Strawbridge C, Tancredi R, Tansey K, Thiruvahindrapduram B, Thompson AP, Thomson S, Tryfon A, Tsiantis J, Van Engeland H, Vincent JB, Volkmar F, Wallace S, Wang K, Wang Z, Wassink TH, Webber C, Weksberg R, Wing K, Wittemeyer K, Wood S, Wu J, Yaspan BL, Zurawiecki D, Zwaigenbaum L, Buxbaum JD, Cantor RM, Cook EH, Coon H, Cuccaro ML, Devlin B, Ennis S, Gallagher L, Geschwind DH, Gill M, Haines JL, Hallmayer J, Miller J, Monaco AP, Nurnberger Jr JI, Paterson AD, Pericak-Vance MA, Schellenberg GD, Szatmari P, Vicente AM, Vieland VJ, Wijsman EM, Scherer SW, Sutcliffe JS, Betancur C
Nature. 2010 Jun 9 (Epub ahead of print)

Code

The script below calls the functions in the zipped file.

Script

# CNV GENE-SET ENRICHMENT [LITE]

# INPUT DATA

# CNV data

# 'CNV.df' is a data.frame with the following columns:
# $Class (values: "case", "control")
# $SampleID [i.e. patient identifier]
# $Chr [i.e. CNV genomic coordinate: chromosome]
# $Coord_i [i.e. CNV genomic coordinate: begin position] 
# $Coord_f [i.e. CNV genomic coordinate: end position] 
# $Length [i.e. CNV genomic coordinate: length]
# $Type (values: '0' = DEL, '1' = DEL, '3' = DUP)  
# $Genes_eg (use ';' to separate multiple values) [i.e. genes overlapped by the CNV, Entrez-gene identifier] 
# $Gene_count [i.e. number of genes in the previous field]

# 'Sample2Class.df' can be derived from 'CNV.df' (described above);
# it is a data.frame with the following columns:
# $SampleID
# $Class

# Gene sets

# 'GS2Genes.ls' is a list, with 
# - gene-set IDs as names 
# - genes (entrez-gene IDs) as slot content
# 'GS2Name.chv' is a character vector, where 
# - names are the gene-set IDs 
# - values are the gene-set descriptions

# These can be generated by parsing the GMT-formatted gene-sets at this page:
# http://baderlab.org/GeneSetDB_02

# LOAD FUNCTIONS
# - Load all functions

# Helper files that are expected to define the f.* functions called below.
# They are resolved relative to the current working directory and sourced
# in the order listed.
function.names <- c(
  "Filter_CNV_01.R",
  "CNV_Key_01.R",
  "Sample2genes_from_CNV_01.R",
  "Sample2GS_from_Sample2genes_01.R",
  "FisherTestGS_01.R",
  "Add_GSname_01.R",
  "AddGSsize_toEnrdf_01.R",
  "GeneCounts_from_CNV_01.R",
  "AddSuppGenes_toEnrdf_01.R"
)

# Fail fast, naming every missing file, instead of stopping part-way through
# the loop with only some of the functions loaded.
if (!all(file.exists(function.names))) {
  stop(
    "Cannot find function file(s) in '", getwd(), "': ",
    paste(function.names[!file.exists(function.names)], collapse = ", "),
    call. = FALSE
  )
}

for (f.name in function.names) {
  source(f.name)
}

# EXECUTE MAIN

# 1) Set parameters

# 'CNV_Limits.nv' is a named numeric vector with one limit per CNV type
# ("DEL", "DUP"); it is passed to f.Filter_CNV2gene() in step 2.
# Exactly one of the two options below should be active: assigning both in
# sequence would silently discard the first.

# Option A: keep only CNV-DEL, of any gene length
# CNV_Limits.nv <- c (DEL = +Inf, DUP = 0)

# Option B (active): keep both CNV-DEL and CNV-DUP, of any gene length
CNV_Limits.nv <- c (DEL = +Inf, DUP = +Inf)

# Add any Entrez-gene you want to remove from the analysis
# (use only for hypothesized false positives, e.g. unstable regions)
Blacklist.eg   <- ""

# Number of randomizations for FDR computation
Iter.n         <- 5

# 2) Filter
# Call f.Filter_CNV2gene() with the input CNV table, the per-type limits and
# the gene blacklist from step 1; the result is stored as 'CNV_f.df'.

CNV_f.df <- f.Filter_CNV2gene(
  CNV.df        = CNV.df,
  CNV_limits.nv = CNV_Limits.nv,
  blacklist.eg  = Blacklist.eg
)

# 3) Generate 'sample 2 gene-sets' table

# 'CNV_f.df' is replaced by the output of f.MakeCNV_key()
# (presumably the same table with a CNV key added -- confirm in CNV_Key_01.R).
CNV_f.df <- f.MakeCNV_key(CNV_f.df)

# Sample-to-genes list computed from the filtered CNV table
Samples2Genes.ls <- f.Comp_Sample2genes(CNV.df = CNV_f.df)

# Sample-to-gene-set table, built from the gene sets, the sample-to-genes
# list and the sample class assignments
Sample2GS.tab <- f.Comp_Sample2GS(
  GS2genes.ls     = GS2Genes.ls,
  Sample2genes.ls = Samples2Genes.ls,
  Sample2class.df = Sample2Class.df
)

# 4) Run Test
# f.FisherTestGS_FDR_Wrap() receives the sample-to-gene-set table, the sample
# classes and the number of randomizations ('Iter.n') set in step 1.

Enr_1.df <- f.FisherTestGS_FDR_Wrap(
  Sample2GS.tab   = Sample2GS.tab,
  Sample2class.df = Sample2Class.df,
  iter.n          = Iter.n
)

# 5) Add gene-set attributes
# 'Enr_2.df' is built in two passes over the step-4 result: first through
# f.AddGeneSize(), then through f.Add_GSname().

Enr_2.df <- f.AddGeneSize(
  GS2genes.ls = GS2Genes.ls,
  Enr.df      = Enr_1.df
)

Enr_2.df <- f.Add_GSname(
  GS2name.chv = GS2Name.chv,
  Enr.df      = Enr_2.df
)
                                        
# 6) Add support genes
#    - defined as the genes with more mutations in cases than ctrls

# Gene counts computed from the filtered CNV table
CNV_Gene.tab <- f.CompGeneCounts(CNV.df = CNV_f.df)

Enr_3.df <- f.AddSupportGenes(
  GS_enr.df       = Enr_2.df,
  Genes2class.tab = CNV_Gene.tab,
  GS2genes.ls     = GS2Genes.ls
)

# Final enrichment table of the main analysis
Enr_final.df <- Enr_3.df

# EXECUTE ANCILLARY

# 1) Compute Gene Tables
#    - these have stats by-gene

# The gene-table function lives in its own file and is loaded only here.
source("GeneTable_01.R")

# NOTE(review): 'Ann_eg2sy.chv' is not described in the INPUT DATA section;
# presumably a character vector mapping Entrez-gene IDs to gene symbols --
# confirm, and make sure it is defined before running this step.
GeneTable.ls <- f.MakeGeneTable(
  Enr.df         = Enr_final.df,
  eg2sy.chv      = Ann_eg2sy.chv,
  GeneCounts.tab = CNV_Gene.tab,
  GS2genes.ls    = GS2Genes.ls,
  CNV.df         = CNV_f.df
)