#acl All:read DanieleMerico:write,delete,revert = CNV Enrichment Test Code = == Reference Paper == '''Functional impact of global rare copy number variation in autism spectrum disorders.'''<
>Pinto D, Pagnamenta AT, Klei L, Anney R, '''Merico D''', Regan R, et al. <
>[[http://www.nature.com/nature/journal/vaop/ncurrent/abs/nature09146.html|Nature. 2010 Jun 9 (Epub ahead of print)]] == Brief Description == This test was designed to assess which gene-sets are hit by CNVs, comparing cases to controls. It is a "self-contained" type test; unlike "competitive" tests, it does not rely on a gene-level statistics, but directly constructs a statistic at the gene-set level. It requires an input table connecting CNVs to patients (cases and controls). Please see the code below for a more detailed description of the input requirements. Essential description of the test procedure: * CNVs can occur both in control and case patients (about with the same frequency for the Autism study) * whenever a gene is affected by a CNV, we hypothesize every gene-set including that gene will (have a non-zero probability to) be "perturbed" * every patient contributes to the gene-set perturbation score as follows <
> * 0 = no CNV-affected gene in this gene-set for this patient * 1 = one or more CNV-affected genes in this gene-set for this patient * note: perturbation scores can be statistically treated as counts * we perform a Fisher's Exact Test on perturbation scores, comparing the case to the control class <
> we specifically test if perturbations are significantly higher in cases than in controls, for every gene-set * we also compute an "empirical" FDR (False Discovery Rate), by permuting patient-to-case/control associations For a more detailed description, please refer to the [[http://www.nature.com/nature/journal/v466/n7304/abs/nature09146.html#/supplementary-information|paper supplementaries]] == Code == The script below calls the functions in the zipped file at the bottom. === Script === {{{ #!rscript numbers=off # CNV GENE-SET ENRICHMENT [LITE] # (by DANIELE MERICO, Sept 2010) # INPUT DATA # CNV data # 'CNV.df' is a data.frame with the following columns: # $Class (values: "case", "control") # $SampleID [i.e. patient identifier] # $Chr [i.e. CNV genomic coordinate: chromosome] # $Coord_i [i.e. CNV genomic coordinate: begin position] # $Coord_f [i.e. CNV genomic coordinate: end position] # $Length [i.e. CNV genomic coordinate: length] # $Type (values: '0' = DEL, '1' = DEL, '3' = DUP) # $Genes_eg (use ';' to separate multiple values) [i.e. genes overlapped by the CNV, Entrez-gene identifier] # $Gene_count [i.e. number of genes in the previous field] # 'Sample2Class.df' can be extracted from the previous: # it is a data.frame with the following columns: # $SampleID # $Class # Gene sets # 'GS2Genes.ls' is a list, with # - gene-set IDs as names # - genes (entrez-gene IDs) as slot content # 'GS2Name.chv' is a character vector, where # - names are the gene-set IDs # - values are the gene-set descriptions # These can be generated by parsing the GMT-formatted gene-sets at this page: # http://baderlab.org/GeneSetDB_02 # - NB: don't forget filtering for size; # we originally discarded gene-sets smaller than 5 genes and larger than 700 genes # LOAD FUNCTIONS # - Load all functions function.names <- c ( "Filter_CNV_01.R", "CNV_Key_01.R", "Sample2genes_from_CNV_01.R", "Sample2GS_from_Sample2genes_01.R", "FisherTestGS_01.R", "Add_GSname_01.R", "AddGSsize_toEnrdf_01.R", "GeneCounts_from_CNV_01.R", "AddSuppGenes_toEnrdf_01.R" ) for (f.name in function.names) {source (f.name)} # EXECUTE MAIN # 1) Set parameters # To keep only CNV-DEL, of any gene length CNV_Limits.nv <- c (+Inf, 0) names (CNV_Limits.nv) <- c ("DEL", "DUP") # To keep both CNV-DEL and CNV-DUP, of any gene length CNV_Limits.nv <- c (+Inf, +Inf) names (CNV_Limits.nv) <- c ("DEL", "DUP") # Add any Entrez-gene you want to remove from the analysis # (use only for hypothesized false positives, e.g. instable regions) Blacklist.eg <- "" # Number of randomizations for FDR computation Iter.n <- 5 # 2) Filter CNV_f.df <- f.Filter_CNV2gene ( CNV.df = CNV.df, CNV_limits.nv = CNV_Limits.nv, blacklist.eg = Blacklist.eg ) # 3) Generate 'sample 2 gene-sets' table CNV_f.df <- f.MakeCNV_key (CNV_f.df) Samples2Genes.ls <- f.Comp_Sample2genes (CNV.df = CNV_f.df) Sample2GS.tab <- f.Comp_Sample2GS ( GS2genes.ls = GS2Genes.ls, Sample2genes.ls = Samples2Genes.ls, Sample2class.df = Sample2Class.df ) # 4) Run Test Enr_1.df <- f.FisherTestGS_FDR_Wrap ( Sample2GS.tab = Sample2GS.tab, Sample2class.df = Sample2Class.df, iter.n = Iter.n ) # 5) Add gene-set attributes Enr_2.df <- f.AddGeneSize ( GS2genes.ls = GS2Genes.ls, Enr.df = Enr_1.df ) Enr_2.df <- f.Add_GSname ( GS2name.chv = GS2Name.chv, Enr.df = Enr_2.df ) # 6) Add support genes # - defined as the genes with more mutations in cases than ctrls CNV_Gene.tab <- f.CompGeneCounts (CNV.df = CNV_f.df) Enr_3.df <- f.AddSupportGenes ( GS_enr.df = Enr_2.df, Genes2class.tab = CNV_Gene.tab, GS2genes.ls = GS2Genes.ls ) Enr_final.df <- Enr_3.df # EXECUTE ANCILLARY # 1) Compute Gene Tables # - these have stats by-gene source ("GeneTable_01.R") GeneTable.ls <- f.MakeGeneTable ( Enr.df = Enr_final.df, eg2sy.chv = Ann_eg2sy.chv, GeneCounts.tab = CNV_Gene.tab, GS2genes.ls = GS2Genes.ls, CNV.df = CNV_f.df ) }}} === Functions === * [[attachment:Pinto2010_Code.zip|Functions (zip file)]]