Import GMT in R

Description

Tasks accomplished:

  1. Read GMT file
  2. Export to R list object
  3. Filter the list by gene-set size
  4. Transform the list into a binary matrix (gene x gene-set: genes in rows, gene-sets in columns)

Code

# Path of the GMT file to import
file.name <- "Your File.name here"

# Read the file one line per element: with "\n" as the only separator,
# 'scan' returns every line as a single character string

gmt.chv <- scan(file = file.name, what = character(), sep = "\n")

# Split every line into its fields; within a line
# the fields are separated by "\t" (tab)

gmt.ls <- strsplit(gmt.chv, split = "\t")

# these functions extract the different elements
# from a separated line

# version without Description element
# (fields of a split line: set ID, then the members)

# Return the set ID, i.e. the first field of a split line
f.extract_slotID <- function(input.chv) {
  input.chv[1]
}

# Return the members of the set, i.e. every field after the ID.
# The ID is dropped by position and not by value (as 'setdiff' would do),
# so a member that happens to be equal to the ID is kept.
# Duplicated members are still collapsed, as 'setdiff' did.
f.extract_content <- function(input.chv) {
  unique(input.chv[-1])
}

# version with Description element (not used)
# (fields of a split line: set ID, description, then the members)
# NOTE: these definitions reuse the names of the version without
# Description, so whichever pair is evaluated last is the one in effect

# Return the set ID, i.e. the first field of a split line
f.extract_slotID <- function(input.chv) {
  input.chv[1]
}

# Return the members of the set, i.e. every field after the ID and the
# description. Both are dropped by position and not by value (as 'setdiff'
# would do), so a member equal to the ID or to the description is kept.
# Duplicated members are still collapsed, as 'setdiff' did.
f.extract_content <- function(input.chv) {
  unique(input.chv[-(1:2)])
}

# Final list object
        
# One element per line of the file: the content extracted from the line,
# named after the ID extracted from the same line
gs.ls <- setNames(
  lapply(gmt.ls, f.extract_content),
  unlist(lapply(gmt.ls, f.extract_slotID))
)

# Pre-processing

# Collapse the repeated entries of a vector,
# keeping the first occurrence of each value
f.uniqueSet <- function(input.chv) {
  unique(input.chv)
}

# Remove the duplicated entries within every set
gs_u.ls <- lapply(gs.ls, f.uniqueSet)

# Size (number of distinct entries) of every set
lengths.nv <- unlist(lapply(gs_u.ls, length))

# Size thresholds (upper and lower), both inclusive
thU.n <- 700
thL.n <- 7

# Keep only the sets whose size lies within [thL.n, thU.n]
gs_uf.ls <- gs_u.ls[lengths.nv >= thL.n & lengths.nv <= thU.n]

# Restriction to an external universe set 'uni.chv'
# (re-apply the size filter above afterwards if required,
# since the intersection can only shrink the gene-sets)

# Restrict a set to the elements that are present in a universe set.
#   input.chv    : vector with the elements of the set
#   universe.chv : vector with the universe; when omitted it falls back to
#                  the variable 'uni.chv', looked up at call time starting
#                  from the environment where this function is defined
#                  (so existing one-argument calls keep working)
# Returns the elements common to both vectors, without duplicates.
f.intersUni <- function(input.chv, universe.chv = uni.chv) {
  intersect(input.chv, universe.chv)
}

# Apply the universe restriction to every filtered set
gs_uf.ls <- lapply(X = gs_uf.ls, FUN = f.intersUni)

# Conversion to binary matrix

# Long format, one row per element of every set: column 'values' holds
# the element and column 'ind' the name of the set it comes from
gs.df <- stack(gs_uf.ls)

# Cross-tabulate the two columns: rows follow 'values', columns follow 'ind'
gs.tab <- table(gs.df)

DanieleMerico/Code/Read_GMT (last edited 2010-03-10 20:26:31 by DanieleMerico)

MoinMoin Appliance - Powered by TurnKey Linux