Import GMT in R
Description
Tasks accomplished:
- Read GMT file
- Export to R list object
- Filter the list by gene-set size
- Transform the list into a binary matrix (gene x gene-set, i.e. genes in rows and gene-sets in columns; transpose with t() to get gene-set x gene)
Code
# Import a GMT gene-set file into R.
#
# Every line of a GMT file is tab-separated:
#   <set name> <description> <gene 1> <gene 2> ...
#
# Objects left in the workspace:
#   gmt.chv    - raw lines of the file (character vector)
#   gmt.ls     - list with the tab-separated fields of every line
#   gs.ls      - named list: one character vector of genes per gene-set
#   gs_u.ls    - as gs.ls, duplicated genes removed within each set
#   gs_uf.ls   - sets kept by the size filter, restricted to 'uni.chv'
#   gs.df      - long data frame from stack(): 'values' (gene), 'ind' (set)
#   gs.tab     - contingency table: genes in rows, gene-sets in columns

file.name <- "Your File.name here"

# Read the file ------------------------------------------------------------

# 'scan' with sep = "\n" reads every line as one element of a character
# vector.
gmt.chv <- scan(file = file.name, what = character(), sep = "\n")

# 'strsplit' separates the fields collapsed in each line; "\t" is the
# field separator.
gmt.ls <- strsplit(gmt.chv, "\t")

# Field extractors ---------------------------------------------------------

# Return the gene-set identifier: the first field of a split line.
f.extract_slotID <- function(input.chv) {
  input.chv[1]
}

# Return the gene fields of a split line.
#
# input.chv: character vector with the fields of one GMT line.
# n.header:  number of leading non-gene fields to drop.
#            2 = set name + Description (the default);
#            1 = set name only (file without a Description field).
#
# Fields are dropped by position, not by value, so a gene whose symbol
# happens to equal the set name or the description is not lost.
f.extract_content <- function(input.chv, n.header = 2L) {
  input.chv[seq_along(input.chv) > n.header]
}

# Final list object --------------------------------------------------------

# For a file without Description, use:
#   lapply(gmt.ls, f.extract_content, n.header = 1L)
gs.ls <- lapply(gmt.ls, f.extract_content)
names(gs.ls) <- vapply(gmt.ls, f.extract_slotID, character(1))

# Pre-processing -----------------------------------------------------------

# Remove genes listed more than once within a set.
f.uniqueSet <- function(input.chv) {
  unique(input.chv)
}

gs_u.ls <- lapply(gs.ls, f.uniqueSet)
lengths.nv <- lengths(gs_u.ls)

# Keep gene-sets whose size lies within [thL.n, thU.n] (bounds inclusive).
thU.n <- 700
thL.n <- 7
keep.lv <- lengths.nv >= thL.n & lengths.nv <= thU.n
gs_uf.ls <- gs_u.ls[keep.lv]

# Restriction to an external universe set 'uni.chv' ------------------------
# 'uni.chv' must be defined before this point. Sets can shrink here, so
# repeat the size filtering afterwards if required.

# Keep only the genes of a set that belong to the universe.
#
# universe.chv defaults to the workspace object 'uni.chv', so existing
# one-argument calls keep working; pass it explicitly to avoid relying on
# a global variable.
f.intersUni <- function(input.chv, universe.chv = uni.chv) {
  intersect(input.chv, universe.chv)
}

gs_uf.ls <- lapply(gs_uf.ls, f.intersUni, universe.chv = uni.chv)

# Conversion to binary matrix ----------------------------------------------

# stack() yields one row per (gene, gene-set) pair: column 'values' holds
# the gene and column 'ind' the set name.
gs.df <- stack(gs_uf.ls)

# table() cross-tabulates the columns in order, so genes ('values') are the
# rows and gene-sets ('ind') the columns; use t(gs.tab) for the transpose.
gs.tab <- table(gs.df)