Import GMT in R
Description
Tasks accomplished:
- Read GMT file
- Export to R list object
- Filter the list by gene-set size
- Transform the list into a binary matrix (gene x gene-set: one row per gene, one column per gene-set)
Code
# Path of the GMT file to import (replace the placeholder with a real path)
file.name <- "Your File.name here"

# Read the file line by line: with "\n" as the only separator,
# 'scan' returns a character vector holding one element per line read
gmt.chv <- scan(file.name, what = character(), sep = "\n")

# Break every line into its fields; "\t" (tab) is the field separator,
# so the result is a list with one character vector per line
gmt.ls <- strsplit(x = gmt.chv, split = "\t")
# These functions extract the different elements from one GMT line that
# has already been split into its tab-separated fields.
# Field layout: 1 = gene-set ID, 2 = Description (optional), rest = genes.

# Return the gene-set ID, i.e. the first field of the line.
#
# Arguments:
#   input.chv  character vector with the fields of one GMT line
# Value:
#   the first element of 'input.chv' (NA if the vector is empty)
f.extract_slotID <- function (input.chv)
{
  input.chv[1]
}

# Return the genes of the line, dropping the leading non-gene fields.
#
# Arguments:
#   input.chv        character vector with the fields of one GMT line
#   has.description  TRUE (default) when field 2 is a Description that must
#                    be skipped together with the ID; FALSE when the genes
#                    start right after the ID
# Value:
#   character vector with the remaining fields, in their original order
#   (character(0) when the line holds no gene)
#
# The default reproduces what the script did before: both versions used to
# be defined one after the other, so the Description-aware one, evaluated
# last, was the one in effect.
# Fields are dropped by position rather than with 'setdiff', so a gene that
# happens to be spelled like the ID or the Description is not lost, and
# repeated genes are not silently removed at this stage.
f.extract_content <- function (input.chv, has.description = TRUE)
{
  n.skip <- if (isTRUE (has.description)) 2L else 1L
  # negative indices beyond the vector length are ignored, so short lines
  # simply yield an empty result
  input.chv[-seq_len (n.skip)]
}
# Final list object: one element per line of the GMT file, holding the
# extracted content and named after the extracted gene-set ID
gs.ls <- setNames(
  lapply(gmt.ls, f.extract_content),
  unlist(lapply(gmt.ls, f.extract_slotID))
)
# Pre-processing
# Drop repeated entries from a vector.
#
# Arguments:
#   input.chv  character vector (the members of one gene-set)
# Value:
#   'input.chv' with duplicated values removed, as returned by 'unique'
f.uniqueSet <- function(input.chv) {
  unique(input.chv)
}
# Remove repeated members within every gene-set
gs_u.ls <- lapply(X = gs.ls, FUN = f.uniqueSet)

# Size of every gene-set once duplicates are gone
lengths.nv <- unlist(lapply(X = gs_u.ls, FUN = length))

# Size thresholds, both inclusive: lower and upper bound
thL.n <- 7
thU.n <- 700

# Keep only the gene-sets whose size lies within [thL.n, thU.n]
gs_uf.ls <- gs_u.ls[lengths.nv >= thL.n & lengths.nv <= thU.n]
# Restriction to an external universe set 'uni.chv'
# (repeat filtering if required)
# Restrict a gene-set to the elements that belong to a reference universe.
#
# Arguments:
#   input.chv     character vector with the members of one gene-set
#   universe.chv  character vector with the universe; when omitted it falls
#                 back to the global variable 'uni.chv', so existing calls
#                 such as lapply (x, f.intersUni) keep working unchanged
# Value:
#   the elements of 'input.chv' that are also found in 'universe.chv'
#   (set intersection, so repeated values appear only once)
f.intersUni <- function (input.chv, universe.chv = uni.chv)
{
  # the default is evaluated lazily: 'uni.chv' only has to exist when the
  # caller does not supply 'universe.chv'
  intersect (input.chv, universe.chv)
}
# Apply the universe restriction to every filtered gene-set
gs_uf.ls <- lapply(X = gs_uf.ls, FUN = f.intersUni)

# Conversion to binary matrix:
# 'stack' flattens the list into a two-column data frame
# (values = set members, ind = name of the gene-set they came from);
# 'table' then cross-tabulates both columns, giving one row per member
# and one column per gene-set
gs.df <- stack(x = gs_uf.ls)
gs.tab <- table(gs.df)