# IMPORT ANNOTATION FROM LIBRARIES
library (org.Hs.eg.db)
Ann_eg2sy.chv <- unlist (as.list (org.Hs.egSYMBOL))
Ann_sy2eg.chv <- names (Ann_eg2sy.chv)
names (Ann_sy2eg.chv) <- Ann_eg2sy.chv
Ann_eg2name.chv <- unlist (as.list (org.Hs.egGENENAME))
# GENERATE DIFFERENTIAL EXPRESSION SCORES
# Import expression matrix in GSEA format
setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Core")
expr.df <- read.table (
file = "MCF7_ExprMx_v2.txt",
sep = "\t",
header = T,
quote = "",
stringsAsFactors = F
)
# Define function
# the score is -log(p-value) multiplied by
# (-1) for DOWN and (+1) for UP
f.t_test <- function (input.nv)
{
t.htest <- t.test (x = input.nv[1: 3], y = input.nv[4: 6], alternative = "two.sided")
logpv.n <- - log (t.htest$p.value) * sign (t.htest$statistic)
return (logpv.n)
}
# Apply function
diff_expr.mx <- matrix (ncol = 2, nrow = nrow (expr.df))
diff_expr.mx[, 1] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_12h_")], 1, f.t_test)
# apply (expr.df[, c ("E2_12h_01", "E2_12h_02", "E2_12h_03", "NT_12h_01", "NT_12h_02", "NT_12h_03")], 1, f.t_test)
diff_expr.mx[, 2] <- apply (expr.df[, grep (colnames (expr.df), pattern = "_24h_")], 1, f.t_test)
# apply (expr.df[, c ("E2_24h_01", "E2_24h_02", "E2_24h_03", "NT_24h_01", "NT_24h_02", "NT_24h_03")], 1, f.t_test)
colnames (diff_expr.mx) <- c ("diff_12h", "diff_24h")
rownames (diff_expr.mx) <- expr.df$NAME
# Make data.frame
diff_expr_1.df <- data.frame (
GeneID = rownames (diff_expr.mx),
GeneSy = Ann_eg2sy.chv [as.character (expr.df$NAME)],
GeneName = Ann_eg2name.chv[as.character (expr.df$NAME)],
stringsAsFactors = F
)
diff_expr_2.df <- as.data.frame (diff_expr.mx)
diff_expr.df <- cbind (diff_expr_1.df, diff_expr_2.df)
# IMPORT AND FORMAT HUMAN INTERACTION DATA
# Import .sif file into data.frame
int.df <- read.table (
file = "homo-sapiens.sif",
sep = "\t",
header = F,
quote = "",
stringsAsFactors = F
)
colnames (int.df) <- c ("Source", "IntType", "Target")
# Restrict to "INTERACTS_WITH"
int_ppi.df <- int.df[int.df$IntType == "INTERACTS_WITH", ]
nrow (int.df)
# 245077
nrow (int_ppi.df)
# 113203
# Convert to Entrez-gene
int_ppi_eg.df <- int_ppi.df
int_ppi_eg.df$Source <- Ann_sy2eg.chv[int_ppi.df$Source]
int_ppi_eg.df$Target <- Ann_sy2eg.chv[int_ppi.df$Target]
# Remove lines that were not completely converted
# (unconverted values are NAs)
sel.ix <- which ((! is.na (int_ppi_eg.df$Source)) & (! is.na (int_ppi_eg.df$Target)))
int_ppi_eg.df <- int_ppi_eg.df[sel.ix, ]
nrow (int_ppi_eg.df)
# [1] 109081
# lost in conversion:
113203 - 109081
# 4122 (3.6%)
# Remove duplications
# 1) order pairs
# 2) remove redundant pairs
f.orderPairs <- function (input.chv)
{
output.chv <- input.chv
sorted.chv <- sort (output.chv[c (1, 3)])
output.chv[1] <- sorted.chv[1]
output.chv[3] <- sorted.chv[2]
return (output.chv)
}
int_ppi_eg_unq.df <- unique (
as.data.frame (
t (
apply (int_ppi_eg.df, 1, f.orderPairs)
),
stringsAsFactors = F
)
)
nrow (int_ppi_eg_unq.df)
# 103588
# Restrict network to functional group of interest
GO.ls <- as.list (org.Hs.egGO2ALLEGS)
# - Replication Fork (GO:0005657)
RepF.genes <- unique (unlist (GO.ls[["GO:0005657"]]))
RepF.ix <- which ((int_ppi_eg_unq.df$Source %in% RepF.genes) & (int_ppi_eg_unq.df$Target %in% RepF.genes))
int_ppi_eg_RepF.df <- int_ppi_eg_unq.df[RepF.ix, ]
# 90% of Replication Fork genes
# are mapped to the physical interactions network
length (RepF.genes)
# 30
length (unique (c (int_ppi_eg_RepF.df$Source, int_ppi_eg_RepF.df$Target)))
# 27
# - DNA Replication (GO:0006260)
RepDNA.genes <- unique (unlist (GO.ls[["GO:0006260"]]))
RepDNA.ix <- which ((int_ppi_eg_unq.df$Source %in% RepDNA.genes) & (int_ppi_eg_unq.df$Target %in% RepDNA.genes))
int_ppi_eg_RepDNA.df <- int_ppi_eg_unq.df[RepDNA.ix, ]
# 64.2% of DNA replication genes
# are mapped to the physical interactions network
length (RepDNA.genes)
# 232
length (unique (c (int_ppi_eg_RepDNA.df$Source, int_ppi_eg_RepDNA.df$Target)))
# 149
# - APC-dependent Protein Degradation (GO:0031145)
# (full name: anaphase-promoting complex-dependent proteasomal
# ubiquitin-dependent protein catabolic process )
Prdeg.genes <- unique (unlist (GO.ls[["GO:0031145"]]))
Prdeg.ix <- which ((int_ppi_eg_unq.df$Source %in% Prdeg.genes) & (int_ppi_eg_unq.df$Target %in% Prdeg.genes))
int_ppi_eg_Prdeg.df <- int_ppi_eg_unq.df[Prdeg.ix, ]
# 92% of APC-dependent Protein Degradation
# are mapped to the physical interactions network
length (Prdeg.genes)
# 62
length (unique (c (int_ppi_eg_Prdeg.df$Source, int_ppi_eg_Prdeg.df$Target)))
# 57
# WRITE TO CYTOSCAPE NETWORK FORMAT
setwd ("C:/Users/Daniele/Documents/DM New/Università/Classes_Taught/OBI Pathway Analysis/L2_Assignment/Data_Prep")
write.table (
diff_expr.df,
sep = "\t",
quote = F,
col.names = T,
row.names = F,
file = "ES_MCF7_Diff.txt"
)
write.table (
int_ppi_eg_RepF.df,
sep = "\t",
quote = F,
col.names = T,
row.names = F,
file = "ppiNetw_RepF_eg.txt"
)
write.table (
int_ppi_eg_RepDNA.df,
sep = "\t",
quote = F,
col.names = T,
row.names = F,
file = "ppiNetw_RepDNA_eg.txt"
)
write.table (
int_ppi_eg_Prdeg.df,
sep = "\t",
quote = F,
col.names = T,
row.names = F,
file = "ppiNetw_Prdeg_eg.txt"
)