attachment:IlluminaHT12_V4_differential_expression.R of VeroniqueVoisin/Intranet/Protocol

Attachment 'IlluminaHT12_V4_differential_expression.R'

   1 #############################################################################
   2 #        R CODE FOR ILLUMINA BEADCHIPS ANALYSIS USING LUMI AND LIMMA        #
   3 #############################################################################
   4 
   5 ## YOU NEED 3 FILES TO RUN THIS SCRIPT ###
   6 
   7 # 1. Metadata consisting of sample names and array barcodes  (see instructions on how to create this document in help on format).
   8 
   9 # 2. Annotation file (see example in the help on format document)
  10 
  11 # 3. Illumina microarray raw data (see example in the help on format document)
  12 
  13 # see tips on how to run this code in the help on format document
  14 
  15 #########################################################################
  16 ######### PART OF THE CODE (VARIABLES) THAT YOU NEED TO MODIFY ##########
  17 #########################################################################
  18 
  19 ### NOTE: if you followed exactly the format of the 3 input files, you don't need to modify anything else in the code, just run the code line by line. 
  20 
  21 #The following variables need to be assigned inside the code, e.g.,
  22 #- workDirectory <- "C:/Users/changjiang/Documents/ut/LA02"
  23 #- metadataFileName <- "SampleBarcode_minus_65055.txt"     
  24 #- IlluminaDataFileName <- "sample_probe_nonnorm_FinalReport.txt"
  25 #- annotationFileName <- "HumanHT-12_V4_0_R2_15002873_B.txt"
  26 
  27 workDirectory <- "/Users/veroniquevoisin/Documents/Laurie Ailles/LA03/DEBUG_CODE"
  28 metadataFileName <- "SampleBarcode.txt"
  29 annotationFileName <- "HumanHT-12_V4_0_R2_15002873_B.txt"
  30 IlluminaDataFileName <- "sample_probe_nonnorm_FinalReport.txt"
  31 group1 <- "T"  # e.g "T"
  32 group2 <- "E" # e.g "C"
  33 designType = "paired"  #choice between "unpaired" or "paired"
  34 
  35 ### NOT IMPLEMENTED YET: do not change
  36 chiptype <- 'illumina_humanht_12_v4' ##### list number of choice here
  37 species <- 'human'  ### choices are 'human' or 'mouse'
  38 
  39 ### it is possible to add 1 extra confounding factor e.g tumor location, technical batch, tumor grade, male or female: the additional factor is an optional column in metadata called "variableX" and is incorporated in the model fitting. 
  40 
  41  ########################################################################
  42 ###############     R PACKAGES AND WORK DIRECTORY       ################
  43 ########################################################################
  44 
  45 ###  note : you just need to install the packages (also called library) once but YOU NEED to load the libraries each time you opened your workspace
  46 
  47 sessionInfo() # give you information about package versions and R versions that you used, information that you may need for publication purpose.
  48 
  49 #(1) install packages
  50 #(2) load packages
  51 #(3) set work directory
  52 
  53 ##Install packages
  54 #source("http://bioconductor.org/biocLite.R")
  55 #biocLite("BiocUpgrade")
  56 #biocLite()   #This installs a number of base packages
  57 #biocLite("limma")
  58 #biocLite("lumi")
  59 #####biocLite("affy")
  60 ######biocLite("preprocessCore") #In BioC software repository
  61 #biocLite("gplots")
  62 #biocLite("scatterplot3d")
  63 ####biocLite("lumiHumanAll.db")
  64 #biocLite("ggplot2")
  65 #biocLite("biomaRt")
  66 
  67 #update.packages(repos = biocinstallRepos())
  68 #install.packages("mgcv")
  69 
  70 ##Load library
  71 library(lumi)
  72 library(limma)
  73 library(scatterplot3d)
  74 #library(affy)
  75 #library(preprocessCore)
  76 library(gplots)
  77 library(gplots2)
  78 
  79 
  80 ##work directory
  81 
  82 ###
  83 setwd(workDirectory)
  84 #date <- date()
  85 #date <- gsub(" ", "", date)
  86 #temp = paste0(group1, "vs", group2, "_", date)
  87 subDir <- "temp"
  88 dir.create(file.path(workDirectory, subDir), showWarnings = FALSE)
  89 getwd()
  90 dir()
  91 
  92 ########################################################################
  93 ###############    METADATA: SAMPLE AND ARRAY BARCODES     #############
  94 ########################################################################
  95 
  96 
  97 ##Read metadata file
  98 
  99 sample_barcode <- read.table(metadataFileName, header=TRUE, stringsAsFactors=FALSE)
 100 dim(sample_barcode) #36  3
 101 sample_barcode
 102 head(sample_barcode)
 103 tail(sample_barcode)
 104 
 105 ##unique sample ID: sample_barcode$SampleNumber
 106 if (nrow(sample_barcode) != length(unique(sample_barcode$SampleNumber)))#36
 107  { print( "warning: your metadata contains non unique sample name")}
 108  
 109  
 110 ##add patient, marker, and chip
 111 patient <- unlist(lapply(strsplit(sample_barcode$sampleName, split="_"), function(s) s[1]))
 112 sample_barcode$patient <- patient
 113 table(sample_barcode$patient)
 114 
 115 #populations (markers)
 116 #marker <- unlist(lapply(strsplit(sample_barcode$sampleName, split="_"), function(s) s[2]))
 117 marker <- gsub(".*_", "", sample_barcode$sampleName)
 118 sample_barcode$marker <- marker
 119 table(sample_barcode$marker)
 120 
 121 #chips
 122 chip <- unlist(lapply(strsplit(sample_barcode$ArrayBarcode, split="_"), function(s) s[1]))
 123 chip <- factor(chip)
 124 table(chip)
 125 chip
 126 table(chip)
 127 sample_barcode$chip <- chip
 128 
 129 sample_barcode 
 130 
 131 ########################################################################
 132 ###############     PRE_PROCESSING USING LUMI PACKAGE       ############
 133 ########################################################################
 134 
 135 library(lumi)
 136 #(1) Read raw intensity data
 137 #(2) Normalization (In practice, this step should be after quality control.)
 138 #(3) Quality control (before and after normalization)
 139 
 140 ####################
 141 #(1) Read raw data
 142 ####################
 143 
 144 ##Illumina data file name
 145 datalumi <- lumiR(IlluminaDataFileName) # read the data
 146 pData(datalumi) # check if the samples have been loaded corectly
 147 
 148 ##change column names using sampleName in sample_barcode
 149 if (all(colnames(datalumi) %in% sample_barcode$ArrayBarcode) ) {
 150     indx <- match(colnames(datalumi), sample_barcode$ArrayBarcode)
 151     newSampleNames <- sample_barcode$sampleName
 152     colnames(datalumi)[!is.na(indx)] <- newSampleNames[indx[!is.na(indx)]]
 153     colnames(datalumi)
 154 } else {
 155     print("ERROR at change column names, check array barcode")
 156 }   
 157 
 158 ##summary of the raw data
 159 pData(datalumi)         # check the output to see whether the column names have been changed
 160 datalumi@QC             # display the quality control information of the LumiBatch object
 161 
 162 ####################
 163 #(2) Normalization
 164 ####################
 165 
 166 ##Variance Stabilization & Quantile normalization
 167 ##From raw Illumina probe intensities to expression values 
 168 ##return a processed LumiBatch object
 169 
 170 datalumiQN <- lumiExpresso(datalumi, QC.evaluation=TRUE, varianceStabilize.param=list(method='log2'), normalize.param = list(method='quantile'))
 171 summary(datalumiQN, 'QC')
 172 pData(datalumiQN)
 173 
 174 
 175 ###################################################################################
 176 ###############    QUALITY CONTROL PLOTS  ###########################
 177 #####################################################################################
 178 
 179 
 180 ##boxplot and density plot of both raw and normalized intensities on log2 scale
 181 pdf("temp/density_boxplot.pdf", width=11, height=11) # to save as the plots as pdf
 182 #png("temp/density_boxplot.png", width = 1.5*480, height = 1.5*480) # to save the plots as png
 183 par(mfrow=c(2,2), cex=0.8) # to create 4 panels
 184 plot(datalumi, what='density', main="Raw intensity on log2 scale", addLegend=F)  
 185 plot(datalumi, what='boxplot', main="Raw intensity on log2 scale")
 186 plot(datalumiQN, what='density', main="Quantile normalized data", addLegend=F)
 187 plot(datalumiQN, what='boxplot', main="Quantile normalized data") 
 188 dev.off() # to save an image on the local computer
 189 
 190 ##Pairs plot of sample intensities in an ExpressionSet object 
 191 #pairs(log2(ave.sig[sample(1:nrow(ave.sig), 5000), c(1,10)] + 1), pch=16, cex=0.5, lower.panel = NULL)
 192 #pairs(datalumi[, indx]) 
 193 indx <- c(1, 3) #a pair of two samples
 194 plot(datalumi[, indx], what='pair') # before normalization
 195 plot(datalumiQN[, indx], what='pair') # after normalization
 196 
 197 ##MA Plots
 198 indx <- c(1:2, 4:5) #any 4 samples
 199 MAplot(datalumi[, indx], subset=NULL) # before normalization
 200 MAplot(datalumiQN[, indx], subset=NULL) # after normalization
 201 
 202 
 203 ##############################################
 204 #############  CLUSTERING  ###################
 205 ##############################################
 206 
 207 ##1. Based on large coefficient of variance (mean / standard variance)
 208 ##Estimate the sample relations based on selected probes (the probes are selected if their coefficient of variance (mean / standard variance) are greater a threshold). 
 209 ##Two methods can be used: MDS (Multi-Dimensional Scaling) or hierarchical clustering methods (same as hclust). 
 210 
 211 #pdf("temp/sampleRelation.pdf")          
 212 #png("temp/sampleRelation.png")           
 213 cvTh <- 0.1  #a threshold of coefficient of variance
 214 plotSampleRelation(datalumiQN, cv.Th = cvTh, main=paste0("Clustering based on probes with sd/mean > ", cvTh))
 215 dev.off()
 216 
 217 ##2. Detect the outlier 
 218 ##The current outlier detection is based on the distance from the sample to the center (average of all samples after removing 10 percent samples farthest away from the center).
 219 
 220 pdf("temp/detectOutlier.pdf")
 221 temp <- detectOutlier(datalumiQN, ifPlot=TRUE) 
 222 dev.off()
 223 temp <- detectOutlier(datalumiQN, ifPlot=FALSE)
 224 any(temp) ##FALSE - none of the samples is outlier
 225 
 226 ##3. Clustering for the rows, in this case samples clustering, Euclidean Distance used here
 227 dataExprs <- exprs(datalumiQN) # transform the lumi object into a data.frame (table)
 228 tmp <- hclust(dist(t(dataExprs)),method="average")
 229 pdf("temp/hclust.pdf")
 230 #png("temp/hclustwithchip.png") #816 pixels = 8.5 inches
 231 plot(tmp, xlab=paste0(ncol(dataExprs), " samples"))
 232 dev.off()
 233 
 234 ##4. PCA (principal component analysis)
 235 #dataExprs <- exprs(datalumiQN) # transform the lumi object into a data.frame (table)
 236 clusteringPCA <- prcomp(t(dataExprs), scale=T)
 237 summary(clusteringPCA)
 238 
 239 marker <- factor(sample_barcode$marker)
 240 levels(marker) <- 1:length(levels(marker))
 241 markercolors <- palette(rainbow(length(levels(marker))))
 242 
 243 pdf("temp/pca2comps.pdf")
 244 plot(clusteringPCA$x[,1:2],col=markercolors[marker],pch=19,main="PCA")
 245 legend("topright", levels(factor(sample_barcode$marker)), col=markercolors, pch=19)
 246 dev.off()
 247 
 248 library(scatterplot3d)
 249 pdf("temp/pca3comps.pdf")
 250 scatterplot3d(clusteringPCA$x[,1:3],color=markercolors[marker],pch=19,main="PCA")
 251 legend("topleft", levels(factor(sample_barcode$marker)), col=markercolors, pch=19)
 252 dev.off()
 253 
 254 
 255 ####################
 256 #(3.3) Present counts
 257 ####################
 258 ## - Estimate or choose a threshold of detection p-values
 259 ## - A probe is present if its detection pvalue is less than or equal to the threshold. 
 260 ## - The probes that are absent in each sample should be removed for the analysis of differential expression.
 261 
 262 ##1. Retrieving the detection pvalues
 263 #identical(detection(datalumiQN), detection(datalumi))
 264 pvalues <- detection(datalumi)
 265 class(pvalues)
 266 pvalues[1:4,]
 267 
 268 ##boxplot
 269 pdf("temp/pvaluesBoxplot.pdf")
 270 par(mar=c(9,4,4,2))
 271 boxplot(pvalues, las=2, cex.axis = 0.75) 
 272 med_m <- apply(pvalues, 2, median) #median
 273 h <- round(max(med_m), digits=4)   #max median
 274 abline(h=h, lty=2, col=2)
 275 axis(2, at=h, h, cex.axis = 0.75, col.axis = "red", las=1)
 276 mtext(" max", side=4, cex = 0.75, col = "red", las=1, at=h)
 277 h <- round(quantile(c(as.matrix(pvalues)), 0.50), digits=4) #50%-quantile of all p-values
 278 abline(h=h, lty=2, col=2)
 279 axis(2, at=h, h, cex.axis = 0.75, col.axis = "red", las=1)
 280 mtext(" 50thQ", side=4, cex = 0.75, col = "red", las=1, at=h)
 281 title("Detection p-values")
 282 dev.off()
 283 
 284 
 285 ##2. Detection p-value threshold
 286 
 287 ##(1) estimate FDR using hisgram counts of the p-values
 288 ##Formula: FDR(x) = FP/(FP+TP) = n0*sum(p <= x)/sum(n[p <= x])
 289 ##  p - a vector of all detection p-values
 290 ##  x - a given p-value
 291 ##  n - counts of hisgram of p-values (i.e., p)
 292 ##  n0 - estimated average count of histogram of the absent probes' p-values (under H0), e.g., n0 = mean(n[p > 0.5]), the trimmed mean.
 293 
 294 nbreaks <- 1e3 #or length(c(pvalues))/1000
 295 f <- hist(c(pvalues), breaks = nbreaks, plot = F)
 296 
 297 n <- f$counts
 298 n0 <- mean(n[f$mids > 0.5])
 299 FDR <- sapply(f$mids, function(p) n0*sum(f$mids <= p)/sum(n[f$mids <= p]))
 300 max(f$mids[FDR <= 0.05]) #0.0435
 301 detection.p.th <- max(f$mids[FDR <= 0.1]) #FDR <= 0.1
 302 detection.p.th #is the detection threshold that has been calculated and that will be used to remove absent probes
 303 
 304 plot(FDR, f$mids, type="l") 
 305 abline(0, 1, col="gray")
 306 
 307 
 308 ##3. Estimate the present count of each gene (probe)
 309 ##the number of the samples that a probe presents 
 310 ##A probe is present if its detection pvalue is less than or equal to detection.p.th, a given threshold. 
 311 ##same as, rowSums(pvalues <= detection.p.th) 
 312 presentCount <- detectionCall(datalumiQN, Th = detection.p.th) # number of probes in each samples that are called present.
 313 
 314 ##histogram of the present counts: 
 315 ##The histogram represents a frequency of the probes that are present in the same number of samples.
 316 ##The lower end represents the number of probes that have detection-pvalues > detection.p.th in all samples (PresentCount=0)
 317 ##The upper end represents the probes that have detection p-values <= detection.p.th in all samples
 318 ##To speed up the processing and reduce false positives, remove the unexpressed genes, that is, remove all probes for which presentCount=0
 319 
 320 pdf("temp/hist_presentCount.pdf")
 321 hist(presentCount, main=paste0("Histogram of PresentCount of Probes at a Threshold of ", detection.p.th), cex.main=0.85, xlab="Number of samples", breaks=c(-1, 0:max(presentCount)))
 322 dev.off()
 323 sum(presentCount == 0) #6167
 324 sum(presentCount == ncol(datalumi)) #10912
 325 
 326 
 327 
 328 ##################################################################################################
 329 ###############      Differential expression (DE) analysis using limma package        ############
 330 ##################################################################################################
 331 
 332 
 333 
 334 #(1) data filtering: unexpressed genes (probes) and sample(s) (such as technical replicates, irrelevant patients, ...)
 335 #(2) model fitting for unpaired modelDesign (tests)
 336 #(3) model fitting for paired modelDesign (tests)
 337 #model fitting: 
 338 #fit  - lmFit, modelDesign <- model.matrix(...)
 339 #fit1 - contrasts.fit, contrast.matrix <- makeContrasts(...)
 340 #fit2 - eBayes
 341 #Question:
 342 #In the data filtering, which should we do first, filtering genes or samples? 
 343 #But the order does not matter if we give a fixed detection pvalue threshold.
 344 
 345 
 346 ###################
 347 #Annotation file 
 348 ###################
 349 
 350 
 351 ##Read annotation file from the analysis folder
 352 annotOriginal <- read.delim(annotationFileName, header = TRUE, sep = "\t", quote="\"", dec=".",fill = TRUE, comment.char="",skip=8, stringsAsFactors = FALSE)
 353 dim(annotOriginal) #48240    28
 354 head(annotOriginal)
 355 
 356 annotOriginal$Definition <- gsub("Homo sapiens", "",  annotOriginal$Definition)
 357 
 358 ##The row names of data must be in the annotation file, that is, the column Array_Address_Id!!!
 359 if (!all(rownames(datalumi) %in% annotOriginal$Array_Address_Id)){ print("some probes are not in the annotation file, please check that you are using the right annotation file")} # TRUE 
 360 
 361 ##extract the annotation file of the probes in the data
 362 anno <- annotOriginal[annotOriginal$Array_Address_Id %in% rownames(datalumi), ]
 363 dim(anno) #47323    28
 364 any(is.na(anno$Array_Address_Id)) #[1] FALSE
 365 rownames(anno) <- anno$Array_Address_Id
 366 table(anno$Species)
 367 head(anno)
 368 # Homo sapiens ILMN Controls 
 369 #        47231            92 
 370 
 371 ##combining annotations of interest and normalized expressions
 372 ##Notes:
 373 ##- The column of Array_Address_Id in the annotation data contains the unique probe IDs in the Illumina expression data.
 374 ##- The column of Probe_Id in the annotation data is not the same as the probe IDs in the Illumina expression data.
 375 #normlized expressions
 376 dataExprs <- exprs(datalumiQN)
 377 dim(dataExprs) #47323    36
 378 
 379 #annotations of interest
 380 annotationIncluded <- c("RefSeq_ID", "Entrez_Gene_ID", "Symbol", "Probe_Id", "Array_Address_Id", "Probe_Sequence", "Definition")
 381 if (!all(rownames(dataExprs) %in% anno$Array_Address_Id)){print("normalized data is not matching the annotation file")} #[1] TRUE
 382 indx <- NULL
 383 indx <- match(rownames(dataExprs), anno$Array_Address_Id)
 384 anno.exprs <- cbind(anno[indx, annotationIncluded], dataExprs)
 385 dim(anno.exprs) #47323    43
 386 identical(rownames(anno.exprs), as.character(anno.exprs$Array_Address_Id)) #[1] TRUE
 387 identical(rownames(anno.exprs), rownames(anno)[indx]) #[1] TRUE
 388 head(anno.exprs)
 389 
 390 filenm <- "temp/annotated_normalized_data"
 391 write.table(anno.exprs, file=paste0(filenm, ".txt"), quote=FALSE, sep="\t", row.names = FALSE)  
 392 
 393 ##remove all variables that will not be used subsequently.
 394 #Only "sample_barcode", "anno", "datalumi", and "datalumiQN" are needed for the analysis of differetial expression.
 395 #ls()
 396 #rm( list = setdiff(ls(), c("sample_barcode", "anno", "datalumi", "datalumiQN", "detection.p.th")) )
 397 #ls()
 398 
 399 
 400 ################################################################
 401 # Calculation differential expression (DE) between markers
 402 ################################################################
 403 
 404 ###################
 405 #I. Data filtering
 406 ###################
 407 
 408 ####  groups included in the comparison
 409 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(dataExprs))
 410 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(dataExprs))
 411 sample_included <- c(sample_group1, sample_group2)
 412 
 413 ## groups should only include paired samples
 414 if (designType =="paired")
 415 {
 416  dupPatient <- sample_barcode[ sample_barcode$marker %in% c(group1,group2) , ];
 417  dupPatient <- dupPatient$patient[duplicated(dupPatient$patient)];
 418  list <- NULL
 419  for ( i in 1:length(dupPatient))
 420  {
 421     list <- c(list,grep( paste0(dupPatient[i], "*") , colnames(dataExprs)))
 422  }
 423  sample_included <- sample_included[sample_included %in% list]
 424 };
 425 sample_included
 426 
 427 #filter unexpressed (absent) genes using present counts estimated by detection pvalue threshold
 428 
 429 ##normalized expressions
 430 dataExprs <- exprs(datalumiQN) # transform the lumi object in a data.frame
 431 dim(dataExprs) #47323    36
 432 
 433 #(1) presentCount estimated by detection.p.th 
 434 #### appply the present count only on groups included in the comparison in case normalized data includes different tissue types
 435 identical(colnames(datalumiQN), colnames(dataExprs))
 436 presentCount <- detectionCall(datalumiQN[ , sample_included], Th = detection.p.th) 
 437 head(presentCount)
 438 if (!identical(names(presentCount), rownames(dataExprs))) {print("names present count should have been identical to normalized expression names")} #TRUE
 439 
 440 
 441 #(3) filtering expressions 
 442 dataf <- dataExprs[presentCount > 0, sample_included ] 
 443 head(dataf)
 444 colnames(dataf)
 445 dim(dataf) #41156     9 #41156    33 #41156    36
 446 
 447 samplef <- sample_barcode[sample_included  , ]
 448 setequal(samplef$sampleName, colnames(dataf))  #[1] TRUE
 449 identical(samplef$sampleName, colnames(dataf)) #[1] TRUE. So orders are also identical.
 450 dim(samplef)
 451 
 452 samplef
 453 
 454 ###################
 455 #II. Model fittings
 456 ###################
 457 
 458 #Testing differential expression (DE) between markers
 459 #- unpaired design: in the case that the effects (mean) of patients are the same.
 460 #model fittings: 
 461 #fit  - lmFit, modelDesign <- model.matrix(...)
 462 #fit1 - contrasts.fit, contrast.matrix <- makeContrasts(...)
 463 #fit2 - eBayes
 464 
 465 
 466 patient <- factor(samplef$patient)
 467 table(patient)
 468 marker <- factor(samplef$marker)
 469 markerNames <- levels(marker)
 470 table(marker)
 471 chip <- factor(samplef$chip) 
 472 table(chip)
 473 variableX <- 0
 474 if (length(factor(samplef$variableX)) != 0 )
 475 variableX <- factor(samplef$variableX)
 476 
 477 ####if paired , shoud I remove other samples
 478 
 479 if (sum(samplef$variableX ) != nrow(samplef) )
 480 {
 481   if (designType == "paired") modelDesign <- model.matrix(~ 0 + marker + patient + variableX + chip) 
 482   if (designType == "unpaired") modelDesign <- model.matrix(~ 0 + marker +  variableX + chip) #not include an intercept.
 483 }
 484 if (sum(samplef$variableX ) == nrow(samplef) )
 485 {
 486   if (designType == "paired") modelDesign <- model.matrix(~ 0 + marker + patient + chip) 
 487   if (designType == "unpaired") modelDesign <- model.matrix(~ 0 + marker +  chip) #not include an intercept. 
 488 }
 489 
 490 dim(modelDesign)
 491 colnames(modelDesign)
 492 fit <- lmFit(dataf, modelDesign) 
 493 
 494 #(2) contrast fitting
 495 marker_group1 <- paste("marker",  group1, sep="")
 496 marker_group2 <- paste("marker",  group2, sep="")
 497 contrastnm <- paste(marker_group1, "-", marker_group2, sep="") #contrast between a pair of markers  #HARD CODED
 498 contrast.matrix <- makeContrasts(contrasts=contrastnm, levels=modelDesign)
 499 contrast.matrix
 500 fit1 <- contrasts.fit(fit, contrast.matrix)
 501 
 502 #(3) eBayes fitting
 503 fit2 <- eBayes(fit1)
 504 
 505 ##Mean Expressions for different marker 
 506 #produce a list of means for each marker (grouped by a marker)
 507 indx <- 1:length(marker)
 508 meangrp_list <- tapply(indx, marker[indx], function(i) rowMeans(dataf[, i])) 
 509 dim(meangrp_list)
 510 meangrp <- sapply(meangrp_list, function(x) x) #changed as a matrix
 511 dim(meangrp) #41156     3
 512 colnames(meangrp) <- paste0("marker", names(meangrp_list))
 513 head(meangrp)
 514 class(meangrp)
 515 logFC2 <- meangrp[,  c(paste0("marker", group1))] - meangrp[,  c(paste0("marker", group2))];
 516 logFC2 <- as.matrix(logFC2);
 517 
 518 ##Annotations need to be included
 519 annotationIncluded <- c("RefSeq_ID", "Entrez_Gene_ID", "Symbol", "Probe_Id", "Array_Address_Id", "Probe_Sequence", "Definition")
 520 
 521 topfit <- topTable(fit2, number=nrow(dataf), adjust="BH")   
 522 colnames(topfit)
 523 head(topfit)
 524 
 525 topfit <- topfit[, !(colnames(topfit) %in% c("B"))]
 526 head(topfit)
 527 dim(topfit)
 528 
 529 rownames(topfit) <- topfit$ID
 530 topID <- rownames(topfit)
 531 all(rownames(dataf) %in% rownames(meangrp))
 532 all(rownames(dataf) %in% anno$Array_Address_Id)
 533 all(rownames(topfit) %in% anno$Array_Address_Id)
 534 stopifnot( all(topID %in% rownames(meangrp)) ) #TRUE
 535 stopifnot( all(topID %in% anno$Array_Address_Id) ) #TRUE
 536 
 537 
 538 #combining annotations, mean expressions, and topfit 
 539 indx <- NULL
 540 indx <- match(topID, anno$Array_Address_Id)
 541 anno.topfit <- cbind(anno[indx, annotationIncluded], meangrp[topID, c(marker_group1, marker_group2)], logFC2[topID,], topfit)
 542 head(meangrp)
 543 
 544 #add the normalized data for the two markers
 545 indx <- NULL
 546 indx <- match(topID, rownames(dataf)) # rownames(dataf) is array id
 547 anno.topfit.dat <- cbind(anno.topfit, dataf[indx, ])
 548 
 549 
 550 ##########################################################
 551 ###### update annotation using the R biomaRt package #####
 552 ##########################################################
 553 
 554 library(biomaRt)
 555 mart = useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
 556 genes = getBM(attributes = c('illumina_humanht_12_v4', 'hgnc_symbol', 'description'), filters='illumina_humanht_12_v4', values=anno.topfit.dat$Probe_Id, mart=mart);
 557 genes[genes==""] = NA;
 558 head(genes)
 559 names(genes)
 560 names(genes) <- c("illumina_humanht_12_v4", "hgnc_symbol", "description")
 561 
 562 sum(duplicated(genes$hgnc_symbol))
 563 sum(duplicated(genes$description))
 564 sum(duplicated(genes$illumina_humanht_12_v4))
 565 genes$description = gsub("\\[Source.*", "", genes$description);
 566 sum(is.na(genes$hgnc_symbol))
 567 
 568 ##duplicated probes
 569 ##- one probe ID might correspond to more than one gene that could be in different chromosomes!
 570 ##- collapse gene names for the duplicated probe ID
 571 dupProbe <- unique( genes$illumina_humanht_12_v4[duplicated(genes$illumina_humanht_12_v4)] )
 572 Gene <-  genes$hgnc_symbol
 573 for (i in 1:length(dupProbe)) {
 574     indx <- (genes$illumina_humanht_12_v4 == dupProbe[i])
 575     Gene[indx] <- paste0(genes$hgnc_symbol[indx], collapse = "/")
 576     }
 577 genes$upd_symbol <- Gene
 578 rm(Gene)
 579 
 580 
 581 myannotate = function(data)
 582 {m = match(data$Probe_Id, genes$illumina_humanht_12_v4)
 583   data$description = genes[m,'description']
 584   data$GeneName = genes[m, 'upd_symbol']
 585   return(data)
 586 };
 587 
 588 anno.topfit.dat_annotated = myannotate(anno.topfit.dat)
 589 head(anno.topfit.dat_annotated)
 590 
 591 
 592 ####################
 593 # save the result table in the temp folder on the local computer
 594 ####################
 595 filenm <- contrastnm
 596 write.table(anno.topfit.dat_annotated, file=paste0("temp/", filenm,  "_", designType, ".txt"), quote=FALSE, sep="\t", row.names = FALSE)  
 597 #rm(anno.topfit.dat)
 598 
 599 
 600 
 601 ###########################################
 602 ##  hierarchical clustering on the 2 groups 
 603 ###########################################
 604 tmp <- hclust(dist(t(dataf)),method="average")
 605 pdf(paste("temp/hclust", group1, group2, ".pdf",  sep="_"))
 606 plot(tmp)
 607 dev.off()
 608 ############
 609 
 610 #############
 611 #### STARTING FROM THIS POINT, TABLES ARE REDUCED AT THE GENE LEVEL BY SELECTION OF ONE PROBE CORRESPONDING TO THE BEST T VALUE 
 612 ############
 613 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[order(abs(anno.topfit.dat_annotated$t),decreasing=TRUE), ];
 614 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[!duplicated(anno.topfit.dat_annotated$GeneName) & !is.na(anno.topfit.dat_annotated$GeneName), ];
 615 sum(duplicated(anno.topfit.dat_annotated$GeneName))
 616 
 617 ################
 618 ## Volcano plots
 619 ################
 620 library(ggplot2)
 621 
 622 
 623 head(anno.topfit.dat_annotated)
 624 colnames(anno.topfit.dat_annotated)
 625 ##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off
 626 threshold = as.factor(abs(anno.topfit.dat_annotated$logFC) > 2 & anno.topfit.dat_annotated$adj.P.Val < 0.05)
 627  
 628 ##Construct the plot object
 629 g = ggplot(data=anno.topfit.dat_annotated, aes(x=logFC, y=-log10(adj.P.Val), colour=threshold)) +
 630   geom_point(alpha=0.4, size=1.75) +
 631   theme(legend.position = "none") +
 632   xlim(c(min(anno.topfit.dat_annotated$logFC-0.5), max(anno.topfit.dat_annotated$logFC+0.5))) + ylim(c(0, max((-log10(anno.topfit.dat_annotated$adj.P.Val))+0.5))) +
 633   xlab("log2 fold change") + ylab("-log10 p-value")
 634 
 635 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[order(abs(anno.topfit.dat_annotated$t), decreasing=TRUE), ]
 636 dd_text2 = data=anno.topfit.dat_annotated[(abs(anno.topfit.dat_annotated$logFC) > 2), ]
 637 dd_text = anno.topfit.dat_annotated[ c(1:40),]
 638 
 639 text  = geom_text(data=dd_text, aes(x=logFC, y=-log10(adj.P.Val), label=GeneName), colour="black", size=2, hjust=-0.1, vjust=-0.1)
 640 text2  = geom_text(data=dd_text2, aes(x=logFC, y=-log10(adj.P.Val), label=GeneName), colour="black", size=2, hjust=-0.1, vjust=-0.1)
 641 title <- labs(title=paste(contrastnm, designType))
 642 pdf(paste0("temp/volcano_plot_", contrastnm, "_", designType, ".pdf"))
 643 g + text + text2 + title
 644 dev.off()
 645 
 646 ###############################################
 647 ## Heatmaps of the top 500 and top 50 genes
 648 ##############################################
 649 library(gplots)
 650 
 651 names(anno.topfit.dat_annotated)
 652 head(anno.topfit.dat_annotated)
 653 toptable500 <-anno.topfit.dat_annotated
 654 #Re-order probes according to adj Pval (FDR) from smallest to largest
 655 toptable500<-toptable500[order(toptable500$adj.P.Val,decreasing=FALSE),]
 656 head(toptable500)
 657 
 658 names(toptable500)
 659 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(toptable500))
 660 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(toptable500))
 661 
 662 toptable500_mx<-as.matrix(toptable500[c(1:500),c(sample_group1, sample_group2)])
 663 toptable500_df<-toptable500[c(1:500),];
 664 names(toptable500_df)
 665 
 666 names(toptable500_df)
 667 matrix <- toptable500_mx;
 668 myrownames <-toptable500_df$GeneName[1:500]
 669 mycolnames <-colnames(toptable500_mx)
 670  
 671 pdf(paste0("temp/heatmap_", group1, "_" ,group2,"_",  designType, "_", "top500.pdf"))
 672 heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.1,   Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
 673           labRow = myrownames, labCol=mycolnames,  cexCol=0.9, sepcolor="lightgray",  
 674           rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9), 
 675           main=paste(group1, group2, designType, "top500", sep="_"));
 676 dev.off()
 677 
 678 #To create Excel table of toptable500 heatmap
 679 hm <- heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.1,   Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
 680           labRow = myrownames, labCol=mycolnames,  cexCol=0.9, sepcolor="lightgray",  
 681           rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9), 
 682           main=paste(group1, group2, designType, "top500", sep="_"));
 683 
 684 head(toptable500_df)
 685 ordered_hm_df <- cbind(toptable500_df[ , c(1:15) ]  , toptable500_mx[ , hm$colInd])
 686 ordered_hm_df <- ordered_hm_df[rev(hm$rowInd)  , ]
 687 
 688 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(ordered_hm_df))
 689 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(ordered_hm_df))
 690 
 691 ordered_hm_df$mean <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 1, mean);
 692 ordered_hm_df$sd <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 1, sd);
 693 newdf <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 2, function(x) ((x- ordered_hm_df$mean) / ordered_hm_df$sd))
 694 
 695 ordered_hm_df_scaled <- cbind(ordered_hm_df, newdf)      
 696 write.csv(ordered_hm_df_scaled, paste0("temp/heatmap_", group1, "_" ,group2,"_",  designType, "_", "top500.csv"))  ;          
 697 
 698 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(toptable500))
 699 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(toptable500))
 700 
 701 ## TOP50
 702 toptable50_mx<-as.matrix(toptable500[c(1:50),c(sample_group1, sample_group2)])
 703 toptable50_df<-toptable500[c(1:50),];
 704 matrix <- toptable50_mx;
 705 myrownames <-toptable50_df$GeneName[1:50]
 706 mycolnames <-colnames(toptable50_mx)
 707 pdf(paste0("temp/heatmap_", group1, "_" ,group2,"_",  designType, "_", "top50.pdf"))
 708 heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5,   Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
 709           labRow = myrownames, labCol=mycolnames,  cexCol=0.9, sepcolor="lightgray",  
 710           rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9), 
 711           main=paste(group1, group2, designType, "top50", sep="_"));
 712 dev.off()
 713 
 714 
 715 #####################################################
 716 ## Preparing rank file and expression file for GSEA #
 717 #####################################################
 718 
 719 #### prepare rank files for GSEA ####
 720 myrank = function(data)
 721 {data = data[order(abs(data$t),decreasing=TRUE), ];
 722  data = data[!duplicated(data$GeneName) & !is.na(data$GeneName), ];
 723  data = data[order(data$t, decreasing=TRUE), ];
 724  rank = data[ , c("GeneName", "t")];
 725  return(rank)
 726 }
 727 
 728 anno.topfit.dat_rank = myrank(anno.topfit.dat_annotated)
 729 
 730 write.table(anno.topfit.dat_rank, paste0("temp/", contrastnm, "_", designType, ".RNK"), sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE);
 731 
 732 #### create expression file ####
 733 head(anno.topfit.dat_annotated)
 734 
 735 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(anno.topfit.dat_annotated))
 736 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(anno.topfit.dat_annotated))
 737 
 738 make_expression_file = function(data)
 739 {data = data[order(abs(data$t),decreasing=TRUE), ];
 740  data = data[!duplicated(data$GeneName) & !is.na(data$GeneName), ];
 741  data = data[order(data$t, decreasing=TRUE), ];
 742  data = cbind( data[ , c("GeneName","description")] ,  data[ , c(sample_group1, sample_group2)]    )
 743  return(data);#edited the Description   
 744 }
 745 
 746 anno.topfit.dat_expression = make_expression_file(anno.topfit.dat_annotated) 
 747 
 748 # write expression file
 749 write.table(anno.topfit.dat_expression, paste0("temp/", contrastnm, "_", designType, "_expression.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
 750 
 751 ### save the workspace
 752 save.image(paste0("temp/", contrastnm, ".RData"))
 753 
 754 ### remove the temp output folder with the name of the comparison
 755 ### the output folder should not exist
 756 if (file.exists("temp")) { file.rename("temp",  paste0(contrastnm, designType, "_output"))};
 757 
 758 
 759 
 760 
 761 
 762 
$
Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
[get | view] (2015-05-07 15:18:41, 220.0 KB) [[attachment:DOCUMENTATION_ILLUMINA_CODE.pdf]]
[get | view] (2014-10-01 15:34:24, 1612.2 KB) [[attachment:GSE11352paper.pdf]]
[get | view] (2014-10-01 16:19:53, 48.2 KB) [[attachment:GSEA_download.png]]
[get | view] (2015-03-02 19:17:28, 125.1 KB) [[attachment:GeneSet.png]]
[get | view] (2014-10-01 18:23:13, 6372.5 KB) [[attachment:Human_GOBP_AllPathways_no_GO_iea_August_20_2014_symbol.gmt]]
[get | view] (2015-05-07 15:13:10, 30.7 KB) [[attachment:IlluminaHT12_V4_differential_expression.R]]
[get | view] (2015-04-23 14:01:39, 133.4 KB) [[attachment:LeadingEdge.png]]
[get | view] (2014-10-01 18:22:53, 7040.0 KB) [[attachment:Mouse_GOBP_AllPathways_no_GO_iea_August_20_2014_symbol.gmt]]
[get | view] (2014-10-01 17:12:08, 295.1 KB) [[attachment:example_rank.rnk]]
[get | view] (2015-06-06 14:40:42, 1793.1 KB) [[attachment:oesophagus_paper.pdf]]
[get | view] (2014-10-01 16:08:10, 116.6 KB) [[attachment:preranked_workflow_steps.png]]
[get | view] (2014-10-01 17:10:59, 103.8 KB) [[attachment:runGSEAv2.png]]
All files | Selected Files: delete move to page
You are not allowed to attach a file to this page.