Attachment 'IlluminaHT12_V4_differential_expression.R'
Download 1 #############################################################################
2 # R CODE FOR ILLUMINA BEADCHIPS ANALYSIS USING LUMI AND LIMMA #
3 #############################################################################
4
5 ## YOU NEED 3 FILES TO RUN THIS SCRIPT ###
6
7 # 1. Metadata consisting of sample names and array barcodes (see instructions on how to create this document in help on format).
8
9 # 2. Annotation file (see example in the help on format document)
10
11 # 3. Illumina microarray raw data (see example in the help on format document)
12
13 # see tips on how to run this code in the help on format document
14
15 #########################################################################
16 ######### PART OF THE CODE (VARIABLES) THAT YOU NEED TO MODIFY ##########
17 #########################################################################
18
19 ### NOTE: if you followed exactly the format of the 3 input files, you don't need to modify anything else in the code, just run the code line by line.
20
21 #The following variables need to be assigned inside the code, e.g.,
22 #- workDirectory <- "C:/Users/changjiang/Documents/ut/LA02"
23 #- metadataFileName <- "SampleBarcode_minus_65055.txt"
24 #- IlluminaDataFileName <- "sample_probe_nonnorm_FinalReport.txt"
25 #- annotationFileName <- "HumanHT-12_V4_0_R2_15002873_B.txt"
26
27 workDirectory <- "/Users/veroniquevoisin/Documents/Laurie Ailles/LA03/DEBUG_CODE"
28 metadataFileName <- "SampleBarcode.txt"
29 annotationFileName <- "HumanHT-12_V4_0_R2_15002873_B.txt"
30 IlluminaDataFileName <- "sample_probe_nonnorm_FinalReport.txt"
31 group1 <- "T" # e.g "T"
32 group2 <- "E" # e.g "C"
33 designType = "paired" #choice between "unpaired" or "paired"
34
35 ### NOT IMPLEMENTED YET: do not change
36 chiptype <- 'illumina_humanht_12_v4' ##### list number of choice here
37 species <- 'human' ### choices are 'human' or 'mouse'
38
39 ### it is possible to add 1 extra confounding factor e.g tumor location, technical batch, tumor grade, male or female: the additional factor is an optional column in metadata called "variableX" and is incorporated in the model fitting.
40
41 ########################################################################
42 ############### R PACKAGES AND WORK DIRECTORY ################
43 ########################################################################
44
45 ### note : you just need to install the packages (also called library) once but YOU NEED to load the libraries each time you opened your workspace
46
47 sessionInfo() # give you information about package versions and R versions that you used, information that you may need for publication purpose.
48
49 #(1) install packages
50 #(2) load packages
51 #(3) set work directory
52
53 ##Install packages
54 #source("http://bioconductor.org/biocLite.R")
55 #biocLite("BiocUpgrade")
56 #biocLite() #This installs a number of base packages
57 #biocLite("limma")
58 #biocLite("lumi")
59 #####biocLite("affy")
60 ######biocLite("preprocessCore") #In BioC software repository
61 #biocLite("gplots")
62 #biocLite("scatterplot3d")
63 ####biocLite("lumiHumanAll.db")
64 #biocLite("ggplot2")
65 #biocLite("biomaRt")
66
67 #update.packages(repos = biocinstallRepos())
68 #install.packages("mgcv")
69
70 ##Load library
71 library(lumi)
72 library(limma)
73 library(scatterplot3d)
74 #library(affy)
75 #library(preprocessCore)
76 library(gplots)
77 library(gplots2)
78
79
80 ##work directory
81
82 ###
83 setwd(workDirectory)
84 #date <- date()
85 #date <- gsub(" ", "", date)
86 #temp = paste0(group1, "vs", group2, "_", date)
87 subDir <- "temp"
88 dir.create(file.path(workDirectory, subDir), showWarnings = FALSE)
89 getwd()
90 dir()
91
92 ########################################################################
93 ############### METADATA: SAMPLE AND ARRAY BARCODES #############
94 ########################################################################
95
96
97 ##Read metadata file
98
99 sample_barcode <- read.table(metadataFileName, header=TRUE, stringsAsFactors=FALSE)
100 dim(sample_barcode) #36 3
101 sample_barcode
102 head(sample_barcode)
103 tail(sample_barcode)
104
105 ##unique sample ID: sample_barcode$SampleNumber
106 if (nrow(sample_barcode) != length(unique(sample_barcode$SampleNumber)))#36
107 { print( "warning: your metadata contains non unique sample name")}
108
109
110 ##add patient, marker, and chip
111 patient <- unlist(lapply(strsplit(sample_barcode$sampleName, split="_"), function(s) s[1]))
112 sample_barcode$patient <- patient
113 table(sample_barcode$patient)
114
115 #populations (markers)
116 #marker <- unlist(lapply(strsplit(sample_barcode$sampleName, split="_"), function(s) s[2]))
117 marker <- gsub(".*_", "", sample_barcode$sampleName)
118 sample_barcode$marker <- marker
119 table(sample_barcode$marker)
120
121 #chips
122 chip <- unlist(lapply(strsplit(sample_barcode$ArrayBarcode, split="_"), function(s) s[1]))
123 chip <- factor(chip)
124 table(chip)
125 chip
126 table(chip)
127 sample_barcode$chip <- chip
128
129 sample_barcode
130
131 ########################################################################
132 ############### PRE_PROCESSING USING LUMI PACKAGE ############
133 ########################################################################
134
135 library(lumi)
136 #(1) Read raw intensity data
137 #(2) Normalization (In practice, this step should be after quality control.)
138 #(3) Quality control (before and after normalization)
139
140 ####################
141 #(1) Read raw data
142 ####################
143
144 ##Illumina data file name
145 datalumi <- lumiR(IlluminaDataFileName) # read the data
146 pData(datalumi) # check if the samples have been loaded corectly
147
148 ##change column names using sampleName in sample_barcode
149 if (all(colnames(datalumi) %in% sample_barcode$ArrayBarcode) ) {
150 indx <- match(colnames(datalumi), sample_barcode$ArrayBarcode)
151 newSampleNames <- sample_barcode$sampleName
152 colnames(datalumi)[!is.na(indx)] <- newSampleNames[indx[!is.na(indx)]]
153 colnames(datalumi)
154 } else {
155 print("ERROR at change column names, check array barcode")
156 }
157
158 ##summary of the raw data
159 pData(datalumi) # check the output to see whether the column names have been changed
160 datalumi@QC # display the quality control information of the LumiBatch object
161
162 ####################
163 #(2) Normalization
164 ####################
165
166 ##Variance Stabilization & Quantile normalization
167 ##From raw Illumina probe intensities to expression values
168 ##return a processed LumiBatch object
169
170 datalumiQN <- lumiExpresso(datalumi, QC.evaluation=TRUE, varianceStabilize.param=list(method='log2'), normalize.param = list(method='quantile'))
171 summary(datalumiQN, 'QC')
172 pData(datalumiQN)
173
174
175 ###################################################################################
176 ############### QUALITY CONTROL PLOTS ###########################
177 #####################################################################################
178
179
180 ##boxplot and density plot of both raw and normalized intensities on log2 scale
181 pdf("temp/density_boxplot.pdf", width=11, height=11) # to save as the plots as pdf
182 #png("temp/density_boxplot.png", width = 1.5*480, height = 1.5*480) # to save the plots as png
183 par(mfrow=c(2,2), cex=0.8) # to create 4 panels
184 plot(datalumi, what='density', main="Raw intensity on log2 scale", addLegend=F)
185 plot(datalumi, what='boxplot', main="Raw intensity on log2 scale")
186 plot(datalumiQN, what='density', main="Quantile normalized data", addLegend=F)
187 plot(datalumiQN, what='boxplot', main="Quantile normalized data")
188 dev.off() # to save an image on the local computer
189
190 ##Pairs plot of sample intensities in an ExpressionSet object
191 #pairs(log2(ave.sig[sample(1:nrow(ave.sig), 5000), c(1,10)] + 1), pch=16, cex=0.5, lower.panel = NULL)
192 #pairs(datalumi[, indx])
193 indx <- c(1, 3) #a pair of two samples
194 plot(datalumi[, indx], what='pair') # before normalization
195 plot(datalumiQN[, indx], what='pair') # after normalization
196
197 ##MA Plots
198 indx <- c(1:2, 4:5) #any 4 samples
199 MAplot(datalumi[, indx], subset=NULL) # before normalization
200 MAplot(datalumiQN[, indx], subset=NULL) # after normalization
201
202
203 ##############################################
204 ############# CLUSTERING ###################
205 ##############################################
206
207 ##1. Based on large coefficient of variance (mean / standard variance)
208 ##Estimate the sample relations based on selected probes (the probes are selected if their coefficient of variance (mean / standard variance) are greater a threshold).
209 ##Two methods can be used: MDS (Multi-Dimensional Scaling) or hierarchical clustering methods (same as hclust).
210
211 #pdf("temp/sampleRelation.pdf")
212 #png("temp/sampleRelation.png")
213 cvTh <- 0.1 #a threshold of coefficient of variance
214 plotSampleRelation(datalumiQN, cv.Th = cvTh, main=paste0("Clustering based on probes with sd/mean > ", cvTh))
215 dev.off()
216
217 ##2. Detect the outlier
218 ##The current outlier detection is based on the distance from the sample to the center (average of all samples after removing 10 percent samples farthest away from the center).
219
220 pdf("temp/detectOutlier.pdf")
221 temp <- detectOutlier(datalumiQN, ifPlot=TRUE)
222 dev.off()
223 temp <- detectOutlier(datalumiQN, ifPlot=FALSE)
224 any(temp) ##FALSE - none of the samples is outlier
225
226 ##3. Clustering for the rows, in this case samples clustering, Euclidean Distance used here
227 dataExprs <- exprs(datalumiQN) # transform the lumi object into a data.frame (table)
228 tmp <- hclust(dist(t(dataExprs)),method="average")
229 pdf("temp/hclust.pdf")
230 #png("temp/hclustwithchip.png") #816 pixels = 8.5 inches
231 plot(tmp, xlab=paste0(ncol(dataExprs), " samples"))
232 dev.off()
233
234 ##4. PCA (principal component analysis)
235 #dataExprs <- exprs(datalumiQN) # transform the lumi object into a data.frame (table)
236 clusteringPCA <- prcomp(t(dataExprs), scale=T)
237 summary(clusteringPCA)
238
239 marker <- factor(sample_barcode$marker)
240 levels(marker) <- 1:length(levels(marker))
241 markercolors <- palette(rainbow(length(levels(marker))))
242
243 pdf("temp/pca2comps.pdf")
244 plot(clusteringPCA$x[,1:2],col=markercolors[marker],pch=19,main="PCA")
245 legend("topright", levels(factor(sample_barcode$marker)), col=markercolors, pch=19)
246 dev.off()
247
248 library(scatterplot3d)
249 pdf("temp/pca3comps.pdf")
250 scatterplot3d(clusteringPCA$x[,1:3],color=markercolors[marker],pch=19,main="PCA")
251 legend("topleft", levels(factor(sample_barcode$marker)), col=markercolors, pch=19)
252 dev.off()
253
254
255 ####################
256 #(3.3) Present counts
257 ####################
258 ## - Estimate or choose a threshold of detection p-values
259 ## - A probe is present if its detection pvalue is less than or equal to the threshold.
260 ## - The probes that are absent in each sample should be removed for the analysis of differential expression.
261
262 ##1. Retrieving the detection pvalues
263 #identical(detection(datalumiQN), detection(datalumi))
264 pvalues <- detection(datalumi)
265 class(pvalues)
266 pvalues[1:4,]
267
268 ##boxplot
269 pdf("temp/pvaluesBoxplot.pdf")
270 par(mar=c(9,4,4,2))
271 boxplot(pvalues, las=2, cex.axis = 0.75)
272 med_m <- apply(pvalues, 2, median) #median
273 h <- round(max(med_m), digits=4) #max median
274 abline(h=h, lty=2, col=2)
275 axis(2, at=h, h, cex.axis = 0.75, col.axis = "red", las=1)
276 mtext(" max", side=4, cex = 0.75, col = "red", las=1, at=h)
277 h <- round(quantile(c(as.matrix(pvalues)), 0.50), digits=4) #50%-quantile of all p-values
278 abline(h=h, lty=2, col=2)
279 axis(2, at=h, h, cex.axis = 0.75, col.axis = "red", las=1)
280 mtext(" 50thQ", side=4, cex = 0.75, col = "red", las=1, at=h)
281 title("Detection p-values")
282 dev.off()
283
284
285 ##2. Detection p-value threshold
286
287 ##(1) estimate FDR using hisgram counts of the p-values
288 ##Formula: FDR(x) = FP/(FP+TP) = n0*sum(p <= x)/sum(n[p <= x])
289 ## p - a vector of all detection p-values
290 ## x - a given p-value
291 ## n - counts of hisgram of p-values (i.e., p)
292 ## n0 - estimated average count of histogram of the absent probes' p-values (under H0), e.g., n0 = mean(n[p > 0.5]), the trimmed mean.
293
294 nbreaks <- 1e3 #or length(c(pvalues))/1000
295 f <- hist(c(pvalues), breaks = nbreaks, plot = F)
296
297 n <- f$counts
298 n0 <- mean(n[f$mids > 0.5])
299 FDR <- sapply(f$mids, function(p) n0*sum(f$mids <= p)/sum(n[f$mids <= p]))
300 max(f$mids[FDR <= 0.05]) #0.0435
301 detection.p.th <- max(f$mids[FDR <= 0.1]) #FDR <= 0.1
302 detection.p.th #is the detection threshold that has been calculated and that will be used to remove absent probes
303
304 plot(FDR, f$mids, type="l")
305 abline(0, 1, col="gray")
306
307
308 ##3. Estimate the present count of each gene (probe)
309 ##the number of the samples that a probe presents
310 ##A probe is present if its detection pvalue is less than or equal to detection.p.th, a given threshold.
311 ##same as, rowSums(pvalues <= detection.p.th)
312 presentCount <- detectionCall(datalumiQN, Th = detection.p.th) # number of probes in each samples that are called present.
313
314 ##histogram of the present counts:
315 ##The histogram represents a frequency of the probes that are present in the same number of samples.
316 ##The lower end represents the number of probes that have detection-pvalues > detection.p.th in all samples (PresentCount=0)
317 ##The upper end represents the probes that have detection p-values <= detection.p.th in all samples
318 ##To speed up the processing and reduce false positives, remove the unexpressed genes, that is, remove all probes for which presentCount=0
319
320 pdf("temp/hist_presentCount.pdf")
321 hist(presentCount, main=paste0("Histogram of PresentCount of Probes at a Threshold of ", detection.p.th), cex.main=0.85, xlab="Number of samples", breaks=c(-1, 0:max(presentCount)))
322 dev.off()
323 sum(presentCount == 0) #6167
324 sum(presentCount == ncol(datalumi)) #10912
325
326
327
328 ##################################################################################################
329 ############### Differential expression (DE) analysis using limma package ############
330 ##################################################################################################
331
332
333
334 #(1) data filtering: unexpressed genes (probes) and sample(s) (such as technical replicates, irrelevant patients, ...)
335 #(2) model fitting for unpaired modelDesign (tests)
336 #(3) model fitting for paired modelDesign (tests)
337 #model fitting:
338 #fit - lmFit, modelDesign <- model.matrix(...)
339 #fit1 - contrasts.fit, contrast.matrix <- makeContrasts(...)
340 #fit2 - eBayes
341 #Question:
342 #In the data filtering, which should we do first, filtering genes or samples?
343 #But the order does not matter if we give a fixed detection pvalue threshold.
344
345
346 ###################
347 #Annotation file
348 ###################
349
350
351 ##Read annotation file from the analysis folder
352 annotOriginal <- read.delim(annotationFileName, header = TRUE, sep = "\t", quote="\"", dec=".",fill = TRUE, comment.char="",skip=8, stringsAsFactors = FALSE)
353 dim(annotOriginal) #48240 28
354 head(annotOriginal)
355
356 annotOriginal$Definition <- gsub("Homo sapiens", "", annotOriginal$Definition)
357
358 ##The row names of data must be in the annotation file, that is, the column Array_Address_Id!!!
359 if (!all(rownames(datalumi) %in% annotOriginal$Array_Address_Id)){ print("some probes are not in the annotation file, please check that you are using the right annotation file")} # TRUE
360
361 ##extract the annotation file of the probes in the data
362 anno <- annotOriginal[annotOriginal$Array_Address_Id %in% rownames(datalumi), ]
363 dim(anno) #47323 28
364 any(is.na(anno$Array_Address_Id)) #[1] FALSE
365 rownames(anno) <- anno$Array_Address_Id
366 table(anno$Species)
367 head(anno)
368 # Homo sapiens ILMN Controls
369 # 47231 92
370
371 ##combining annotations of interest and normalized expressions
372 ##Notes:
373 ##- The column of Array_Address_Id in the annotation data contains the unique probe IDs in the Illumina expression data.
374 ##- The column of Probe_Id in the annotation data is not the same as the probe IDs in the Illumina expression data.
375 #normlized expressions
376 dataExprs <- exprs(datalumiQN)
377 dim(dataExprs) #47323 36
378
379 #annotations of interest
380 annotationIncluded <- c("RefSeq_ID", "Entrez_Gene_ID", "Symbol", "Probe_Id", "Array_Address_Id", "Probe_Sequence", "Definition")
381 if (!all(rownames(dataExprs) %in% anno$Array_Address_Id)){print("normalized data is not matching the annotation file")} #[1] TRUE
382 indx <- NULL
383 indx <- match(rownames(dataExprs), anno$Array_Address_Id)
384 anno.exprs <- cbind(anno[indx, annotationIncluded], dataExprs)
385 dim(anno.exprs) #47323 43
386 identical(rownames(anno.exprs), as.character(anno.exprs$Array_Address_Id)) #[1] TRUE
387 identical(rownames(anno.exprs), rownames(anno)[indx]) #[1] TRUE
388 head(anno.exprs)
389
390 filenm <- "temp/annotated_normalized_data"
391 write.table(anno.exprs, file=paste0(filenm, ".txt"), quote=FALSE, sep="\t", row.names = FALSE)
392
393 ##remove all variables that will not be used subsequently.
394 #Only "sample_barcode", "anno", "datalumi", and "datalumiQN" are needed for the analysis of differetial expression.
395 #ls()
396 #rm( list = setdiff(ls(), c("sample_barcode", "anno", "datalumi", "datalumiQN", "detection.p.th")) )
397 #ls()
398
399
400 ################################################################
401 # Calculation differential expression (DE) between markers
402 ################################################################
403
404 ###################
405 #I. Data filtering
406 ###################
407
408 #### groups included in the comparison
409 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(dataExprs))
410 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(dataExprs))
411 sample_included <- c(sample_group1, sample_group2)
412
413 ## groups should only include paired samples
414 if (designType =="paired")
415 {
416 dupPatient <- sample_barcode[ sample_barcode$marker %in% c(group1,group2) , ];
417 dupPatient <- dupPatient$patient[duplicated(dupPatient$patient)];
418 list <- NULL
419 for ( i in 1:length(dupPatient))
420 {
421 list <- c(list,grep( paste0(dupPatient[i], "*") , colnames(dataExprs)))
422 }
423 sample_included <- sample_included[sample_included %in% list]
424 };
425 sample_included
426
427 #filter unexpressed (absent) genes using present counts estimated by detection pvalue threshold
428
429 ##normalized expressions
430 dataExprs <- exprs(datalumiQN) # transform the lumi object in a data.frame
431 dim(dataExprs) #47323 36
432
433 #(1) presentCount estimated by detection.p.th
434 #### appply the present count only on groups included in the comparison in case normalized data includes different tissue types
435 identical(colnames(datalumiQN), colnames(dataExprs))
436 presentCount <- detectionCall(datalumiQN[ , sample_included], Th = detection.p.th)
437 head(presentCount)
438 if (!identical(names(presentCount), rownames(dataExprs))) {print("names present count should have been identical to normalized expression names")} #TRUE
439
440
441 #(3) filtering expressions
442 dataf <- dataExprs[presentCount > 0, sample_included ]
443 head(dataf)
444 colnames(dataf)
445 dim(dataf) #41156 9 #41156 33 #41156 36
446
447 samplef <- sample_barcode[sample_included , ]
448 setequal(samplef$sampleName, colnames(dataf)) #[1] TRUE
449 identical(samplef$sampleName, colnames(dataf)) #[1] TRUE. So orders are also identical.
450 dim(samplef)
451
452 samplef
453
454 ###################
455 #II. Model fittings
456 ###################
457
458 #Testing differential expression (DE) between markers
459 #- unpaired design: in the case that the effects (mean) of patients are the same.
460 #model fittings:
461 #fit - lmFit, modelDesign <- model.matrix(...)
462 #fit1 - contrasts.fit, contrast.matrix <- makeContrasts(...)
463 #fit2 - eBayes
464
465
466 patient <- factor(samplef$patient)
467 table(patient)
468 marker <- factor(samplef$marker)
469 markerNames <- levels(marker)
470 table(marker)
471 chip <- factor(samplef$chip)
472 table(chip)
473 variableX <- 0
474 if (length(factor(samplef$variableX)) != 0 )
475 variableX <- factor(samplef$variableX)
476
477 ####if paired , shoud I remove other samples
478
479 if (sum(samplef$variableX ) != nrow(samplef) )
480 {
481 if (designType == "paired") modelDesign <- model.matrix(~ 0 + marker + patient + variableX + chip)
482 if (designType == "unpaired") modelDesign <- model.matrix(~ 0 + marker + variableX + chip) #not include an intercept.
483 }
484 if (sum(samplef$variableX ) == nrow(samplef) )
485 {
486 if (designType == "paired") modelDesign <- model.matrix(~ 0 + marker + patient + chip)
487 if (designType == "unpaired") modelDesign <- model.matrix(~ 0 + marker + chip) #not include an intercept.
488 }
489
490 dim(modelDesign)
491 colnames(modelDesign)
492 fit <- lmFit(dataf, modelDesign)
493
494 #(2) contrast fitting
495 marker_group1 <- paste("marker", group1, sep="")
496 marker_group2 <- paste("marker", group2, sep="")
497 contrastnm <- paste(marker_group1, "-", marker_group2, sep="") #contrast between a pair of markers #HARD CODED
498 contrast.matrix <- makeContrasts(contrasts=contrastnm, levels=modelDesign)
499 contrast.matrix
500 fit1 <- contrasts.fit(fit, contrast.matrix)
501
502 #(3) eBayes fitting
503 fit2 <- eBayes(fit1)
504
505 ##Mean Expressions for different marker
506 #produce a list of means for each marker (grouped by a marker)
507 indx <- 1:length(marker)
508 meangrp_list <- tapply(indx, marker[indx], function(i) rowMeans(dataf[, i]))
509 dim(meangrp_list)
510 meangrp <- sapply(meangrp_list, function(x) x) #changed as a matrix
511 dim(meangrp) #41156 3
512 colnames(meangrp) <- paste0("marker", names(meangrp_list))
513 head(meangrp)
514 class(meangrp)
515 logFC2 <- meangrp[, c(paste0("marker", group1))] - meangrp[, c(paste0("marker", group2))];
516 logFC2 <- as.matrix(logFC2);
517
518 ##Annotations need to be included
519 annotationIncluded <- c("RefSeq_ID", "Entrez_Gene_ID", "Symbol", "Probe_Id", "Array_Address_Id", "Probe_Sequence", "Definition")
520
521 topfit <- topTable(fit2, number=nrow(dataf), adjust="BH")
522 colnames(topfit)
523 head(topfit)
524
525 topfit <- topfit[, !(colnames(topfit) %in% c("B"))]
526 head(topfit)
527 dim(topfit)
528
529 rownames(topfit) <- topfit$ID
530 topID <- rownames(topfit)
531 all(rownames(dataf) %in% rownames(meangrp))
532 all(rownames(dataf) %in% anno$Array_Address_Id)
533 all(rownames(topfit) %in% anno$Array_Address_Id)
534 stopifnot( all(topID %in% rownames(meangrp)) ) #TRUE
535 stopifnot( all(topID %in% anno$Array_Address_Id) ) #TRUE
536
537
538 #combining annotations, mean expressions, and topfit
539 indx <- NULL
540 indx <- match(topID, anno$Array_Address_Id)
541 anno.topfit <- cbind(anno[indx, annotationIncluded], meangrp[topID, c(marker_group1, marker_group2)], logFC2[topID,], topfit)
542 head(meangrp)
543
544 #add the normalized data for the two markers
545 indx <- NULL
546 indx <- match(topID, rownames(dataf)) # rownames(dataf) is array id
547 anno.topfit.dat <- cbind(anno.topfit, dataf[indx, ])
548
549
550 ##########################################################
551 ###### update annotation using the R biomaRt package #####
552 ##########################################################
553
554 library(biomaRt)
555 mart = useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
556 genes = getBM(attributes = c('illumina_humanht_12_v4', 'hgnc_symbol', 'description'), filters='illumina_humanht_12_v4', values=anno.topfit.dat$Probe_Id, mart=mart);
557 genes[genes==""] = NA;
558 head(genes)
559 names(genes)
560 names(genes) <- c("illumina_humanht_12_v4", "hgnc_symbol", "description")
561
562 sum(duplicated(genes$hgnc_symbol))
563 sum(duplicated(genes$description))
564 sum(duplicated(genes$illumina_humanht_12_v4))
565 genes$description = gsub("\\[Source.*", "", genes$description);
566 sum(is.na(genes$hgnc_symbol))
567
568 ##duplicated probes
569 ##- one probe ID might correspond to more than one gene that could be in different chromosomes!
570 ##- collapse gene names for the duplicated probe ID
571 dupProbe <- unique( genes$illumina_humanht_12_v4[duplicated(genes$illumina_humanht_12_v4)] )
572 Gene <- genes$hgnc_symbol
573 for (i in 1:length(dupProbe)) {
574 indx <- (genes$illumina_humanht_12_v4 == dupProbe[i])
575 Gene[indx] <- paste0(genes$hgnc_symbol[indx], collapse = "/")
576 }
577 genes$upd_symbol <- Gene
578 rm(Gene)
579
580
581 myannotate = function(data)
582 {m = match(data$Probe_Id, genes$illumina_humanht_12_v4)
583 data$description = genes[m,'description']
584 data$GeneName = genes[m, 'upd_symbol']
585 return(data)
586 };
587
588 anno.topfit.dat_annotated = myannotate(anno.topfit.dat)
589 head(anno.topfit.dat_annotated)
590
591
592 ####################
593 # save the result table in the temp folder on the local computer
594 ####################
595 filenm <- contrastnm
596 write.table(anno.topfit.dat_annotated, file=paste0("temp/", filenm, "_", designType, ".txt"), quote=FALSE, sep="\t", row.names = FALSE)
597 #rm(anno.topfit.dat)
598
599
600
601 ###########################################
602 ## hierarchical clustering on the 2 groups
603 ###########################################
604 tmp <- hclust(dist(t(dataf)),method="average")
605 pdf(paste("temp/hclust", group1, group2, ".pdf", sep="_"))
606 plot(tmp)
607 dev.off()
608 ############
609
610 #############
611 #### STARTING FROM THIS POINT, TABLES ARE REDUCED AT THE GENE LEVEL BY SELECTION OF ONE PROBE CORRESPONDING TO THE BEST T VALUE
612 ############
613 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[order(abs(anno.topfit.dat_annotated$t),decreasing=TRUE), ];
614 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[!duplicated(anno.topfit.dat_annotated$GeneName) & !is.na(anno.topfit.dat_annotated$GeneName), ];
615 sum(duplicated(anno.topfit.dat_annotated$GeneName))
616
617 ################
618 ## Volcano plots
619 ################
620 library(ggplot2)
621
622
623 head(anno.topfit.dat_annotated)
624 colnames(anno.topfit.dat_annotated)
625 ##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off
626 threshold = as.factor(abs(anno.topfit.dat_annotated$logFC) > 2 & anno.topfit.dat_annotated$adj.P.Val < 0.05)
627
628 ##Construct the plot object
629 g = ggplot(data=anno.topfit.dat_annotated, aes(x=logFC, y=-log10(adj.P.Val), colour=threshold)) +
630 geom_point(alpha=0.4, size=1.75) +
631 theme(legend.position = "none") +
632 xlim(c(min(anno.topfit.dat_annotated$logFC-0.5), max(anno.topfit.dat_annotated$logFC+0.5))) + ylim(c(0, max((-log10(anno.topfit.dat_annotated$adj.P.Val))+0.5))) +
633 xlab("log2 fold change") + ylab("-log10 p-value")
634
635 anno.topfit.dat_annotated <- anno.topfit.dat_annotated[order(abs(anno.topfit.dat_annotated$t), decreasing=TRUE), ]
636 dd_text2 = data=anno.topfit.dat_annotated[(abs(anno.topfit.dat_annotated$logFC) > 2), ]
637 dd_text = anno.topfit.dat_annotated[ c(1:40),]
638
639 text = geom_text(data=dd_text, aes(x=logFC, y=-log10(adj.P.Val), label=GeneName), colour="black", size=2, hjust=-0.1, vjust=-0.1)
640 text2 = geom_text(data=dd_text2, aes(x=logFC, y=-log10(adj.P.Val), label=GeneName), colour="black", size=2, hjust=-0.1, vjust=-0.1)
641 title <- labs(title=paste(contrastnm, designType))
642 pdf(paste0("temp/volcano_plot_", contrastnm, "_", designType, ".pdf"))
643 g + text + text2 + title
644 dev.off()
645
646 ###############################################
647 ## Heatmaps of the top 500 and top 50 genes
648 ##############################################
649 library(gplots)
650
651 names(anno.topfit.dat_annotated)
652 head(anno.topfit.dat_annotated)
653 toptable500 <-anno.topfit.dat_annotated
654 #Re-order probes according to adj Pval (FDR) from smallest to largest
655 toptable500<-toptable500[order(toptable500$adj.P.Val,decreasing=FALSE),]
656 head(toptable500)
657
658 names(toptable500)
659 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(toptable500))
660 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(toptable500))
661
662 toptable500_mx<-as.matrix(toptable500[c(1:500),c(sample_group1, sample_group2)])
663 toptable500_df<-toptable500[c(1:500),];
664 names(toptable500_df)
665
666 names(toptable500_df)
667 matrix <- toptable500_mx;
668 myrownames <-toptable500_df$GeneName[1:500]
669 mycolnames <-colnames(toptable500_mx)
670
671 pdf(paste0("temp/heatmap_", group1, "_" ,group2,"_", designType, "_", "top500.pdf"))
672 heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.1, Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
673 labRow = myrownames, labCol=mycolnames, cexCol=0.9, sepcolor="lightgray",
674 rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9),
675 main=paste(group1, group2, designType, "top500", sep="_"));
676 dev.off()
677
678 #To create Excel table of toptable500 heatmap
679 hm <- heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.1, Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
680 labRow = myrownames, labCol=mycolnames, cexCol=0.9, sepcolor="lightgray",
681 rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9),
682 main=paste(group1, group2, designType, "top500", sep="_"));
683
684 head(toptable500_df)
685 ordered_hm_df <- cbind(toptable500_df[ , c(1:15) ] , toptable500_mx[ , hm$colInd])
686 ordered_hm_df <- ordered_hm_df[rev(hm$rowInd) , ]
687
688 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(ordered_hm_df))
689 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(ordered_hm_df))
690
691 ordered_hm_df$mean <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 1, mean);
692 ordered_hm_df$sd <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 1, sd);
693 newdf <- apply(ordered_hm_df[ , c(sample_group1, sample_group2)], 2, function(x) ((x- ordered_hm_df$mean) / ordered_hm_df$sd))
694
695 ordered_hm_df_scaled <- cbind(ordered_hm_df, newdf)
696 write.csv(ordered_hm_df_scaled, paste0("temp/heatmap_", group1, "_" ,group2,"_", designType, "_", "top500.csv")) ;
697
698 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(toptable500))
699 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(toptable500))
700
701 ## TOP50
702 toptable50_mx<-as.matrix(toptable500[c(1:50),c(sample_group1, sample_group2)])
703 toptable50_df<-toptable500[c(1:50),];
704 matrix <- toptable50_mx;
705 myrownames <-toptable50_df$GeneName[1:50]
706 mycolnames <-colnames(toptable50_mx)
707 pdf(paste0("temp/heatmap_", group1, "_" ,group2,"_", designType, "_", "top50.pdf"))
708 heatmap.2(matrix, col=bluered(69), scale="row",key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5, Rowv=TRUE, Colv=TRUE, dendrogram = c("both"),
709 labRow = myrownames, labCol=mycolnames, cexCol=0.9, sepcolor="lightgray",
710 rowsep=FALSE, sepwidth=c(0.005,0.005), margins=c(9,9),
711 main=paste(group1, group2, designType, "top50", sep="_"));
712 dev.off()
713
714
715 #####################################################
716 ## Preparing rank file and expression file for GSEA #
717 #####################################################
718
719 #### prepare rank files for GSEA ####
720 myrank = function(data)
721 {data = data[order(abs(data$t),decreasing=TRUE), ];
722 data = data[!duplicated(data$GeneName) & !is.na(data$GeneName), ];
723 data = data[order(data$t, decreasing=TRUE), ];
724 rank = data[ , c("GeneName", "t")];
725 return(rank)
726 }
727
728 anno.topfit.dat_rank = myrank(anno.topfit.dat_annotated)
729
730 write.table(anno.topfit.dat_rank, paste0("temp/", contrastnm, "_", designType, ".RNK"), sep='\t', quote=FALSE, row.names=FALSE, col.names=FALSE);
731
732 #### create expression file ####
733 head(anno.topfit.dat_annotated)
734
735 sample_group1 <- grep (paste(".*_", group1, "$", sep=""), colnames(anno.topfit.dat_annotated))
736 sample_group2 <- grep (paste(".*_", group2, "$", sep=""), colnames(anno.topfit.dat_annotated))
737
738 make_expression_file = function(data)
739 {data = data[order(abs(data$t),decreasing=TRUE), ];
740 data = data[!duplicated(data$GeneName) & !is.na(data$GeneName), ];
741 data = data[order(data$t, decreasing=TRUE), ];
742 data = cbind( data[ , c("GeneName","description")] , data[ , c(sample_group1, sample_group2)] )
743 return(data);#edited the Description
744 }
745
746 anno.topfit.dat_expression = make_expression_file(anno.topfit.dat_annotated)
747
748 # write expression file
749 write.table(anno.topfit.dat_expression, paste0("temp/", contrastnm, "_", designType, "_expression.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
750
751 ### save the workspace
752 save.image(paste0("temp/", contrastnm, ".RData"))
753
754 ### remove the temp output folder with the name of the comparison
755 ### the output folder should not exist
756 if (file.exists("temp")) { file.rename("temp", paste0(contrastnm, designType, "_output"))};
757
758
759
760
761
762
$
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.