### This is an R script that implements the program DESeq for gene expression analysis. ### Much more information on the program and specific function (particularly for checking quality) ### can be found here: http://bioconductor.org/packages/2.8/bioc/vignettes/DESeq/inst/doc/DESeq.pdf ### The elements of this script were written by Melissa Pespeni and Daniel Barshis. #setwd("YOURWORKINGDIRECTORYPATH") # The drag and drop from finder works in R, too. library(DESeq) #useful functions #head() - prints out the top 6 lines #dim() - prints the dimensions of a variable #nrow() - returns the number of rows in a vector or matrix # ?[functionName] - opens documentation describing the function #read in your data to make counts table countsTable <- read.delim('combinedcounts.txt', header=TRUE, stringsAsFactors=TRUE, row.names=1) head(countsTable) #define the conditions of each individual (e.g. "Cold" and "Hot") There should be the same number of conditions described as there are samples in your data file, and in the same order conds <- c("Cold","Cold","Cold","Cold","Cold","Cold","Hot","Hot","Hot","Hot","Hot","Hot") #make count data sets cds <- newCountDataSet(countsTable, conds) cds<-estimateSizeFactors(cds) sizeFactors(cds) # factor such that values in a column can be brought to a common scale by dividing by the corresponding size factor cds<-estimateVarianceFunctions(cds) #negative binomial testing (e.g. compare "Cold" vs. "Hot" for differences in gene expression) res <- nbinomTest(cds, "Cold", "Hot") head(res) #count the number of significantly differentially expressed genes nrow(res[res$padj<0.05 & !is.na(res$padj),]) # at the .05 level nrow(res[res$padj<0.01 & !is.na(res$padj),]) # at the .01 level #filter for contigs with average(baseMean) >5 res5<-res[res$baseMean>5, ] dim(res) dim(res5) # number of genes that have >5 counts #p-value readjustment Benjamini and Hochberg after >5 filtering res5$padj <- p.adjust(res5$pval, method="BH") nrow(res5[res5$padj<0.01 & !is.na(res5$padj),]) # Num significantly differentially expressed genes excluding the no/low count genes #### Now filter for average counts and variance # Make a counts table that is scaled by the size factors temp = t(sizeFactors(cds)) sizematrix<-matrix(data=temp, nrow=nrow(countsTable), ncol=ncol(temp), byrow=TRUE) scaledcounts = countsTable/sizematrix head(scaledcounts) # Calculate the standard deviation across all individuals for each gene sdcol<-apply(scaledcounts,1,sd) head(sdcol) sdcolT<-data.frame(sdcol) head(sdcolT) # Combine the results data frame with the calculated standard deviation ressd<-cbind(res,sdcolT) head(ressd) ressdFilter<-ressd[ressd$baseMean>5 & ressd$sdcol5 counts and where the mean is greater than the std deviation #p-value readjustment Benjamini and Hochberg after >5 filtering ressdFilter$padj <- p.adjust(ressdFilter$pval, method="BH") head(ressdFilter) nrow(ressdFilter[ressdFilter$padj<0.01 & !is.na(ressdFilter$padj),]) # Num significantly differentially expressed genes in the filtered data frame #### Output files that can be generated: id<-data.frame(rownames(scaledcounts)) # making a data frame of the rownames from the scaledcounts data frame colnames(id)<-c("id") # renaming the column header to "id" scaledcountsID<-cbind(id,scaledcounts) # combining the columns (column binding) CountsAndResTable<-merge(scaledcountsID,res, sort=FALSE) # merge the two tables; uses the common 'id' column to link the tables head(CountsAndResTable) dim(CountsAndResTable) write.table(CountsAndResTable, file="ScaledcountsAndResultsOutput.txt", sep="\t", row.names=F) CountsAndRessdTable<-merge(scaledcountsID,ressdFilter, sort=FALSE) # merge the two tables; uses the common 'id' column to link the tables head(CountsAndRessdTable) dim(CountsAndRessdTable) write.table(CountsAndRessdTable, file="ScaledcountsAndFilteredResultsOutput.txt", sep="\t", row.names=F) write.table(ressdFilter, file="FilteredResultsOutput.txt", sep="\t", row.names=F) CountsAndResTable<-merge(scaledcountsID,ressdFilter) head(CountsAndResTable) dim(CountsAndResTable) write.table(CountsAndResTable, file="CountsAndResultsOutput.txt", sep="\t", row.names=F) Sig<-data.frame(CountsAndResTable[ressdFilter$padj<0.01 & !is.na(ressdFilter$padj),]) head(Sig) dim(Sig) write.table(Sig, file="Sig_CountsAndResultsOutput.txt", sep="\t", row.names=F) library(gplots) heatmap.2(data.matrix(scaledcounts))