mkatari-bioinformatics-august-2013-phytozome2gostats
This is an old revision of the document!
Back to Manny's Bioinformatics Workshop HOME
phytozome2gostats<-function(annofile, speciesname) {
# annofile is the path to the go-term association file.
# speciesname is the name to provide the GSEA dataset you will be creating.
library("GOstats")
library("GSEABase")
anno<-read.table(annofile,
header=FALSE,
sep="\t",
quote="",
comment.char="")
# The annotation file is expected to be in Phytozome format where
# second row columns is the gene and the tenth column is the GO-term.
# Please change the column definition below accrounding to format of your go association file.
justGeneGo<-anno[,c(2,10)]
# initialize vectors
go_id=character()
Evidence=character()
gene_id=character()
#loop row by to get all the go-terms.
simple_counter=1
for (i in 1:nrow(justGeneGo)) {
gene=as.character(justGeneGo[i,1])
# The phytozome format also expects multiple GO-terms associated with the
# the gene to be in one line separated by commas. Please change this if necessary
go=unlist(strsplit(as.character(justGeneGo[i,2]),","))
for (eachgo in go) {
for (eachgene in gene) {
go_id[simple_counter]=eachgo
Evidence[simple_counter]="ISO" #Default GO-term evidence for simplicity.
gene_id[simple_counter]=eachgene
simple_counter=simple_counter+1
}
}
}
#This data frame is what is needed for the GOFram function. If you already have it so that one gene is associated with one GO-term per line, then you can skip the loop above.
goframeData = data.frame(go_id, Evidence, gene_id)
goFrame=GOFrame(goframeData,organism=speciesname)
goAllFrame=GOAllFrame(goFrame)
gsc <- GeneSetCollection(goAllFrame, setType = GOCollection())
return(gsc)
}
runGostats<-function(gsc, geneids, geneuniverse, ont="BP", pval=0.05) {
# this code is for running the gsc created with phytozome to gostats script.
# gsc is the object retruned from phytozome to gostats
# geneids is the list of genes in your list
# geneuniverse is the list of all genes in the genome
# the default for ontology is Bioloigical Process.
# the default for adjusted p-value cutoff is 0.05
params <- GSEAGOHyperGParams(name="My Custom GSEA based annot Params",
geneSetCollection=gsc,
geneIds = geneids,
universeGeneIds = geneuniverse,
ontology = ont,
pvalueCutoff = 1,
conditional = TRUE,
testDirection = "over")
#conditional=TRUE means only show parent if the go-term is significantly different from child term
Over <- hyperGTest(params)
allGoterms = summary(Over)
# correct p-values for multiple hypothesis testing.
adjpvalues = p.adjust(allGoterms$Pvalue, method="BH")
allGoterms$Pvalue = adjpvalues
colnames(allGoterms)[2] = "adjPvalue"
adjGoterms = allGoterms[adjpvalues < pval,]
return(adjGoterms)
}
mkatari-bioinformatics-august-2013-phytozome2gostats.1471249405.txt.gz · Last modified: by mkatari
