mkatari-bioinformatics-august-2013-phytozome2gostats
This is an old revision of the document!
phytozome2gostats←function(annofile, speciesname) {
# annofile is the path to the go-term association file.
# speciesname is the name to provide the GSEA dataset you will be creating.
library("GOstats")
library("GSEABase")
anno<-read.table(annofile,
header=FALSE,
sep="\t",
quote="",
comment.char="")
# The annotation file is expected to be in Phytozome format where # second row columns is the gene and the tenth column is the GO-term. # Please change the column definition below accrounding to format of your go association file. justGeneGo<-anno[,c(2,10)]
# initialize vectors
go_id=character()
Evidence=character()
gene_id=character()
#loop row by to get all the go-terms.
simple_counter=1
for (i in 1:nrow(justGeneGo)) {
gene=as.character(justGeneGo[i,1])
# The phytozome format also expects multiple GO-terms associated with the
# the gene to be in one line separated by commas. Please change this if necessary
go=unlist(strsplit(as.character(justGeneGo[i,2]),","))
for (eachgo in go) {
for (eachgene in gene) {
go_id[simple_counter]=eachgo
Evidence[simple_counter]="ISO" #Default GO-term evidence for simplicity.
gene_id[simple_counter]=eachgene
simple_counter=simple_counter+1
}
}
}
#This data frame is what is needed for the GOFram function. If you already have it so that one gene is associated with one GO-term per line, then you can skip the loop above. goframeData = data.frame(go_id, Evidence, gene_id) goFrame=GOFrame(goframeData,organism=speciesname) goAllFrame=GOAllFrame(goFrame) gsc <- GeneSetCollection(goAllFrame, setType = GOCollection()) return(gsc)
}
runGostats←function(gsc, geneids, geneuniverse, ont="BP", pval=0.05) { # this code is for running the gsc created with phytozome to gostats script. # gsc is the object retruned from phytozome to gostats # geneids is the list of genes in your list # geneuniverse is the list of all genes in the genome # the default for ontology is Bioloigical Process. # the default for adjusted p-value cutoff is 0.05
params <- GSEAGOHyperGParams(name="My Custom GSEA based annot Params",
geneSetCollection=gsc,
geneIds = geneids,
universeGeneIds = geneuniverse,
ontology = ont,
pvalueCutoff = 1,
conditional = TRUE,
testDirection = "over")
#conditional=TRUE means only show parent if the go-term is significantly different from child term Over <- hyperGTest(params) allGoterms = summary(Over)
# correct p-values for multiple hypothesis testing. adjpvalues = p.adjust(allGoterms$Pvalue, method="BH") allGoterms$Pvalue = adjpvalues colnames(allGoterms)[2] = "adjPvalue" adjGoterms = allGoterms[adjpvalues < pval,] return(adjGoterms)
}
mkatari-bioinformatics-august-2013-phytozome2gostats.1471249194.txt.gz · Last modified: by mkatari
