mkatari-bioinformatics-august-2013-phytozome2gostats
This is an old revision of the document!
phytozome2gostats←function(annofile, speciesname) {
# annofile is the path to the go-term association file. # speciesname is the name to provide the GSEA dataset you will be creating. library("GOstats") library("GSEABase") anno<-read.table(annofile, header=FALSE, sep="\t", quote="", comment.char="")
# The annotation file is expected to be in Phytozome format where # second row columns is the gene and the tenth column is the GO-term. # Please change the column definition below accrounding to format of your go association file. justGeneGo<-anno[,c(2,10)]
# initialize vectors go_id=character() Evidence=character() gene_id=character() #loop row by to get all the go-terms. simple_counter=1 for (i in 1:nrow(justGeneGo)) { gene=as.character(justGeneGo[i,1])
# The phytozome format also expects multiple GO-terms associated with the # the gene to be in one line separated by commas. Please change this if necessary go=unlist(strsplit(as.character(justGeneGo[i,2]),",")) for (eachgo in go) { for (eachgene in gene) { go_id[simple_counter]=eachgo Evidence[simple_counter]="ISO" #Default GO-term evidence for simplicity. gene_id[simple_counter]=eachgene simple_counter=simple_counter+1 } } }
#This data frame is what is needed for the GOFram function. If you already have it so that one gene is associated with one GO-term per line, then you can skip the loop above. goframeData = data.frame(go_id, Evidence, gene_id) goFrame=GOFrame(goframeData,organism=speciesname) goAllFrame=GOAllFrame(goFrame) gsc <- GeneSetCollection(goAllFrame, setType = GOCollection()) return(gsc)
}
runGostats←function(gsc, geneids, geneuniverse, ont="BP", pval=0.05) { # this code is for running the gsc created with phytozome to gostats script. # gsc is the object retruned from phytozome to gostats # geneids is the list of genes in your list # geneuniverse is the list of all genes in the genome # the default for ontology is Bioloigical Process. # the default for adjusted p-value cutoff is 0.05
params <- GSEAGOHyperGParams(name="My Custom GSEA based annot Params", geneSetCollection=gsc, geneIds = geneids, universeGeneIds = geneuniverse, ontology = ont, pvalueCutoff = 1, conditional = TRUE, testDirection = "over")
#conditional=TRUE means only show parent if the go-term is significantly different from child term Over <- hyperGTest(params) allGoterms = summary(Over)
# correct p-values for multiple hypothesis testing. adjpvalues = p.adjust(allGoterms$Pvalue, method="BH") allGoterms$Pvalue = adjpvalues colnames(allGoterms)[2] = "adjPvalue" adjGoterms = allGoterms[adjpvalues < pval,] return(adjGoterms)
}
mkatari-bioinformatics-august-2013-phytozome2gostats.1471249194.txt.gz · Last modified: 2016/08/15 08:19 by mkatari