User Tools

Site Tools


mkatari-bioinformatics-august-2013-phytozome2gostats

This is an old revision of the document!


Back to Manny's Bioinformatics Workshop HOME

phytozome2gostats<-function(annofile, speciesname) {

    # annofile is the path to the go-term association file.
    # speciesname is the name to provide the GSEA dataset you will be creating.
    
    library("GOstats")
    library("GSEABase")
    
    anno<-read.table(annofile,
                     header=FALSE, 
                     sep="\t", 
                     quote="",
                     comment.char="")

    
    # The annotation file is expected to be in Phytozome format where
    # second row columns is the gene and the tenth column is the GO-term.
    # Please change the column definition below accrounding to format of your go association file.
    justGeneGo<-anno[,c(2,10)]

    # initialize vectors
    go_id=character()
    Evidence=character()
    gene_id=character()
  
    #loop row by to get all the go-terms.
    simple_counter=1
    for (i in 1:nrow(justGeneGo)) {
        gene=as.character(justGeneGo[i,1])

        # The phytozome format also expects multiple GO-terms associated with the
        # the gene to be in one line separated by commas. Please change this if necessary
        go=unlist(strsplit(as.character(justGeneGo[i,2]),","))
        for (eachgo in go) {
            for (eachgene in gene) {
                go_id[simple_counter]=eachgo
                Evidence[simple_counter]="ISO" #Default GO-term evidence for simplicity.
                gene_id[simple_counter]=eachgene
                simple_counter=simple_counter+1
            }
        }
    }

    #This data frame is what is needed for the GOFram function. If you already have it so that one gene is associated with one GO-term per line, then you can skip the loop above.
    goframeData = data.frame(go_id, Evidence, gene_id)
    
    goFrame=GOFrame(goframeData,organism=speciesname)
    goAllFrame=GOAllFrame(goFrame)
  
    gsc <- GeneSetCollection(goAllFrame, setType = GOCollection())
  
    return(gsc) 

}



runGostats<-function(gsc, geneids, geneuniverse, ont="BP", pval=0.05) {
# this code is for running the gsc created with phytozome to gostats script.
# gsc is the object retruned from phytozome to gostats
# geneids is the list of genes in your list
# geneuniverse is the list of all genes in the genome
# the default for ontology is Bioloigical Process.
# the default for adjusted p-value cutoff is 0.05
    
    params <- GSEAGOHyperGParams(name="My Custom GSEA based annot Params",
                               geneSetCollection=gsc,
                               geneIds = geneids,
                               universeGeneIds = geneuniverse,
                               ontology = ont,
                               pvalueCutoff = 1,
                               conditional = TRUE,
                               testDirection = "over")

    #conditional=TRUE means only show parent if the go-term is significantly different from child term
  Over <- hyperGTest(params)
    allGoterms = summary(Over)

    # correct p-values for multiple hypothesis testing.
  adjpvalues = p.adjust(allGoterms$Pvalue, method="BH")
  allGoterms$Pvalue = adjpvalues
  colnames(allGoterms)[2] = "adjPvalue"
  adjGoterms = allGoterms[adjpvalues < pval,]
  return(adjGoterms)  
}
  
  
mkatari-bioinformatics-august-2013-phytozome2gostats.1471249405.txt.gz · Last modified: 2016/08/15 08:23 by mkatari