User Tools

Site Tools


mkatari-bioinformatics-august-2013-phytozome2gostats

This is an old revision of the document!


phytozome2gostats←function(annofile, speciesname) {

  # annofile is the path to the go-term association file.
  # speciesname is the name to provide the GSEA dataset you will be creating.
  
  library("GOstats")
  library("GSEABase")
  
  anno<-read.table(annofile,
                   header=FALSE, 
                   sep="\t", 
                   quote="",
                   comment.char="")
  
  # The annotation file is expected to be in Phytozome format where
  # second row columns is the gene and the tenth column is the GO-term.
  # Please change the column definition below accrounding to format of your go association file.
  justGeneGo<-anno[,c(2,10)]
  # initialize vectors
  go_id=character()
  Evidence=character()
  gene_id=character()

  #loop row by to get all the go-terms.
  simple_counter=1
  for (i in 1:nrow(justGeneGo)) {
      gene=as.character(justGeneGo[i,1])
      # The phytozome format also expects multiple GO-terms associated with the
      # the gene to be in one line separated by commas. Please change this if necessary
      go=unlist(strsplit(as.character(justGeneGo[i,2]),","))
      for (eachgo in go) {
          for (eachgene in gene) {
              go_id[simple_counter]=eachgo
              Evidence[simple_counter]="ISO" #Default GO-term evidence for simplicity.
              gene_id[simple_counter]=eachgene
              simple_counter=simple_counter+1
          }
      }
  }
  #This data frame is what is needed for the GOFram function. If you already have it so that one gene is associated with one GO-term per line, then you can skip the loop above.
  goframeData = data.frame(go_id, Evidence, gene_id)
  
  goFrame=GOFrame(goframeData,organism=speciesname)
  goAllFrame=GOAllFrame(goFrame)

  gsc <- GeneSetCollection(goAllFrame, setType = GOCollection())

  return(gsc) 

}

runGostats←function(gsc, geneids, geneuniverse, ont="BP", pval=0.05) { # this code is for running the gsc created with phytozome to gostats script. # gsc is the object retruned from phytozome to gostats # geneids is the list of genes in your list # geneuniverse is the list of all genes in the genome # the default for ontology is Bioloigical Process. # the default for adjusted p-value cutoff is 0.05

  
  params <- GSEAGOHyperGParams(name="My Custom GSEA based annot Params",
                             geneSetCollection=gsc,
                             geneIds = geneids,
                             universeGeneIds = geneuniverse,
                             ontology = ont,
                             pvalueCutoff = 1,
                             conditional = TRUE,
                             testDirection = "over")
  #conditional=TRUE means only show parent if the go-term is significantly different from child term
Over <- hyperGTest(params)
  allGoterms = summary(Over)
  # correct p-values for multiple hypothesis testing.
adjpvalues = p.adjust(allGoterms$Pvalue, method="BH")
allGoterms$Pvalue = adjpvalues
colnames(allGoterms)[2] = "adjPvalue"
adjGoterms = allGoterms[adjpvalues < pval,]
return(adjGoterms)  

}

mkatari-bioinformatics-august-2013-phytozome2gostats.1471249194.txt.gz · Last modified: 2016/08/15 08:19 by mkatari