#!/usr/bin/env python

import sys
filecount=0  # NOTE(review): not referenced in the visible part of the script
# Usage: GOcollapser.py Final_AGbaseresults.txt ParsedBlastx2Uniprot.txt GOTable.txt

# Fail with a readable usage message instead of an IndexError when an
# argument is missing.
if len(sys.argv) < 4:
	sys.exit('Usage: ' + sys.argv[0] + ' Final_AGbaseresults.txt ParsedBlastx2Uniprot.txt GOTable.txt')

INFILE = open(sys.argv[1], 'r')  # input file with concatenated AGBase output
contignames = sys.argv[2]  # path of an input file with the gene names and UniProt IDs
OUT = open(sys.argv[3], 'w')  # new output file for the collapsed GO IDs, in the format needed for ErmineJ enrichment analyses

def make_dict1(file):
	"""Read a tab-delimited file into a first-column -> second-column dict.

	The first line of the file is treated as the header row and is returned
	separately; every later non-blank line contributes one entry.  If a key
	occurs more than once, the last occurrence wins.

	Args:
		file: path of the tab-delimited text file to read.

	Returns:
		A ``(mapping, headers)`` tuple: ``mapping`` maps column 1 to
		column 2 for each data line, ``headers`` is the list of fields on
		the first line (an empty list when the file is empty).

	Raises:
		IndexError: if a non-blank data line has fewer than two
			tab-separated fields.
	"""
	mapping = {}
	headers = []  # stays empty for an empty file instead of being unbound
	with open(file, 'r') as fin:  # guarantees the handle is closed
		for index, line in enumerate(fin):
			line = line.rstrip()
			cols = line.split('\t')  # for tab-delimited text files
			if index == 0:
				headers = cols
			elif line:  # blank lines (e.g. a trailing newline) carry no entry
				mapping[cols[0]] = cols[1]
	return mapping, headers

# dictnames maps column 1 -> column 2 of the names file (gene names and
# UniProt IDs, per the setup comments); namesheaders is that file's header row.
dictnames, namesheaders = make_dict1(contignames)


# GOdict maps the 3rd column of each INFILE row to the list of distinct
# 6th-column values seen for it, in first-seen order.  The keys are later
# looked up with the values of dictnames, and the lists are written out
# under the 'GOterms' header.
GOdict = {}
for linenum, line in enumerate(INFILE, 1):
	if linenum == 1:
		# Skip the header row.  (Previously linenum was bumped to 2 here and
		# the header fell through into the data branch.)
		continue
	cols = line.rstrip().split('\t')
	if len(cols) < 6:
		# Blank or truncated row: there is no 6th column to collect.
		continue
	terms = GOdict.setdefault(cols[2], [])
	if cols[5] not in terms:  # collapse repeated terms for the same key
		terms.append(cols[5])
INFILE.close()

# Header row of the output table: the gene name appears in both of the first
# two columns, followed by the comma-separated GO terms.
OUT.write('GeneName\tGeneName\tGOterms\n')

annotated = []  # (contig number, contig name, GO term list) tuples
nogo = 0  # contigs whose ID has no entry in GOdict
for key, value in dictnames.items():
	# key[6:] drops the leading six characters ("contig") so rows can be
	# ordered numerically; int() raises ValueError if the rest is not a number.
	num = int(key[6:])
	if value in GOdict:
		annotated.append((num, key, GOdict[value]))
	else:
#		annotated.append((num,key,['Nogo']))
		nogo += 1
annotated.sort()  # numeric order by contig number

# Parenthesised single-argument form prints identically on Python 2 and 3.
print('Contigs with no GO annotation: ' + str(nogo))

for num, key, terms in annotated:
	# One tab-delimited line per contig: name, name, comma-joined GO terms.
	OUT.write(str(key) + '\t' + str(key) + '\t' + ','.join(terms) + '\n')

OUT.close()