#!/usr/bin/env python
import urllib
import sys
import os

##### totalannotation.py by DJ Barshis
##### This script takes an input fasta file of sequence names and sequences, plus blast results files of blasts against
##### nr (parsed .txt with 1 hit per line) and the SwissProt and TrEMBL (in -outfmt 7) UniProt databases.
##### It downloads the corresponding UniProt flat files from the www.uniprot.org web server,
##### extracts particular annotation information from the nr blast and each UniProt flat file, and combines it all into a meta-annotation table.
##### You will need to create a 2-line .txt file whose first line has the names of the particular columns you would like to extract from the
##### nr parsed blast file, separated by tabs (these files can be large, so I suggest extracting the header using head or less in the terminal).
##### The second line consists of the "bad words" you want to skip over in your nr results, separated by tabs.
##### I usually use "predicted PREDICTED hypothetical unknown" or some combination thereof.
# usage: totalannotation.py YOUR_contigs.fasta BLASTx2nr.txt nrcolumnheadersandbadwords.txt BLASTx2Sprot.txt BLASTx2TrEMBL.txt evaluethreshold directoryforflatfiles(no slashes) outtablename.txt

# this is for setting how the script sorts your contigs into order
# change the word to 'text' for a text-based sorting or 'coral' for a
# palumbi-lab coral-specific numerical sorting
textorcoralsort = 'text'

# innames, inseqs = read_fasta_lists(sys.argv[1])
# sys.argv[2] = BLASTx2nr.txt
# sys.argv[3] = thingsfornr.txt
# uniprotIDs = read_uniprot(sys.argv[4], sys.argv[5])
evalue = float(sys.argv[6])
directory = sys.argv[7]  # name only, no /'s
# o = open(str(sys.argv[8]), 'w')  # new data table file name (opened later)

##### This reads in a fasta file and builds a dictionary with sequence names as keys and sequence lengths as values
def read_fasta_dict(file):
    fin = open(file, 'r')
    filelines = fin.readlines()
    filelines.append('EOF')  # sentinel so the final sequence gets recorded
    count = 0
    names = {}
    for line in filelines:
        if line == 'EOF' and count >= 1:
            names[cols[0]] = '%i' % (len(seq))  # record the last sequence
        line = line.strip()
        if line and line[0] == '>':  # indicates the name of the sequence
            if count >= 1:
                names[cols[0]] = '%i' % (len(seq))  # record the previous sequence
            count += 1
            line = line[1:]
            cols = line.split(' ')
            seq = ''
        else:
            seq += line
    fin.close()
    return names

innames = read_fasta_dict(sys.argv[1])
print 'Read in fasta of %i sequences: ...' % (len(innames.keys()))

#### This function reads in a parsed (every hit on one line) nr blast file, extracts certain columns, and returns a dictionary
def nr_dict(file, colstoextract):
    fin = open(file, 'r')  # open input file
    cols2extract = open(colstoextract, 'r')
    d = {}
    headers = ''
    contig = ''
    linenum = 0
    goodnrhits = 0
    for line in fin:
        linenum += 1
        line = line.rstrip()
        cols = line.split('\t')
        if linenum == 1:
            headers = line  # used to copy the header to new files
            # this loop extracts the column indexes for the column names specified on the first line of the stufffornr.txt file
            extractlinecount = 0
            for aline in cols2extract:
                extractlinecount += 1
                if extractlinecount == 1:
                    aline = aline.rstrip()
                    words = aline.split('\t')
                    hitdescription = cols.index(words[0])
                    nrEval = cols.index(words[1])
        if linenum > 1:
            cols[0] = cols[0].split(' ')[0]
            if cols[0] == contig:
                # additional hit for a contig whose top hit already passed the e-value threshold
                d[cols[0]].append('%s\t%s' % (cols[hitdescription], cols[nrEval]))
            else:
                if float(cols[nrEval]) <= evalue:
                    goodnrhits += 1
                    contig = cols[0]
                    d[cols[0]] = d.get(cols[0], [])
                    d[cols[0]].append('%s\t%s' % (cols[hitdescription], cols[nrEval]))
    fin.close()
    cols2extract.close()
    return headers, d, goodnrhits

headers, d, goodnrhits = nr_dict(sys.argv[2], sys.argv[3])
print "Read in nr blast..."
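# For reference, the first line of the stufffornr.txt file might look like this
# (tab-separated, naming two columns that must exist in your parsed nr blast
# header; the names here are hypothetical examples):
# hitDescription	hitEvalue
# and d now holds one 'description\tevalue' string per nr hit for each contig
# that passed the e-value threshold, e.g. (made-up entry):
# d['contig00001'] == ['heat shock protein 70 [Homo sapiens]\t1e-50', ...]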
print '%s%i' % ('Number of good nr matches: ', goodnrhits)
print '%s%i' % ('Number not matched in nr: ', len(innames.keys()) - goodnrhits)
print "Searching for badwords..."

###### This function splits the nr hits for a contig into hits that do and do not contain badwords (e.g. 'PREDICTED', 'hypothetical', etc.)
def parse_badwords(value, badwords):
    onlybad = 0
    madegood = 0
    badhits = []
    goodhits = []
    tophit = value[0]
    for each in value:
        numbadhits = 0
        for item in badwords:
            if item in each:
                numbadhits += 1
        if numbadhits >= 1:
            badhits.append(each)
        if numbadhits == 0:
            goodhits.append(each)
    if len(goodhits) == 0:
        onlybad += 1
    if len(goodhits) >= 1:
        madegood += 1
    goodhits += badhits  # badword hits go to the back of the line
    return tophit, goodhits, onlybad, madegood

# reading in the list of badwords from the second line of stufffornr.txt
badwordlist = []
badwordfile = open(sys.argv[3], 'r')
badwordline = 0
for line in badwordfile:
    badwordline += 1
    if badwordline == 2:
        line = line.rstrip()
        badwordlist = line.split('\t')

onlybadnrs = 0
madegoodnrs = 0
#### this step loops through the entries in your contig dictionary
#### and calls the badword parser for each entry that has a match in the nr dictionary; it records the top hit and the top non-badword hit (if there is one)
for key, value in innames.items():
    if d.has_key(key):
        tophit, goodhits, onlybad, madegood = parse_badwords(d[key], badwordlist)
        innames[key] = '%s\t%s\t%s' % (innames[key], tophit, goodhits[0])
        onlybadnrs += onlybad
        madegoodnrs += madegood
    else:
        innames[key] += '\t%s\t%s\t%s\t%s' % ('No_sig_nr_hit', 'No_sig_nr_hit', 'No_sig_nr_hit', 'No_sig_nr_hit')

print '%s%i' % ('Number of nr hits with only a bad word hit: ', onlybadnrs)
print '%s%i' % ('Number of nr hits with a good word hit: ', madegoodnrs)

####### This function reads in the swissprot and trembl blast outputs and returns
####### a dictionary containing the top UniProt ID from swissprot (if available) or trembl (if no swissprot match was found)
def read_uniprot(sprotfile, tremblfile):
    queryname = ''
    uniprotIDs = {}
    uniqueprotIDs = {}
    sprotmatch = 0
    tremblpeats = 0
    tremblmatch = 0
    sprot = open(sprotfile, 'r')
    trembl = open(tremblfile, 'r')
    for line in sprot:
        line = line.rstrip()
        if line == '' or line[0] == '#':  # skip blank and comment lines
            continue
        else:
            cols = line.split('\t')
            if cols[0] == queryname:  # only the first (top) hit per query is kept
                continue
            else:
#                if float(cols[10]) <= evalue and cols[1].split('|')[2].split('_')[1] != 'NEMVE':  # for parsing based on threshold value and excluding hits to Nematostella
                if float(cols[10]) <= evalue:  # for parsing based on threshold value only
                    ID = cols[1].split('|')
                    uniprotIDs[cols[0]] = uniprotIDs.get(cols[0], [])
                    uniprotIDs[cols[0]].append(ID[1])
                    if innames.has_key(cols[0]):
                        sprotmatch += 1
                        innames[cols[0]] += '\t%s\t%s\t%s' % (ID[1], cols[2], cols[10])
                        queryname = cols[0]
                        if uniqueprotIDs.has_key(ID[1]):
                            continue
                        else:
                            uniqueprotIDs[uniprotIDs[cols[0]][0]] = ''
    print 'Read in swissprot blast ...'
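    # For reference, a tabular (-outfmt 7) hit line looks roughly like this
    # (all values are made-up examples); cols[1] is the subject field, so
    # ID[1] above pulls out the UniProt accession used to fetch flat files later:
    # contig00001	sp|P08107|HSP71_HUMAN	85.3	212	31	0	1	636	1	212	3e-105	310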
    print '%s%i' % ('Number of good swissprot matches: ', sprotmatch)
    for line in trembl:
        line = line.rstrip()
        if line == '' or line[0] == '#':  # skip blank and comment lines
            continue
        else:
            cols = line.split('\t')
            if cols[0] == queryname:
                continue
            else:
#                if float(cols[10]) <= evalue and cols[1].split('|')[2].split('_')[1] != 'NEMVE':  # for parsing based on threshold value and excluding hits to Nematostella
                if float(cols[10]) <= evalue:  # for parsing based on threshold value only
                    ID = cols[1].split('|')
                    if uniprotIDs.has_key(cols[0]):
                        # contig already has a swissprot match; just record the trembl ID
                        uniprotIDs[cols[0]].append(ID[1])
                        queryname = cols[0]
                        tremblpeats += 1
                    else:
                        uniprotIDs[cols[0]] = uniprotIDs.get(cols[0], [])
                        uniprotIDs[cols[0]].append(ID[1])
                        if innames.has_key(cols[0]):
                            innames[cols[0]] += '\t%s\t%s\t%s' % (ID[1], cols[2], cols[10])
                            queryname = cols[0]
                            tremblmatch += 1
                            if uniqueprotIDs.has_key(uniprotIDs[cols[0]][0]):
                                continue
                            else:
                                uniqueprotIDs[uniprotIDs[cols[0]][0]] = ''
    print 'Read in TrEMBL blast ...'
    print '%s%i' % ('Number of repeat matches from TrEMBL: ', tremblpeats)
    print '%s%i' % ('Number of additional good matches from TrEMBL: ', tremblmatch)
    print '%s%i' % ('flatfilesneeded: ', len(uniqueprotIDs.keys()))
    return uniprotIDs, uniqueprotIDs

# this line calls the uniprot reading function
uniprotIDs, uniquesforflats = read_uniprot(sys.argv[4], sys.argv[5])

print 'downloading flat files ...'
# this loop downloads a uniprot flat file for each unique uniprotID parsed from the blast results
if not os.path.exists('./' + directory):  # make sure the download directory exists before fetching into it
    os.mkdir('./' + directory)
for key, value in uniquesforflats.items():
    if os.path.exists('./' + directory + '/' + key + '.txt'):  # skip files already downloaded; thanks JTL for this addition!
        continue
    else:
        urllib.urlretrieve('http://www.uniprot.org/uniprot/' + key + '.txt', './' + directory + '/' + key + '.txt')

print 'extracting relevant info from flat files ...'
print 'don\'t worry, this takes a while ...'

######## this function extracts the relevant information from each individual flat file
def extractGO(contigname):
    if uniprotIDs.has_key(contigname):
        flatfile = open('./' + directory + '/' + uniprotIDs[contigname][0] + '.txt', 'r')
        ID = 'No_ID'
        DE = 'No_description'
        description = 0
        KEGG = 'No_KEGG'
        KEGGKO = 'No_KEGGKO'
        GOcodes = ''
        GOBP = ''
        GOMF = ''
        GOCC = ''
        keywords = ''
        for line in flatfile:
            line = line.rstrip()
            if line[0:2] == 'ID':
                line = line.split(' ')
                ID = line[3]
            if line[0:2] == 'DE' and description == 0:  # only the first DE (RecName) line is kept
                line = line.split('=')
                DE = line[1][:-1]
                description += 1
            if line[0:2] == 'DR':
                if line[5:9] == 'KEGG':
                    line = line.split(';')
                    KEGG = line[1].strip()
                elif line[5:7] == 'KO':
                    line = line.split(';')
                    KEGGKO = line[1].strip()
                elif line[5:7] == 'GO':
                    line = line.split(';')
                    if GOcodes == '':
                        GOcodes += '%s' % (line[1].strip())
                    else:
                        GOcodes += ' // %s' % (line[1].strip())
                    if line[2].strip().split(':')[0] == 'C':
                        GOCC += '%s (%s);' % (line[2].strip().split(':')[1], line[1].strip())
                    if line[2].strip().split(':')[0] == 'P':
                        GOBP += '%s (%s);' % (line[2].strip().split(':')[1], line[1].strip())
                    if line[2].strip().split(':')[0] == 'F':
                        GOMF += '%s (%s);' % (line[2].strip().split(':')[1], line[1].strip())
            if line[0:2] == 'KW':
                line = line[2:].split(';')
                for item in line:
                    if item == '':
                        continue
                    else:
                        keywords += '%s;' % (item.strip())
        flatfile.close()
        if GOcodes == '':
            GOcodes = 'No_GOcodes'
        if GOBP == '':
            GOBP = 'No_GOBP'
        if GOMF == '':
            GOMF = 'No_GOMF'
        if GOCC == '':
            GOCC = 'No_GOCC'
        if keywords == '':
            keywords = 'No_keywords'
        # column order matches the header written below: ID, Description, KEGG, KEGGKO, GO, GOCC, GOBP, GOMF, Keywords
        outstring = '\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (ID, DE, KEGG, KEGGKO, GOcodes, GOCC, GOBP, GOMF, keywords)
        nomatch = 0
    else:
        nomatch = 1
        outstring = '\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('No_Uniprotmatch', 'No_%_identity', 'No_evalue', 'No_ID', 'No_Description', 'No_KEGG', 'No_KEGGKO', 'No_GO', 'No_GOCC', 'No_GOBP', 'No_GOMF', 'No_keywords')
    return outstring, nomatch

notmatched = 0
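# For reference, the flat-file lines extractGO parses above look roughly like
# the following (an abbreviated example record; values are illustrative only):
# ID   HSP71_HUMAN             Reviewed;         641 AA.
# DE   RecName: Full=Heat shock 70 kDa protein 1A;
# DR   KEGG; hsa:3303; -.
# DR   KO; K03283; -.
# DR   GO; GO:0005524; F:ATP binding; IEA:InterPro.
# KW   ATP-binding; Chaperone; Cytoplasm.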
extractingcounter = 0
##### This loop calls the extraction function for each contig; contigs without a uniprot match get placeholder columns
for key, value in innames.items():
    extractingcounter += 1
    outstring, nomatch = extractGO(key)
    innames[key] += outstring
    notmatched += nomatch

o = open(str(sys.argv[8]), 'w')  # new data table file name
o.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('ContigName', 'ContigLength', 'topnrMatch', 'topnrEvalue', 'nobadwordnrMatch', 'nobadwordnrEvalue', 'Uniprotmatch', '%_identity', 'evalue', 'ID', 'Description', 'KEGG', 'KEGGKO', 'GO', 'GOCC', 'GOBP', 'GOMF', 'Keywords'))  # used if you want a specific header and filename for each file

print '%s%i' % ('Hits not matched in sprot: ', notmatched)
print 'compiling extracted information ...'

############ this is for sorting your contigs based on text order #############
if textorcoralsort == 'text':
    l = []
    for key, value in innames.items():
        l.append((key, value))
    l.sort()
    for item in l:
        o.write('%s\t%s\n' % (item[0], item[1]))  # writes each tuple as a separate tab-delimited line
    o.close()

############# this is for sorting your contigs based on our coral-specific contig names ##############
if textorcoralsort == 'coral':
    l = []
    joinedcontigcounter = 600247
    for key, value in innames.items():
        name = key.split(' ')
        if name[0][0:6] == 'contig':
            newname = name[0].split('_')
            if len(newname) == 1:
                num = int(newname[0][6:])
            if len(newname) > 1:
                joinedcontigcounter += 1
                num = joinedcontigcounter
        if name[0][0:6] == 'c_sym_':
            newname = name[0].split('_')
            num = 700000 + int(newname[2])
        if name[0][0:6] == 'd_sym_':
            newname = name[0].split('_')
            num = 900000 + int(newname[2])
        l.append((num, key, value))
    l.sort()
    for item in l:
        o.write('%s\t%s\n' % (item[1], item[2]))  # writes each tuple as a separate tab-delimited line
    o.close()
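# For reference, the coral-specific numbering above maps contig names like
# these (hypothetical examples) onto sortable integers:
#   contig1234       -> 1234
#   contig1234_1235  -> the next joined-contig number, counted up from 600247
#   c_sym_56         -> 700056
#   d_sym_78         -> 900078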