#!/usr/bin/env python #----------------USAGE----------------------- #by Dan Barshis #This script parses a .sam alignment file created using BWA and returns a tab-delimited text file with the #number of multiply aligned reads and good quality (above a certain mapping quality threshold and read length) singly aligned #reads for every contig/gene in the reference sequence database. #The input text for the command line is: #countxpression.py mapqualitythreshold, lengththreshold, outputstatsfilename, anynumberofinputfiles ... def countxpression(infilename, threshold, lengththresh, zeroforheader1forno, tosort, outfilename, countsoutputname): #Opens an infile specified by the user. Should be a .sam file IN = open(infilename, 'r') threshold=float(threshold) #inputs a mapping quality threshold for counting reads as sucessfully mapped lenthresh=float(lengththresh) #Opens an output text file as specified by user OUT = open(outfilename, 'w') countsout = open(countsoutputname, 'a') linecount=0 totreads=0 notaligned=0 multi=0 single=0 aligned=0 goodmaps=0 contigswzeros=0 goodmaps=0 coralgoodmaps=0 symgoodmaps=0 csymgoodmaps=0 dsymgoodmaps=0 #Starts a for loop to read through the infile line by line multicontigs={} contigs={} for line in IN: line=line.rstrip() cols=line.split('\t') if cols[0]=="@SQ": #Generate dictionary of contig names contigname=cols[1].split(':')[1] contigs[contigname]=[0, 0, 0] if cols[0][0] != '@' : #to skip past header if 'RG:' in line: columntocount=11 #This column should contain "X0:1:number" else: columntocount=11 #This column should contain "X0:1:number" if float(cols[1]) == 4 or float(cols[1]) == 20: #this is to count reads that are flagged as unaligned notaligned+=1 totreads+=1 else: #this is to find reads that had >=1 alignment aligned+=1 totreads+=1 # if cols[11].split(':')[2] == 'R': if float(cols[columntocount].split(':')[2]) > 1: # for reads with >1 optimal hit multi+=1 contigs[cols[2]][1]+=1 #add a count to that contig for the 'top' hit for reads that optimally align to multiple contigs (all the other good hits are often not printed with current bwa settings) if float(cols[columntocount].split(':')[2]) == 1: # for reads with ==1 optimal hit # if cols[11].split(':')[2] == 'U': single+=1 if float(cols[4]) >= threshold and len(cols[9]) >= lenthresh: goodmaps+=1 contigs[cols[2]][0]+=1 if cols[2][0:6]=='contig': coralgoodmaps+=1 if cols[2][0:6]=='c_sym_': symgoodmaps+=1 csymgoodmaps+=1 if cols[2][0:6]=='d_sym_': symgoodmaps+=1 dsymgoodmaps+=1 for item in contigs: contigs[item][2]=contigs[item][0]+contigs[item][1] if contigs[item][0] == 0: contigswzeros+=1 if totreads > 0: contigsmatched=len(contigs.keys())-contigswzeros propqualaligned=float(goodmaps)/float(totreads) coralpropqualaligned=float(coralgoodmaps)/float(totreads) sympropqualaligned=float(symgoodmaps)/float(totreads) csympropqualaligned=float(csymgoodmaps)/float(totreads) dsympropqualaligned=float(dsymgoodmaps)/float(totreads) elif totreads == 0: contigsmatched=len(contigs.keys())-contigswzeros propqualaligned=0 coralpropqualaligned=0 sympropqualaligned=0 csympropqualaligned=0 dsympropqualaligned=0 if tosort=='coral': if int(zeroforheader1forno) == 0: countsout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Filename', 'Total#Reads', 'NumContigsMatched', 'NumUnaligned', 'NumAligned', 'NumMultiAligned', 'NumSingleAligned', 'NumQualSingles', 'PropQualAligned', 'CoralPropQualAligned','SymPropQualAligned','csymPropQualAligned','dsymPropQualAligned')) countsout.write('\n%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (infilename, totreads, contigsmatched, notaligned, aligned, multi, single, goodmaps, propqualaligned, coralpropqualaligned, sympropqualaligned, csympropqualaligned, dsympropqualaligned)) if tosort=='text': if int(zeroforheader1forno) == 0: countsout.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ('Filename', 'Total#Reads', 'NumContigsMatched', 'NumUnaligned', 'NumAligned', 'NumMultiAligned', 'NumSingleAligned', 'NumQualSingles', 'PropQualAligned')) countsout.write('\n%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%.3f' % (infilename, totreads, contigsmatched, notaligned, aligned, multi, single, goodmaps, propqualaligned)) OUT.write('ContigName'+'\t'+'UniqueTotReads'+'\t'+'MultiTotReads'+'\t'+'totalreadsgoodmapped') l=[] #This is for sorting the contigs into numerical order and is written specifically to handle a coral and symbiodinium mixed reference #you will need to modify this part based on your unique "gene" or "contig" names used in your reference assembly #this is for general sorting based on text sorting rules if tosort=='text': for key,value in contigs.items(): l.append((key,value)) l.sort() for item in l: # print item OUT.write('\n%s\t%d\t%d\t%d' % (item[0], item[1][0], item[1][1], item[1][2])) #writes each line of the tuple as separate tab delimited text OUT.close() #this is for sorting based on the specific naming scheme we use in the coral/sym reference set if tosort=='coral': joinedcontigcounter=600247 for key,value in contigs.items(): name=key.split(' ') if name[0][0:6]=='contig': newname=name[0].split('_') if len(newname)==1: num=int(newname[0][6:]) if len(newname)>1: joinedcontigcounter+=1 num=joinedcontigcounter if name[0][0:6]=='c_sym_': newname=name[0].split('_') num=700000+int(newname[2]) if name[0][0:6]=='d_sym_': newname=name[0].split('_') num=900000+int(newname[2]) l.append((num,key,value)) l.sort() for item in l: # print item OUT.write('\n%s\t%d\t%d\t%d' % (item[1], item[2][0], item[2][1], item[2][2])) #writes each line of the tuple as separate tab delimited text OUT.close() if __name__=="__main__": import sys #Usage countxpression(infilename, threshold, lengththreshold, zeroforheader1forno, tosort, outfilename, countsoutputname) filelist=sys.argv[4:] numfiles=0 for file in filelist: numfiles+=1 if numfiles==1: countxpression(file, sys.argv[1], sys.argv[2], 0, 'text', file[:-4]+'_counts.txt', sys.argv[3]) if numfiles>1: countxpression(file, sys.argv[1], sys.argv[2], 1, 'text', file[:-4]+'_counts.txt', sys.argv[3])