#!/usr/bin/env python ######################################################################################################################################### # # # Written by Jason Ladner # # questions?: jtladner@gmail.com # # Usage: # # python getgenosfromvcf_clean.py input_vcf_file output_file_name output_type genotype_quality_cutoff subset_contig_name_file # # # # input_vcf_file = the .vcf file that you want to extract genotype information from # # output_file_name = the name of the file that the genotype info will be written to # # it will be created if it doesn't exist, or written over if it does (*so be careful*) # # output_type = either 'cols' or 'rows' (enter on the command line without the quotes) # # 'cols' specifies that SNPs should be in columns with 2 columns per SNP (one for each allele) # # This is very similar to the input required by smartpca # # 'rows' specifies that SNPs should be in rows (one for each SNP),2 columns for each indiv (one for each allele) # # This is easier to import into excel to visualize the data # # genotype_quality_cutoff = a floating point number that specifies the minimum quality allowed by genotype information to be used # # This parameter is optional (default = 0), but must be specified if subset_contig_name_file is specified # # subset_contig_name_file = An optional file that if used should be a list of contig names, with one contig name per line # # This parameter can be used to specifiy a subset of contigs to extract genotype information from # # # ######################################################################################################################################### '''---------------------------> VCF Format <----------------------------- cols[0] = contig name cols[1] = SNP position cols[2] = ID???? - empty in my vcf files cols[3] = Reference base cols[4] = Alternative base cols[5] = SNP quality (Phred score) cols[6] = filter flags cols[7] = SNP Info cols[8] = genotype format (e.g., GT:AD:DP:GQ:PL) ### This is different for some!!!!!!!! It can even be different for different SNPs in the same .vcf cols[9:] = individual genotypes''' import sys #-------------------------------------------------------------# # This function takes a file and creates a dictionary with: # # -one entry for each line # # -keys will be first column of each line # # -values will be the complete line # #-------------------------------------------------------------# def file_dict(file): fin = open(file, 'r') d={} for line in fin: line=line.rstrip() cols=line.split() d[cols[0]]=line fin.close() return d #-------------------------------------------------------------# # This function takes the genotype description for a SNP # # (e.g., GT:AD:DP:GQ:PL) # # and returns the position of the genotype quality (GQ) # # with the first position being 0 # #-------------------------------------------------------------# def get_GQ_pos(genotype_description): return genotype_description.split(':').index('GQ') #-------------------------------------------------------------# # This function writes genotype data to an outfile # # with 2 columns per SNP (one for each allele) # # # # This is very similar to the input required by smartpca # #-------------------------------------------------------------# def write_snps_as_cols(outfile_name, headers, contigs, pos, ref, alt, all_gens): #Opens an output text file as specified by user OUT = open(outfile_name, 'w') OUT.write(headers[0] + '\t') for index in range(len(contigs)): OUT.write("%s\t\t" % (contigs[index])) #Can be easily changed so that the contig name is a combination of the contig_pos OUT.write('\n%s\t' % (headers[1])) #Positions for index in range(len(contigs)): OUT.write(pos[index] + '\t\t') OUT.write('\n%s\t' % (headers[2])) #Reference Alleles for index in range(len(contigs)): OUT.write("%s\t\t" % (ref[index])) OUT.write('\n%s\t' % (headers[3])) #Alternate alleles for index in range(len(contigs)): OUT.write('%s\t\t' % (alt[index])) indivcount=0 for indiv in headers[4:]: OUT.write('\n%s' % (indiv)) for index in range(len(contigs)): for each in all_gens[indivcount][index]: OUT.write('\t%s' % (each)) indivcount+=1 OUT.close() #-------------------------------------------------------------# # This function writes genotype data to an outfile # # with SNPs in rows (one for each SNP) # # 2 columns for each individual (one for each allele # # This is easier to import into excel to visualize the data # #-------------------------------------------------------------# def write_snps_as_rows(outfile_name, headers, contigs, pos, ref, alt, all_gens): #Opens an output text file as specified by user OUT = open(outfile_name, 'w') for item in headers[:4]: OUT.write('%s\t' % (item)) for item in headers[4:]: OUT.write('%s\t' % (item)) for index in range(len(contigs)): OUT.write('\n%s\t%s\t%s\t%s' % (contigs[index], pos[index], ref[index], alt[index])) for ind in all_gens: OUT.write('\t%s%s' % (ind[index][0], ind[index][1])) def extract_genos_from_vcf(vcf_file, outfile_name, output_type, quality_cutoff = 0, subset_contig_names = {"no_refs_given": ""}): #Opens an infile specified by the user. Should be a .vcf file fin = open(sys.argv[1], 'r') if type(subset_contig_names) == type(""): #Checks to see if a list of names was provided subset_contig_names = file_dict(subset_contig_names) #Should be a list of the contigs of interest, with one name on each line linecount=0 contigs=[] pos=[] ref=[] alt=[] all_gens=[] locus=[] headers=[] #Reads through the vcf file line by line for line in fin: cols=line.rstrip().split('\t') gens=[] #Will hold individual level genotype data, resets list form previous SNP gencount=0 #Will be used below to keep track of the individuals when appending new SNPs if cols[0][0] == '#' and cols[0][1] != '#': #To pull out headers headers=cols[0:2] + cols[3:5] + cols[9:] #Creates list of headers of interest print "Indivs with genotypes in vcf file: %s" % ("\t".join(headers[4:])) #Prints the IDs of the individuals with genotypes in the vcf if cols[0][0] != '#': #Specifies only SNP lines if subset_contig_names==None : linecount+=1 #Keeps track of the numbers of SNPs contigs.append(cols[0]) #Creates a list of all the contig names pos.append(cols[1]) #Creates a list of all the SNP base positions ref.append(cols[3]) #Creates a list of all the reference bases alt.append(cols[4]) #Creates a list of all the alternative bases genotype_description=cols[8] GQ_position = get_GQ_pos(genotype_description) gens=cols[9:] #Creates a list of all the genotype info if linecount==1: #will only execute these commands once, when reading the first line of the file for gen in gens: all_gens.append([]) #Creates seperate list entry for each individual, will hold genotypes at all SNPs locus.append([]) #Creates seperate list entry for each individual, will hold genoypes at a specific locus for gen in gens: #Steps through the genotype info for each individual parts=gen.split(':') #Breaks apart the different sections of the genotype info genotype=parts[0] #Most probable genotype for the individual, with the two alleles seperated by '/' genotypes=genotype.split('/') if genotypes != ['.','.']: #Had to be added because in new version of GATK, .vcf files are written so that if there is no genotype info for an individual, their genotype is './.' as opposed to './.:0:0:0:0', or whatever gen_quality=float(parts[GQ_position]) #May need to change based on the structure of your genotype data if gen_quality'cols-style output is what you need to go into smartpca, but it still needs to be *****tweaked to include population***** if output_type=='cols': #This will have the contig name, pos, ref and alt alleles listed as rows at the top and then each individual and all of its genotyoes listed in the rows that follow. There will be two columns of data for each SNP write_snps_as_cols(outfile_name, headers, contigs, pos, ref, alt, all_gens) if output_type=='rows': #This format has a row for each snp, it is easier to view and put into excel write_snps_as_rows(outfile_name, headers, contigs, pos, ref, alt, all_gens) ###------->>> if len(sys.argv)==4: #No quality cutoff or cubset of contig names provided extract_genos_from_vcf(sys.argv[1], sys.argv[2], sys.argv[3]) elif len(sys.argv)==5: extract_genos_from_vcf(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], subset_contig_names = None) elif len(sys.argv)==6: extract_genos_from_vcf(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]) else: if len(sys.argv) <4: print "You did not enter enough parameters" elif len(sys.argv) >6: print "You entered too many parameters"