#! /usr/bin/perl -w # # Script to post-process filtered SNP calls from Maq (maq.pl SNPfilter) for # *two* (parental) genotypes, each computed with respect to a common # pseudo-reference sequence. In the first development, our pseudo refseq is # assumed to be a unigene set assembled from clustered/assembled EST data and # so may contain unresolved bases (N's) # # ========================================================================= # This script should be considered "as is" and almost certainly contains # bugs, poor/inefficient coding and maybe some fundamental flaws. # ========================================================================= # # Copyright (C) 2008 Martin Trick # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # # Copy available at http://www.gnu.org/licences/lgpl.txt # # # Revision history # # v1,v2 - experimental versions # # v3 - modified, as we are now using Maq output that has operated # on raw reference seqs (i.e. 'N' uncertainty codes have been left in) # Martin.Trick@bbsrc.ac.uk,1/8/2008 # # v4 - now we access the Maq pileup file (containing raw read calls) for # the opposing genotype at candidate SNP positions - and use these to filter. # Changed to output SNP details in refseq:position x basecalls (0..1) format # Martin.Trick@bbsrc.ac.uk, 10/8/2008 # # NB. After experimenting with a single hash structure to contain all the SNP # data, reverted to original two hash structure which was more memory-efficient # but is slow # Martin.Trick@bbsrc.ac.uk, 13/8/2008 # # v5 - various bug fixes, new treatment of genotype-specific SNPs and now # re-calling of bases at SNP positions using the raw read data # Martin.Trick@bbsrc.ac.uk, 10/9/2008 # # v6 - use pileup -v output to directly access base qualities. Impose a base # quality threshold below which bases will not contribute to the mixtures # for the recalling of alleles. Made the depth cutoff a variable. # Martin.Trick@bbsrc.ac.uk, 17/9/2008 # # v7 - changed code to use number of (high-quality calls * noise factor) # rather than number of raw reads in some cutoffs. Classified reasons for # each SNP deletion and print these totals to STDOUT plus details to OUT if # DEBUG is on. # # Fixed a bad bug in SNP density histogram output code. I thinks it's OK now. # Martin.Trick@bbsrc.ac.uk, 27/9/2008 # # Fixed a bug in hemi-SNP counting # Martin.Trick@bbsrc.ac.uk, 13/11/2008 # =============================================================================== # Let's see some progress in the terminal window as this is a slow process! use FileHandle; STDOUT->autoflush(1); # Check arguments (none too sophisticated, so fragile to user error here) unless (@ARGV == 6) { die "Usage: SNP_parser.pl SNP_file_1 SNP_file_2 pileup_file_1 pileup_file_2 output_file read-depth\n"; } # Open output file for dumping out details of SNPs open (OUT, ">$ARGV[4]") or die "Couldn't open output file ($!)\n"; # DEBUG level for verbosity of output to OUT my $DEBUG = 0; # Various heuristic parameters my $base_quality = 20; # Routinely gettting > Q20 at base 80 my $depth_cutoff = $ARGV[5] || 8; my $noise_factor = 0.05; # Various counters for the statistics my $recall = 0; my $asym_N_corrs = 0; my $asym_corrs = 0; my $same_SNPs = 0; # Hash to record eliminations of raw SNP calls, keyed by reasons (1) to (6) my %delete = (); # Hash to store the raw reference seqs my %seq = (); #my $ref = '/illumina/jw/TIGR_Triticum_aestivum.mfa'; # Cristobal's experiment #my $ref = '/usr/users/cbu/trick/diversity/ref.fasta'; #my $ref = '/tgac/nextgen6/oleracea_scaffolds.fasta'; #my $ref = '/tgac/nextgen6/jw/refs/Ta.seq.uniq'; #my $ref = '/tgac/nextgen6/ref.fasta'; #my $ref = '/tgac/nextgen6/jw/refs/TaRFL6162.fas'; my $ref = '/tgac/nextgen6/cristobal_rnaseq/Riken_prime/Riken_prime.fa'; # Hash to store our own calls at SNP positions calculated from raw base mixtures my %corrected_calls = (); # Hash for IUB uncertainty codes (and its inverse for reverse lookups) my %iub = ( "M" => "AC", "K" => "GT", "Y" => "CT", "R" => "AG", "W" => "AT", "S" => "CG", "D" => "AGT", "B" => "CGT", "H" => "ACT", "V" => "ACG", "N" => "ACGT" ); my %reverse = reverse %iub; # Load the refseqs into a hash of arrays (for statistics only) print "Storing reference sequences... "; open (REF, "<$ref") or die "Couldn't open ref seq file ($!)\n";; my $id; while () { if (/^>([\w\#\|]+)\s*?.*?\n/) { $id = $1; next; } chomp; my @bases = split //, $_; push @{$seq{$id}},@bases; } close REF; print "done\n"; # With big $DEBUG this code block will produce the length distribution of # refseqs for analysis (a minority interest) if ($DEBUG > 10) { foreach my $key(keys %seq) { my $length = scalar(@{$seq{$key}}); $dist{$length}++; } print "Length distribution of pseudo refseqs\n"; foreach my $len(sort {$a <=> $b} keys %dist) { print "$len\t$dist{$len}\n"; } } # We will use numerical references to the hashes of the parental SNPs my @hrefs = (0,1); # Read in the filtered SNP data, using references to hashes, # (0,1). These are keyed by refseq ID and base position print "Reading in SNP data... "; my $snp = 0; foreach my $href(0..1) { open (*FH, "<$ARGV[$href]") or die "Couldn't open input file $ARGV[$href]($!)\n"; while () { my @cols = split; push @{$$href{$cols[0]}{$cols[1]}},@cols; $snp++; } close FH; } print "done\n"; # Now compare with the raw pileups refs (2,3) & delete any SNP whose opposing # genotype data has poor read depth or contains high-quality calls of same base(s) print "Comparing SNP calls with opposing raw reads\n"; foreach my $href(2..3) { open (*FH, "<$ARGV[$href]") or die "Couldn't open input file $ARGV[$href]($!)\n"; # Pair up SNP data with opposing raw data, i.e. (2=>1; 3=>0) my $that_snp_ref = abs($href-3); my $this_snp_ref = ($that_snp_ref == 0) ? 1 : 0; print "Genotype $that_snp_ref vs Genotype $this_snp_ref... "; while () { my ($seq,$pos,$reference,$depth,$reads,$qualities) = split; #### debug - to examine a given SNP position in utter detail # (Distinct from general $DEBUG level) #my $debug = 1 if ($seq eq 'JCVI_38860' && $pos == 417); #### # Bilaterally delete SNP calls if opposing base has poor read depth # This check used to come later - but it should be the primary criterion # when we have rather shallow datasets if ($depth < $depth_cutoff) { if (exists $$that_snp_ref{$seq}{$pos}) { delete $$that_snp_ref{$seq}{$pos}; $delete{1}++; print OUT "Delete (1) - $seq:$pos\n" if $DEBUG; } if (exists $$this_snp_ref{$seq}{$pos}) { delete $$this_snp_ref{$seq}{$pos}; $delete{1}++; } next; } next unless (exists $$that_snp_ref{$seq}{$pos} || exists $$this_snp_ref{$seq}{$pos}); # Leave in any SNP calls at 'N' bases in ref seq at this stage next if ($reference eq 'N'); # Skip over agreed corrections to ref seq, otherwise they'd be discarded here if (exists $$this_snp_ref{$seq}{$pos} && exists $$that_snp_ref{$seq}{$pos} && ($$that_snp_ref{$seq}{$pos}[3] eq $$this_snp_ref{$seq}{$pos}[3])) { if ($$this_snp_ref{$seq}{$pos}[3] !~ /[AGCT]/ ) { $ambiguity_corrections++; print OUT "(Same ambiguity SNP)\t$seq:$pos\t[$reference]\t$$this_snp_ref{$seq}{$pos}[3]\t$$that_snp_ref{$seq}{$pos}[3]\n" if $DEBUG; } next; } # Else, we evaluate the reads and qualities over this position my %calls = (); my $hq_calls = 0; my @reads = split //,$reads; # Get the corresponding base qualities and transform to Phred scores my @qualities = split //,$qualities; foreach my $q (1 .. $#qualities) { # Skips the '@' character $qualities[$q] = ord($qualities[$q]) - 33; } foreach my $r(1 .. $#reads) { $reads[$r] =~ s/[.,]/$reference/; $reads[$r] =~ tr/acgt/ACGT/; # We're ignoring the use of case in pileup output # Count the high quality calls only $calls{$reads[$r]}++ if ($qualities[$r] > $base_quality); $hq_calls++ if ($qualities[$r] > $base_quality); print "Read $reads[$r]\tQuality $qualities[$r]\n" if $debug; } # If there is an opposing SNP called at this position, do some analysis if (exists $$that_snp_ref{$seq}{$pos}) { # Expand any uncertainty code called by Maq as a SNP into a small array push my @expanded_snp, ($$that_snp_ref{$seq}{$pos}[3] =~ /[ACGT]/) ? $$that_snp_ref{$seq}{$pos}[3] : split //, $iub{$$that_snp_ref{$seq}{$pos}[3]}; my $match = 0; foreach my $base(@expanded_snp) { $match++ if (exists $calls{$base}); # Aggressive, could use a ratio? } print "\n Match $match, keys calls " . scalar (keys %calls) . "\n" if $debug; # This is a fairly straightforward case to eliminate - bilaterally delete # calls if high quality base mixture is same as that of the expanded SNP call if ($match == 2) { delete $$that_snp_ref{$seq}{$pos}; $delete{2}++; if (exists $$this_snp_ref{$seq}{$pos}) { delete $$this_snp_ref{$seq}{$pos}; $delete{2}++; } print OUT "Delete (2) - $seq:$pos\n" if $DEBUG; next; } if ($debug) { print "Matching with " . scalar(@expanded_snp) . " bases, match is $match, $expanded_snp[0] calls $calls{$expanded_snp[0]} from total of $hq_calls quality calls\n" } # Delete if almost all calls match the opposing SNP call (very rare # condition?). These instances form a subclass of the "corrections" # (characterised by asymmetric SNP calls) if ($match == 1 && @expanded_snp == 1) { if ($calls{$expanded_snp[0]} >= ($hq_calls*(1-$noise_factor))){ delete $$that_snp_ref{$seq}{$pos}; $delete{3}++; ($reference eq 'N') ? $asym_N_corrs++ : $asym_corrs++; print OUT "(Correction) - $seq:$pos\t[$seq{$seq}[$pos-1]]\t$expanded_snp[0]\n" if $DEBUG; next; } # Delete if there's a more complex (>2) mixture of high quality calls if ($match == 1 && @expanded_snp == 1 && keys %calls > 2 ) { delete $$that_snp_ref{$seq}{$pos}; $delete{4}++; print OUT "Delete (4) - $seq:$pos\n" if $DEBUG; next; } # This is debatable - eliminate extreme minority calls if ($calls{$expanded_snp[0]} < ($hq_calls*$noise_factor)) { delete $$that_snp_ref{$seq}{$pos}; $delete{5}++; print OUT "Delete (5) - $seq:$pos\n" if $DEBUG; next; } } # These instances should be stone-cold, simple SNPs that are worth # remembering for purely illustrative purposes if ($match == 0 && keys %calls == 1 && !exists $calls{$reference} ) { print OUT "(Simple SNP) - $seq:$pos\n" if $DEBUG; next; } } # Irrespective of whether an opposing SNP is called, do a recall at this pos. # If we have reason, replace Maq's ambiguous or resolved SNP calls with # our codes into a separate hash (which will be referred to later) next if (exists $this_snp_ref{$seq}{$pos} && $this_snp_ref{$seq}{$pos}[3] !~ /[ACGT]/); # Ambiguity already called my @mixture = (); my $mixture = ''; next unless (keys %calls); # No quality calls at this position? foreach my $call (keys %calls) { next if ($calls{$call} < ($hq_calls*$noise_factor)); # Contentious? push @mixture, $call; } @mixture = sort {$a cmp $b} @mixture; print "Mixture is @mixture , number of call types " . scalar(keys %calls) . " \n" if $debug; $mixture = join '', @mixture; # We may be recalling where Maq SNP_filter has failed to call $corrections{$this_snp_ref}{$seq}{$pos} = (@mixture > 1) ? $reverse{$mixture}: $mixture[0]; if ($debug) { print "Correcting @mixture to "; (@mixture > 1) ? print "$reverse{$mixture}\n" : print "$mixture[0]\n"; } # Why is this counter not working? $recall++ if (exists $this_snp_ref{$seq}{$pos} && ($corrections{$this_snp_ref}{$seq}{$pos} ne $this_snp_ref{$seq}{$pos}[3])); } close FH; print "done\n"; } # A second pass to identify asymmetric SNPs removed through our recalling # method overriding Maq's original calls print "Cleaning out SNP losses due to re-calling... "; foreach my $this(@hrefs) { my $that = ($this == 0) ? 1:0; foreach my $seq(sort keys %$this) { foreach my $pos(sort {$a <=> $b} keys %{$$this{$seq}}) { if (exists $corrections{$this}{$seq}{$pos} && exists $corrections{$that}{$seq}{$pos} && ($corrections{$this}{$seq}{$pos} eq $corrections{$that}{$seq}{$pos})) { delete $$this{$seq}{$pos}; delete $$that{$seq}{$pos}; $delete{6}++; print OUT "Delete (6) - $seq:$pos\n" if $DEBUG; } elsif (exists $corrections{$this}{$seq}{$pos} && exists $$that{$seq}{$pos} && ($corrections{$this}{$seq}{$pos} eq $$that{$seq}{$pos}[3])) { delete $$that{$seq}{$pos}; $delete{6}++; print OUT "Delete (6) - $seq:$pos\n" if $DEBUG; } } } } print "done\n"; # Assemble a hash to record the union of the two modified SNP datasets with # counters keyed by {$seq}{$pos} my %union = (); # Initialise scalar counters for various classes of instance my $specifics = $differences = $corrections = $N_corrections = $hemi = 0; print "Extracting high-quality SNPs... "; # Find genotype-specific SNP positions & also different calls at shared positions foreach my $this(@hrefs) { my $that = ($this == 0) ? 1:0; foreach my $seq(sort keys %$this) { foreach my $pos(sort {$a <=> $b} keys %{$$this{$seq}}) { $union{$seq}{$pos}++ unless (exists $$that{$seq}{$pos} || $$this{$seq}{$pos}[2] eq 'N'); $union{$seq}{$pos}++ if (exists $$that{$seq}{$pos} && $$this{$seq}{$pos}[3] ne $$that{$seq}{$pos}[3] && $$this{$seq}{$pos}[2] =~ /[ACGT]/ ); if (exists $$that{$seq}{$pos} && $$this{$seq}{$pos}[3] eq $$that{$seq}{$pos}[3] && $$this{$seq}{$pos}[3] =~ /[ACGT]/) { $corrections++ if ($$this{$seq}{$pos}[2] =~ /[ACGT]/); $N_corrections++ if ($$this{$seq}{$pos}[2] eq 'N'); print OUT "(Correction) - $seq:$pos\t[$seq{$seq}[$pos-1]]\t$$this{$seq}{$pos}[3]\t$$that{$seq}{$pos}[3]\n" if $DEBUG; } } } } print "done\n" ; my %seen = (); # Traverse the union hash to assemble statistics and print SNPs foreach my $seq(sort keys %union) { foreach my $pos(sort {$a <=> $b} keys %{$union{$seq}}) { $specifics++ if ($union{$seq}{$pos} == 1); $differences++ if ($union{$seq}{$pos} == 2); if ($union{$seq}{$pos} >= 1) { print OUT "$seq:$pos\t"; foreach my $this(@hrefs) { my $that = ($this == 0) ? 1:0; if (exists $corrections{$this}{$seq}{$pos}) { print OUT "$corrections{$this}{$seq}{$pos}\t"; if ($corrections{$this}{$seq}{$pos} !~ /[ACGT]/) { $hemi++ unless $seen{$seq}{$pos}++; } } elsif (exists $$this{$seq}{$pos}) { print OUT "$$this{$seq}{$pos}[3]\t"; if ($$this{$seq}{$pos}[3] !~ /[ACGT]/) { $hemi++ unless $seen{$seq}{$pos}++; } } else { print OUT "$$that{$seq}{$pos}[2]\t"; if ($$that{$seq}{$pos}[3] !~ /[ACGT]/) { $hemi++ unless $seen{$seq}{$pos}++; } } } print OUT "\n"; } # Extract the read depth for genotype-specific SNPs here if ($union{$seq}{$pos} == 1) { foreach my $this(@hrefs) { if (exists $$this{$seq}{$pos}) { $depth{$$this{$seq}{$pos}[5]}++; # For interest, print IDs of the high-abundance transcripts #print "$seq\n" if ($$this{$seq}{$pos}[5] == 255 && !$seen{$seq}++); # Reported SNPs at max read depths } } } } push @densities,(scalar(keys %{$union{$seq}})/($#{$seq{$seq}}+1))*1000; # SNPs/kb } # Frequency analysis of SNPs by read depth (integers, so easy) print OUT "\nFrequency distribution of SNP read-depths\n"; print OUT "Depth\tNumber\n"; foreach my $depth(sort {$a <=> $b} keys %depth) { print OUT "$depth\t$depth{$depth}\n"; } # Frequency analysis of EST refseqs by SNP densities # (distribution of real numbers binned into integer classes) my %frequency = (); my @sorted = sort{ $a <=> $b} @densities; foreach my $density(@sorted) { #print "$density\n"; $frequency{int($density)}++; } print OUT "\nFrequency distribution of SNP densities\n"; print OUT "SNPs/kb\tNumber\n"; foreach my $bin (sort {$a <=> $b} keys %frequency) { print OUT "$bin\t$frequency{$bin}\n" } # Print summary statistics to STDOUT; print "\nStatistics: read-depth cutoff $depth_cutoff, base-quality theshold $base_quality\n"; foreach my $type(sort {$a <=> $b}keys %delete) { $delete += $delete{$type}; print "SNP deletions (type $type) - $delete{$type}; "; } print "\n$delete of $snp raw SNP calls deleted\n"; print "" . scalar(keys %union) . " ref seqs show >= 1 SNP\n"; print "SNP densities range from $sorted[0] to $sorted[$#sorted]\n"; print "$specifics genotype-specific (unshared) SNPs\n"; print "$differences polymorphic (shared) SNPs\n"; print "Of ". ($specifics+$differences) ." SNPs $hemi are hemi-SNPs\n"; print "$recall re-calls of Maq consensus/SNP bases\n"; print "Ref seq corrections:\n"; print "" . (($corrections/2) + $asym_corrs) . " concerted corrections to resolved ref bases\n"; print "" . (($N_corrections/2) + $asym_N_corrs) . " concerted corrections to unresolved ref bases\n"; print "" . ($ambiguity_corrections/2) . " concerted corrections with ambiguity codes\n"; close OUT;