#! /usr/bin/perl
# Victor Amin 2010

# Input a SAMtools consensus pileup, output contiguous sequences in various
# formats.

# Contig calling algorithm adapted from samtools.pl, part of the SAMtools
# suite [1] at <http://samtools.sourceforge.net>

# 1. Li H.*, Handsaker B.*, Wysoker A., Fennell T., Ruan J., Homer N., Marth G.,
# Abecasis G., Durbin R. and 1000 Genome Project Data Processing Subgroup (2009)
# The Sequence alignment/map (SAM) format and SAMtools. Bioinformatics, 25,
# 2078-9. [PMID: 19505943]

use strict;
use warnings;

use Getopt::Std;
$Getopt::Std::STANDARD_HELP_VERSION = 1;

my %options = (d=>3, D=>1000, Q=>25, G=>25, l=>10);
getopts('p:d:D:Q:G:l:fqcbh', \%options);

if ($options{h}) {Getopt::Std->version_mess(); HELP_MESSAGE(\*STDERR)}

sub HELP_MESSAGE {
	my $fh = shift;
	print $fh "\nThis program converts contiguous sequences in a SAMtools consensus pileup to FASTA, BED, or coordinate table.\n";
	print $fh "\tOPTIONS:\n";
	print $fh "\t-p contig id prefix (default: contig)\n";
	print $fh "\t-f ouput to FASTA (default)\n";
	print $fh "\t-q output to FASTQ (NOT YET IMPLEMENTED)\n";
	print $fh "\t-b output to BED (CHROM   START   END   ID   AVG_DEPTH)\n";
	print $fh "\t-c output to coordinate table (ID,CHROM,POS,DEPTH)\n\n";
	
	print $fh "\t-d INT    minimum depth [3]\n";
    print $fh "\t-D INT    maximum depth [1000]\n";
    print $fh "\t-Q INT    min RMS mapQ [25]\n";
    print $fh "\t-G INT    minimum indel score [25]\n";
    print $fh "\t-l INT    indel filter winsize [10]\n\n";
	exit;
}

my $prefix = $options{p} ? $options{p} : "contig";
my $FASTA = $options{c} || $options{q} || $options{b} ? 0 : 1;
my $FASTQ = $options{q} ? $options{q} : 0;

my $_Q = $options{Q};
my $_d = $options{d};
my $_D = $options{D};
my $_G = $options{G};
my $_l = $options{l};

my $current_chrom = "";
my $start_pos = 0;
my $current_pos = 0;
my $current_contig = "";
my $current_contig_size = 0;
my $current_contig_read_bases = 0;
my $current_contig_consensus_quality = 0;
my @coverage = ();
my @gaps = ();
my $id = 0;
while (<>) {
	my ($chrom, $pos, $reference_base, $consensus_base, $consensus_quality, $snp_quality, $max_mapping_quality, $read_bases) = split;
	if ($current_pos+1 != $pos && $current_contig_size > 0) {	
		$id = &print_contig($id, $current_chrom, $start_pos, $current_pos, $current_contig_read_bases/$current_contig_size, $current_contig_consensus_quality/$current_contig_size, \$current_contig, \@coverage, \@gaps, \%options);
		$current_chrom = $chrom;
		$start_pos = $pos;
		$current_pos = $pos;
		$current_contig = ($max_mapping_quality >= $_Q && $read_bases >= $_d && $read_bases <= $_D) ? uc($consensus_base) : lc($consensus_base);
		$current_contig_size = 1;
		$current_contig_read_bases = $read_bases;
		$current_contig_consensus_quality = $consensus_quality;
		@coverage = ();
		push @coverage, $read_bases;
		@gaps = ();
	} else {
		if ($reference_base eq '*') {
			push(@gaps, $pos) if ($snp_quality >= $_G);
		} else {
			$current_contig .= ($max_mapping_quality >= $_Q && $read_bases >= $_d && $read_bases <= $_D) ? uc($consensus_base) : lc($consensus_base);
			$current_contig_size++;
			$current_contig_read_bases += $read_bases;
			$current_contig_consensus_quality += $consensus_quality;
			push @coverage, $read_bases;
		}
	}
	$current_pos = $pos;
}
$id = &print_contig($id, $current_chrom, $start_pos, $current_pos, $current_contig_read_bases/$current_contig_size, $current_contig_consensus_quality/$current_contig_size, \$current_contig, \@coverage, \@gaps, \%options);

sub print_contig
{
	my ($id, $chrom, $start, $end, $avg_depth, $avg_quality, $contig, $coverage, $gaps, $options) = @_;
	
	my $_Q = $$options{Q};
	my $_d = $$options{d};
	my $_D = $$options{D};
	my $_l = $$options{l};
	
	&filter_gaps($contig, $gaps, $_l);
	
	if ($end - $start > 10 && $avg_depth >= $_d && $avg_depth <= $_D && $avg_quality >= $_Q) {
		if ($FASTA == 1) {
			my $rounded_avg_depth = sprintf "%.3f", $avg_depth;
			print ">${prefix}_$id|$chrom:$start-$end|coverage:$rounded_avg_depth\n";
			my @sequence_parts = $$contig =~ /(.{1,70})/g;
			for my $seq (@sequence_parts) {
				print "$seq\n";
			}
		} elsif ($FASTQ == 1) {
		
		} else {
			my @coverage = @$coverage;
			if ($options{b}) {
				$start--; # bed start is 0-based
				print "$chrom\t$start\t$end\t${prefix}_$id\t$avg_depth\n";
			} else {
				for my $pos ($start..$end) {
					my $cov = $coverage[$pos-$start];
					print "${prefix}_$id,$chrom,$pos,$cov\n";	
				}
			}
		}
		$id++;
	}
	return $id;
}

sub filter_gaps {
  my ($seq, $gaps, $l) = @_;
  for my $g (@$gaps) {
	my $x = $g > $l ? $g - $l : 0;
	substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l));
  }
}
