#! /usr/bin/perl
# Victor Amin 2010

use strict;
use warnings;

use Getopt::Std;
$Getopt::Std::STANDARD_HELP_VERSION = 1;

my %options;
getopts('b:w:h', \%options);

# General algorithm:
# for each chr
# for each position
# scan window of width 100
# if there are i starts in the window (one for each lane)
# find earliest start, latest corresponding end out of all lanes
# output chr first_start last_end

if ($options{h} || !$options{b}) {Getopt::Std->version_mess(); HELP_MESSAGE(\*STDERR)}
sub HELP_MESSAGE {
	my $fh = shift;
	print $fh "\nThis script finds common sequences from contig BEDs (these can be generated from a SAMtools consensus pileup with pileup-convert.pl).\n";
	print $fh "\tOPTIONS:\n";
	print $fh "\t-b [comma-separated contig BED files] [required]\n";
	print $fh "\t-w [window size] [default: 100]\n";
	print $fh "\n";
	exit;
}

my $window = $options{w} ? $options{w} : 100;
my @range_files = split(",", $options{r});

print STDERR "\nLoading BEDs...\n";
my %compare;
my %chrom_max;
my $i;
for my $range_file (@range_files) {
	$i++;
	open BED, "<$range_file" or die "Could not open file: $!\n";
	while (<BED>) {
		chomp;
		my ($chrom, $start, $end, $id) = split;
		$start++; # BED start is 0-based
		$compare{$chrom}{$i}{$start} = $end;
		if (!exists $chrom_max{$chrom} || $chrom_max{$chrom} < $end) {
			$chrom_max{$chrom} = $end;
		}
	}
	close BED;
}

print STDERR "\nCalculating common ranges...\n";

my $id;
for my $chrom (keys %compare) {
	my $count = 0;
	my %buffer = ();
	my %decrement = ();
	my $latest_start = 0;
	print STDERR "$chrom\n";
	print STDERR "$chrom_max{$chrom}\n";
	for my $pos (1..$chrom_max{$chrom}) {
		my $buffstart = $pos-$window;
		my $buffend = $pos+$window;
		for my $j (1..$i) {
			if (exists $compare{$chrom}{$j}{$pos}) {
				$count++;
				if (!exists $buffer{$pos} || $buffer{$pos} > $compare{$chrom}{$j}{$pos}) {
					$buffer{$pos} = $compare{$chrom}{$j}{$pos};
				}
				$decrement{$buffend}++;
			}
		}
		if (exists $decrement{$pos}) {
			$count = $count - $decrement{$pos};
		}
		delete $buffer{$buffstart};
		if ($count >= $i) {
			my $last_start = 0;
			my $first_end = $chrom_max{$chrom};
			for my $start (keys %buffer) {
				if ($start > $last_start) {$last_start = $start}
				if ($buffer{$start} < $first_end) {$first_end = $buffer{$start}}
			}
			if ($latest_start < $last_start) {
				$latest_start = $last_start;
				if ($first_end <= $last_start) {}#print STDERR "Bad range.\n"}
				else {
					$id++;
					$last_start--; # BED start is 0-based
					print "$chrom\t$last_start\t$first_end\tcommon_contig_${id}\n";
				}
			}
		}
	}
}

print STDERR "\n$id common ranges found.\n";

