#!/usr/bin/perl -w
# ================================================================
# unigene_accumulation.pl
# Script to generate a unigne accumulation curve from
# coverage statistics generated with "secondary_assembly_stats.pl"
#
#
# by Joshua Der
# Updated September 2010
# ================================================================
use strict;
use List::Util 'shuffle';
use Statistics::Descriptive;

if (!$ARGV[0]) {
    print "USAGE: unigene_accumulation.pl <coverage_stats_file> ['number_replicate_randomizations']";
    exit(1);
}

# intialize global variables
my $coverage_file = $ARGV[0];
my $nreps = 0;
if (exists $ARGV[1]) {
	$nreps = $ARGV[1];
}
else {
	$nreps = 1;
	print "WARNING: No parameter for number of replicate randomizations was given. Default = 1 replicate.\n";
}
my $curve_output = substr($coverage_file, 0, - 4) . $nreps . '-reps.unigene_accumulation.txt';
my @unigenes = ();
my %unigene_EST_count = ();
my @unigenes_by_frequency = ();
my %seen = ();
my $num_est = 0;
my $num_unigenes = 0;
my %replicate_data = ();

# open coverage file
open(COV, "$coverage_file") || die("Cannot open $coverage_file file");
print "Reading the coverage file...\n";
# read the coverage file
while ($_ = <COV>){
	# skip the first line
	if ($_ =~ /^#/) {next;}
	# split the lines at tab
	my @unigene_stats = split (/\t/, $_);
	# pull out the unigene id
	my $unigene_id = shift @unigene_stats;
	# store the ids in an array
	push @unigenes, $unigene_id;
	# store the EST count in a hash
	$unigene_EST_count{$unigene_id} = $unigene_stats[1];
}
close COV;

# make a HUGE array of unigene ids with one element for each EST in the unigene
# NOTE: this method is not memory efficient.
print "Building the proportional unigene hash\n";
foreach my $unigene_id ( @unigenes ) {
	for (my $i = 1; $i <= $unigene_EST_count{$unigene_id}; $i++) {
		push @unigenes_by_frequency, $unigene_id;
	}
}

open (OUT, ">$curve_output") || die ("Cannot open $curve_output output file");
if ($nreps == 1) {
	print OUT "number_EST\tnumber_unigenes\n";
	# shuffle the huge unigene array
	@unigenes_by_frequency = shuffle(@unigenes_by_frequency);
	# run through the big list to generate the rarefaction output
	foreach my $unigene_id (@unigenes_by_frequency) {
		if (exists $seen{$unigene_id}) {
			$num_est++;
		}
		else {
			$seen{$unigene_id} = 1;
			$num_unigenes++;
			$num_est++;
		}
		print OUT "$num_est\t$num_unigenes\n"
	}
	print "Output file is: \"$curve_output\"\nFormat: number_EST\tnumber_unigenes\n";
}
else {
	print OUT "number_EST\tmean_number_unigenes\ttop_95\tbottom_95\n";
	for (my $i = 1; $i <= $nreps; $i++) {
		print "Replicate $i\n";
		# shuffle the huge unigene array
		@unigenes_by_frequency = shuffle(@unigenes_by_frequency);
		# run through the big list to generate the rarefaction output
		$num_est = 0;
		$num_unigenes = 0;
		%seen = ();
		foreach my $unigene_id (@unigenes_by_frequency) {
			if (exists $seen{$unigene_id}) {
				$num_est++;
			}
			else {
				$seen{$unigene_id} = 1;
				$num_unigenes++;
				$num_est++;
			}
			push @{$replicate_data{$num_est}}, $num_unigenes;
		}
	}
	print "Summarizing replicates...\n";
	foreach $num_est (sort {$a <=> $b} keys %replicate_data){
		my $stat_unigenes_array = Statistics::Descriptive::Full->new();
		$stat_unigenes_array -> add_data(@{$replicate_data{$num_est}});
		my $mean_num_unigenes = $stat_unigenes_array->mean();
		my $top_95 = $stat_unigenes_array->percentile(97.5);
		my $bottom_95 = $stat_unigenes_array->percentile(2.5);
		print OUT join("\t", $num_est, $mean_num_unigenes, $top_95, $bottom_95), "\n";
	}
	print "Output file is: \"$curve_output\"\nFormat: number_EST mean_number_unigenes top_95 bottom_95\n";
}
close OUT;
