#!/usr/bin/perl -w

#sup2.pl USAGE

#Input (fasta file):
#
#  >1300_8769_5430 length=258 urnand=JHSK987KJSH2KJHJK8777
#   AGTCCCCCGGGGTTTAAAGGGGCCCCTTTTAAAAAAGTCGTCAATGCGGTTAAC
#   AGTCTGCAAAAAAATTTCCCCCCCCCCGGGGGGGGGGGTAGCCGTATGCATACG
#   AGTCGGGGGCCCCCCCTTTTGGGGGTAAACGTTGCATATGCATGCTAGCTAGCT       
#The first few base pairs are part of a primer/bar code/MID as specified in the input "MIDs csv" file given by the user
#Input (MIDs csv file):
#    Sample1,ATAGTGA
#    Sample2,ATGCATG

#Output: A fasta file of the remaining sequence after removing the primer/bar code/MID with corresponding header attached as specified in the input "MIDs csv" file

use strict;

# ---------------------------------------------------------------------------
# File-scope scratch variables shared by the rest of the script.
# ---------------------------------------------------------------------------

# Separators used when the output header and record are assembled.
my ( $newline, $underscore ) = ( "\n", "_" );

# Loop variables: $i walks the csv rows while matching, $j while counting them.
my ( $i, $j );

# csv data: @arr1 holds the raw rows, @arr2 the fields of the current row,
# $numberMIDs the number of rows.
my ( @arr1, @arr2 );
my $numberMIDs = "";

# Current FASTA record: header line and sequence line.
my ( $header, $seq1 );

# Primer matching state for the current csv row.
my $primerLength   = 0;
my $primercutstart = "";
my ( $readsubstrlength, $seq2, $subseq2, $primerSeq );

# Number of consecutive csv rows that did not match.
my $wrongmidcount;

# Pieces of the record written to the output file.
my ( $finalSeq, $finalHeader, $totalInfo );

# Declared but not referenced by the visible script.
my @chars;
my $temp;
my $count     = 0;
my $addheader = "";

# All four positional arguments are required; when the fourth one is missing,
# print the usage line and stop.
if ( scalar(@ARGV) < 4 )
{
	my $usage = "#USAGE: sup2.pl {Reads FASTA format file} {Primers/MIDs/Barcodes with corresponding headers in csv format} {# bases from start of primer to the beginning of the barcode} {New FASTA filename to be written into} \n";
	print $usage;
	exit;
}

# Remind the user of the expected layout of the MIDs csv file.
print(
	"#NOTE:Please make sure that the input MIDs csv file has the 1st field as header and 2nd field as corresponding primer/MID/barcode with no space between two fields \n"
);

# Store the MIDs csv file ($ARGV[1]) into @arr1, one raw line per element.
# The lines are kept exactly as read (line endings included) because the
# matching code further down works on the unmodified rows.
{
    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and a missing/unreadable file is a fatal
    # error instead of silently producing an empty MID list.
    open( my $csv_fh, '<', $ARGV[1] )
        or die "Cannot open MIDs csv file '$ARGV[1]': $!\n";
    @arr1 = <$csv_fh>;
    close($csv_fh);
}

# Number of csv rows read.
$numberMIDs = scalar @arr1;
#print $numberMIDs;

# Demultiplex the reads FASTA file ($ARGV[0]).
#
# Every read is compared against each primer/MID/barcode from the csv rows in
# @arr1.  The comparison uses at most 10 bases of the primer, starting at the
# offset given in $ARGV[2], against the same number of bases at the start of
# the read.  For each matching row the read is appended to $ARGV[3] as
#     <header>_<sample name>
#     <read with the first <primer length> bases removed>
# A read that matches no row is appended unchanged to "$ARGV[3].nomid.fna".
#
# Differences from the previous implementation (all bug fixes):
#   * csv rows are stripped of their line ending before use, so short primers
#     can match and the trim length no longer depends on whether the row ended
#     in "\n", "\r\n" or nothing;
#   * the "no MID matched" decision is made per read, so misses left over from
#     the previous read can no longer send a matching read to the nomid file;
#   * sequence lines of a wrapped (multi-line) FASTA record are joined into a
#     single read instead of being treated as separate reads;
#   * reads shorter than the primer yield an empty trimmed sequence instead of
#     a substr() past the end of the string;
#   * files are opened once, with three-argument open, and failures are fatal.
#
# A read matching several csv rows is still written once per matching row.
# NOTE(review): the number of bases removed is the full primer length,
# independent of the offset in $ARGV[2], as before -- confirm this is intended.
{
    my ( $reads_path, $cut_start, $out_path ) = @ARGV[ 0, 2, 3 ];
    my $nomid_path = "$out_path.nomid.fna";
    my $probe_len  = 10;    # maximum number of bases compared per primer

    die "Primer offset (3rd argument) must be a non-negative integer\n"
        unless defined $cut_start && $cut_start =~ /\A\d+\z/;

    # Parse the csv rows once instead of once per read.
    my @mids;
    for my $csv_line (@arr1) {
        ( my $entry = $csv_line ) =~ s/[\r\n]+\z//;    # work on a copy
        next if $entry !~ /\S/;                         # blank row
        my ( $sample, $primer ) = split /,/, $entry;

        # A row without a primer, or with no bases left after the offset,
        # cannot be compared meaningfully (an empty probe matches every read).
        if ( !defined $primer || length($primer) <= $cut_start ) {
            warn "Skipping unusable MID entry '$entry'\n";
            next;
        }
        push @mids,
            {
            sample => $sample,
            length => length($primer),
            probe  => substr( $primer, $cut_start, $probe_len ),
            };
    }

    # Output files are opened lazily in append mode, so a file is only
    # created when something is actually written to it.
    my %fh_for;
    my $emit = sub {
        my ( $path, $text ) = @_;
        if ( !$fh_for{$path} ) {
            open( $fh_for{$path}, '>>', $path )
                or die "Cannot append to '$path': $!\n";
        }
        print { $fh_for{$path} } $text
            or die "Cannot write to '$path': $!\n";
    };

    # Compare one complete read against every MID and write the result.
    my $process = sub {
        my ( $head, $seq ) = @_;
        my $matched = 0;
        for my $mid (@mids) {
            my $probe = $mid->{probe};
            next if substr( $seq, 0, length $probe ) ne $probe;

            # Remove the primer/barcode/MID portion of the read.
            my $rest =
                length($seq) > $mid->{length}
                ? substr( $seq, $mid->{length} )
                : '';
            $emit->( $out_path,
                $head . '_' . $mid->{sample} . "\n" . $rest . "\n" );
            $matched = 1;
        }
        $emit->( $nomid_path, "$head\n$seq\n" ) unless $matched;
    };

    open( my $reads_fh, '<', $reads_path )
        or die "Cannot open reads FASTA file '$reads_path': $!\n";

    my ( $head, $seq ) = ( undef, '' );
    while ( my $line = <$reads_fh> ) {
        $line =~ s/[\r\n]+\z//;
        if ( $line =~ /^>/ ) {

            # A new header closes the previous record.
            $process->( $head, $seq ) if defined $head && length $seq;
            ( $head, $seq ) = ( $line, '' );
        }
        elsif ( $line =~ /\S/ ) {
            if ( defined $head ) {
                $seq .= $line;    # join wrapped sequence lines
            }
            else {
                warn "Ignoring sequence data before the first FASTA header\n";
            }
        }
    }

    # Flush the last record of the file.
    $process->( $head, $seq ) if defined $head && length $seq;
    close($reads_fh);

    # Buffered write errors only surface at close time.
    for my $path ( sort keys %fh_for ) {
        close( $fh_for{$path} ) or die "Cannot close '$path': $!\n";
    }
}

