#!/usr/bin/perl
#xmlvocab.pl
#
#This Perl script was created by Jules J. Berman
#
#The software is provided "as is", without warranty of any kind,
#express or implied, including but not limited to the warranties
#of merchantability, fitness for a particular purpose and
#noninfringement. in no event shall the authors or copyright
#holders be liable for any claim, damages or other liability,
#whether in an action of contract, tort or otherwise, arising
#from, out of or in connection with the software or the use or
#other dealings in the software.
#
#As a courtesy, users of this script should cite the following
#publication:
#
#Berman JJ. Tumor taxonomy for the developmental lineage classification 
#of neoplasms.  Submitted, BMC Cancer, July 4, 2004. 
#
#neocl.xml is the classification of all neoplastic lesions.
#This script has three purposes:
#1. It checks that neocl.xml is well-formed xml
#2. It checks that an NCI code in one class is not repeated in any 
#other class within neocl.xml
#3. It checks that a term in one class is not repeated in any 
#other class within neocl.xml
#
use XML::Parser;
my $parser = XML::Parser->new( Handlers => {
  Init => \&handle_doc_start,
  Final => \&handle_doc_end,
  });
$file = "neocl.xml";
$parser -> parsefile($file);

sub handle_doc_start
{
print "\nBegining to parse $file now\n\n";
}

sub handle_doc_end
{
print "\nFinished. $file is a well-formed XML File.\n"
}

#<name nci-code = "C6208000">childhood cns embryonal carcinoma</name>
open (TEXT, $file);
open (OUT,">neocl.out");
my $countcode = 0;
my $line = " ";
while ($line ne "")
  {
  $line = <TEXT>;
  last if ($line =~ /\-\-\>/);    #this loop moves file forward to the
  }                               #end of the comment section
while ($line ne "")
  {
  $line = <TEXT>;
  if ($line =~ /\<([a-z\_]+)\>/)  #the only lines matching this regex
     {                            #will be the class metatags
     $classname = $1;
     }
  if ($line =~ /C0000000/)
     {
     next;
     }
  if ($line =~ /C([0-9]{7})/)
     {
     if (exists $code{$1})
        {
        if ($code{$1} ne $classname)
           {
           print "$1 is a problem\n";
           next;
           }
        }
     else
        {
        $code{$1} = $classname;
        $countcode++;
        }
     }
  }
close TEXT;
print "The total number of concepts is $countcode\n";
open (TEXT, $file);
my $line = " ";
while ($line ne "")
  {
  $line = <TEXT>;
  next if ($line =~ /C0000000/);
  if ($line =~ /\"(C[0-9]{7})\"/)
     {
     $line =~ /\"\> ?(.+) ?\<\//;
     $phrase = $1;
     $item{$phrase}++;
     if ($item{$phrase} >1)
       {
       print "$phrase\n";
       }
     }
  }
close TEXT;
open (TEXT, $file);
my $line = " ";
while ($line ne "")
  {
  $line = <TEXT>;
  print OUT $line;
  last if ($line =~ /\-\-\>/);
  }
while ($line ne "")
  {
  $line = <TEXT>;
  if ($line =~ /\<([a-z\_]+)\>/)
     {
     print OUT $line;
     $classname = $1;
     }
  if ($line !~ /name/)
     {
     if ($line =~ /\<\/([a-z\_]+)\>/)
        {
        print OUT $line;
        }  
     }
  if ($line =~ /C0000000/)
     {   
     print OUT $line;
     next;
     }
  if ($line =~ /C[0-9]{7}\"\>(.+)\</)
     {
     if (exists $code{$1})
        {
        if ($code{$1} ne $classname)
           {
           print $line;
           }
        }
     else
        {
        print OUT $line;
        #print "-";     #use this if you like re-assurance that the
                        #script is doing something
        $code{$1} = $classname;
        }
     }
  }
exit;
