<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
   <ui>gb-2006-7-s1-s2</ui>
   <ji>GBJ</ji>
   <fm>
      <dochead>Review</dochead>
      <bibl>
         <title>
            <p>EGASP: the human ENCODE Genome Annotation Assessment Project</p>
         </title>
         <aug>
            <au id="A1" ce="yes" ca="yes">
               <snm>Guig&#243;</snm>
               <fnm>Roderic</fnm>
               <insr iid="I1"/>
               <insr iid="I11"/>
               <email>rguigo@imim.es</email>
            </au>
            <au id="A2" ce="yes">
               <snm>Flicek</snm>
               <fnm>Paul</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A3" ce="yes">
               <snm>Abril</snm>
               <mi>F</mi>
               <fnm>Josep</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A4">
               <snm>Reymond</snm>
               <fnm>Alexandre</fnm>
               <insr iid="I3"/>
            </au>
            <au id="A5">
               <snm>Lagarde</snm>
               <fnm>Julien</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A6">
               <snm>Denoeud</snm>
               <fnm>France</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A7">
               <snm>Antonarakis</snm>
               <fnm>Stylianos</fnm>
               <insr iid="I4"/>
            </au>
            <au id="A8">
               <snm>Ashburner</snm>
               <fnm>Michael</fnm>
               <insr iid="I5"/>
               <insr iid="I12"/>
            </au>
            <au id="A9">
               <snm>Bajic</snm>
               <mi>B</mi>
               <fnm>Vladimir</fnm>
               <insr iid="I6"/>
               <insr iid="I12"/>
            </au>
            <au id="A10">
               <snm>Birney</snm>
               <fnm>Ewan</fnm>
               <insr iid="I2"/>
               <insr iid="I11"/>
            </au>
            <au id="A11">
               <snm>Castelo</snm>
               <fnm>Robert</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A12">
               <snm>Eyras</snm>
               <fnm>Eduardo</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A13">
               <snm>Ucla</snm>
               <fnm>Catherine</fnm>
               <insr iid="I4"/>
            </au>
            <au id="A14">
               <snm>Gingeras</snm>
               <mi>R</mi>
               <fnm>Thomas</fnm>
               <insr iid="I7"/>
               <insr iid="I12"/>
            </au>
            <au id="A15">
               <snm>Harrow</snm>
               <fnm>Jennifer</fnm>
               <insr iid="I8"/>
               <insr iid="I11"/>
            </au>
            <au id="A16">
               <snm>Hubbard</snm>
               <fnm>Tim</fnm>
               <insr iid="I8"/>
               <insr iid="I11"/>
            </au>
            <au id="A17">
               <snm>Lewis</snm>
               <mi>E</mi>
               <fnm>Suzanna</fnm>
               <insr iid="I9"/>
               <insr iid="I12"/>
            </au>
            <au id="A18" ce="yes" ca="yes">
               <snm>Reese</snm>
               <mi>G</mi>
               <fnm>Martin</fnm>
               <insr iid="I10"/>
               <insr iid="I12"/>
               <email>mreese@omicia.com</email>
            </au>
         </aug>
         <insg>
            <ins id="I1">
               <p>Centre de Regulaci&#243; Gen&#242;mica, Institut Municipal d'Investigaci&#243; M&#232;dica-Universitat Pompeu Fabra, E08003 Barcelona, Catalonia, Spain</p>
            </ins>
            <ins id="I2">
               <p>European Bioinformatics Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK</p>
            </ins>
            <ins id="I3">
               <p>Center for Integrative Genomics, University of Lausanne, Switzerland</p>
            </ins>
            <ins id="I4">
               <p>University of Geneva Medical School and University Hospitals of Geneva, 1211 Geneva, Switzerland</p>
            </ins>
            <ins id="I5">
               <p>Department of Genetics, University of Cambridge, Cambridge CB3 2EH, UK</p>
            </ins>
            <ins id="I6">
               <p>South African National Bioinformatics Institute (SANBI), University of Western Cape, Bellville 7535, South Africa</p>
            </ins>
            <ins id="I7">
               <p>Affymetrix Inc., Santa Clara, California 95051, USA</p>
            </ins>
            <ins id="I8">
               <p>Wellcome Trust Sanger Institute, Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SA, UK</p>
            </ins>
            <ins id="I9">
               <p>Department of Molecular and Cellular Biology, University of California, Berkeley, California 94792, USA</p>
            </ins>
            <ins id="I10">
               <p>Omicia Inc., Christie Ave., Emeryville, California 94608, USA</p>
            </ins>
            <ins id="I11">
               <p>Member of the EGASP Organizing Committee</p>
            </ins>
            <ins id="I12">
               <p>Member of the EGASP Advisory Board</p>
            </ins>
         </insg>
         <source>Genome Biology</source>
         <supplement>
            <title>
               <p>EGASP '05: ENCODE Genome Annotation Assessment Project</p>
            </title>
            <editor>Roderic Guig&#243;, Martin G Reese</editor>
            <note>Research</note>
         </supplement>
         <issn>1465-6906</issn>
         <pubdate>2006</pubdate>
         <volume>7</volume>
         <issue>Suppl 1</issue>
         <fpage>S2</fpage>
         <url>http://genomebiology.com/2006/7/S1/S2</url>
         <xrefbib>
            <pubidlist>
               <pubid idtype="pmpid">16925836</pubid>
               <pubid idtype="doi">10.1186/gb-2006-7-s1-s2</pubid>
            </pubidlist>
         </xrefbib>
      </bibl>
      <history>
         <pub>
            <date>
               <day>7</day>
               <month>8</month>
               <year>2006</year>
            </date>
         </pub>
      </history>
      <cpyrt>
         <year>2006</year>
         <collab>BioMed Central Ltd.</collab>
      </cpyrt>
      <abs>
         <sec>
            <st>
               <p>Abstract</p>
            </st>
            <sec>
               <st>
                  <p>Background</p>
               </st>
               <p>We present the results of EGASP, a community experiment to assess the state-of-the-art in genome annotation within the ENCODE regions, which span 1% of the human genome sequence. The experiment had two major goals: the assessment of the accuracy of computational methods to predict protein coding genes; and the overall assessment of the completeness of the current human genome annotations as represented in the ENCODE regions. For the computational prediction assessment, eighteen groups contributed gene predictions. We evaluated these submissions against each other based on a 'reference set' of annotations generated as part of the GENCODE project. These annotations were not available to the prediction groups prior to the submission deadline, so that their predictions were blind and an external advisory committee could perform a fair assessment.</p>
            </sec>
            <sec>
               <st>
                  <p>Results</p>
               </st>
               <p>The best methods had at least one gene transcript correctly predicted for close to 70% of the annotated genes. Nevertheless, the multiple transcript accuracy, taking into account alternative splicing, reached only approximately 40% to 50% accuracy. At the coding nucleotide level, the best programs reached an accuracy of 90% in both sensitivity and specificity. Programs relying on mRNA and protein sequences were the most accurate in reproducing the manually curated annotations. Experimental validation shows that only a very small percentage (3.2%) of the selected 221 computationally predicted exons outside of the existing annotation could be verified.</p>
            </sec>
            <sec>
               <st>
                  <p>Conclusion</p>
               </st>
               <p>This is the first such experiment in human DNA, and we have followed the standards established in a similar experiment, GASP1, in <it>Drosophila melanogaster</it>. We believe the results presented here contribute to the value of ongoing large-scale annotation projects and should guide further experimental methods when being scaled up to the entire human genome sequence.</p>
            </sec>
         </sec>
      </abs>
   </fm>
   <bdy>
      <sec>
         <st>
            <p>Background</p>
         </st>
         <p>During the first decade of the 21st century the sequencing of whole genomes has become a routine biological practice. The list of chordates with assembled genome sequences now numbers nearly two dozen, while the total number of sequenced bacteria, archaea, and eukaryota is approaching 2,000. The genome sequence is said to be an organism's blueprint: the set of instructions dictating its biological traits. In higher eukaryotic organisms, however, these traits are apparently encoded by only a small fraction of the genome sequence that is functional (possibly less than 5% in the case of the human genome). The genes are a major component of this functional sequence. While there is growing evidence for many functional non-protein coding RNA genes, such as miRNAs and snoRNAs, the largest and best studied subset of the human genes comprises the protein coding genes, genes specifying the amino acid sequence of the proteins. Thus, locating the genes in a newly sequenced genome is a first, essential step toward understanding how the organism translates its genome sequence into biological function. This paper focuses on the identification of protein coding genes, if not otherwise noted.</p>
         <p>Maybe to the surprise of many, five years after the first drafts of the human genome sequence became available <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr></abbrgrp>, and nearly three years after the announcement of the completion of the sequencing <abbrgrp><abbr bid="B3">3</abbr></abbrgrp>, a complete set of protein coding genes encoded in the human genome does not exist. One reason for the lack of a complete gene set is that an appropriately rigorous standard has been set for the human genome: every gene, exactly correct. And as shown in this paper, only very few of the human genes seem to be missing from the computational predictions, but the exact genomic structure of these genes is estimated to be correct for only 50% of the predicted genes. In other words, only very few protein coding genes appear to have been totally missed today. Nevertheless, getting the entire genomic structure of a protein coding gene right is still a very difficult task, compounded by the large amount of alternative splicing characterizing human genes. Our assessment here tries to quantify the status of these differences in the current human genome annotations and computational prediction programs.</p>
         <sec>
            <st>
               <p>Automatic genome annotation methods</p>
            </st>
            <p>To date, accurate automatic annotation of the human genome (and of other genomes with significant cDNA libraries) strongly relies on an elaborate mapping of these known gene sequences onto the genome sequence. This method of genome annotation requires a high-quality and nearly complete set of cDNA sequences. Datasets that aim to achieve this goal, but are still works in progress, are the RefSeq database <abbrgrp><abbr bid="B4">4</abbr></abbrgrp> and those currently being produced by the Mammalian Gene Collection (MGC) <abbrgrp><abbr bid="B5">5</abbr></abbrgrp>. As the MGC project - and similar efforts to deepen the coverage of the fraction of the human genome being transcribed - continues, cDNA mapping based gene identification methods are becoming increasingly accurate. While few organisms will have the rich cDNA libraries that are currently being developed for the human genome, the availability of protein sequence data from evolutionarily close relatives has been effectively used in addition to cDNA data for automatic gene prediction across many of the currently sequenced mammals. The most commonly used annotation pipelines are the ENSEMBL pipeline <abbrgrp><abbr bid="B6">6</abbr></abbrgrp>, the UCSC genome browser's <abbrgrp><abbr bid="B7">7</abbr></abbrgrp> Known Genes (KG II) pipeline, and the Gnomon pipeline at the NCBI <abbrgrp><abbr bid="B8">8</abbr></abbrgrp>. It remains unclear, however, what fraction of the low and specifically expressed transcripts and of alternatively spliced isoforms can be effectively recovered from cDNA libraries. Additionally, orthologous proteins from other species may not align genes that are rapidly evolving. For these reasons, current cDNA and protein-based methods are likely to provide an incomplete picture of the protein coding gene content of the human genome. These methods will be less accurate for genomes with fewer expressed sequences and comparative options.</p>
            <p>For automatic annotation of genomes without deep expressed sequence libraries, any available cDNA or expressed sequence tag (EST) based annotation is often complemented by dual (or multiple) genome comparative predictions. These predictions are obtained by means of the analysis of the patterns of sequence conservation between genome sequences of evolutionarily related organisms. As examples, programs such as SGP2 <abbrgrp><abbr bid="B9">9</abbr></abbrgrp>, SLAM <abbrgrp><abbr bid="B10">10</abbr><abbr bid="B11">11</abbr></abbrgrp> and TWINSCAN <abbrgrp><abbr bid="B12">12</abbr><abbr bid="B13">13</abbr></abbrgrp> have contributed efficiently to the annotation of a number of vertebrate genomes, including mouse <abbrgrp><abbr bid="B14">14</abbr></abbrgrp>, rat <abbrgrp><abbr bid="B15">15</abbr></abbrgrp>, and chicken <abbrgrp><abbr bid="B16">16</abbr></abbrgrp>. This type of comparative-based automatic gene prediction can produce highly accurate gene sets when the sequence of related species is available, but few ESTs have been sequenced, as is the case with the fungus <it>Cryptococcus neoformans </it><abbrgrp><abbr bid="B17">17</abbr></abbrgrp>.</p>
            <p>Occasionally, the so-called single genome <it>ab initio </it>predictors - programs that use statistical sequence patterns, such as the coding reading frame, codon usage or splice site consensus sequences, for gene identification - are also used to complement cDNA and comparative based methods. When no genome exists at the appropriate phylogenetic distance, and the cDNA or EST coverage of the transcriptome is shallow, single genome <it>ab initio </it>predictions play an important role in genome annotation, such as those obtained, for example, by the programs GENSCAN <abbrgrp><abbr bid="B18">18</abbr></abbrgrp> and GENEID <abbrgrp><abbr bid="B19">19</abbr></abbrgrp> in the initial annotation of the genome of the fish <it>Tetraodon nigroviridis </it><abbrgrp><abbr bid="B20">20</abbr></abbrgrp>.</p>
            <p>In summary, despite substantial progress in the past decade and the existence of highly accurate gene sets in a number of organisms, current gene identification methods are, as yet, not able to produce a complete catalogue of the set of protein coding genes in higher eukaryotic genomes (see <abbrgrp><abbr bid="B21">21</abbr></abbrgrp> for a recent review).</p>
         </sec>
         <sec>
            <st>
               <p>Assessing the accuracy of automatic genome annotation</p>
            </st>
            <p>Over the past quarter century, a large number of automated gene prediction algorithms have been introduced, which can be loosely grouped based on the general strategies described above. These methods vary widely in the details of their implementation and in the number and location of predicted protein coding genes. Thus, the issue of evaluating the accuracy of the predictive methods has been recurrent within the field of computational gene prediction. The early work of Burset and Guig&#243; <abbrgrp><abbr bid="B22">22</abbr></abbrgrp>, and the subsequent analysis of Bajic <abbrgrp><abbr bid="B23">23</abbr></abbrgrp>, Baldi <it>et al</it>. <abbrgrp><abbr bid="B24">24</abbr></abbrgrp>, Guig&#243; <it>et al</it>. <abbrgrp><abbr bid="B25">25</abbr></abbrgrp>, Rogic <it>et al</it>. <abbrgrp><abbr bid="B26">26</abbr></abbrgrp> and others, provide a framework - a set of metrics and a protocol - to consistently evaluate gene prediction methods. Essentially, a set of well-annotated sequences is used as a test set. The gene prediction programs are run on these sequences, and the predictions obtained are compared with the annotations. A number of measures are computed to evaluate how well the predictions reproduce the annotation. Typically, predictions are evaluated at nucleotide, exon and gene levels. At all three levels, two basic measures are computed: sensitivity, the proportion of annotated features (nucleotide, exon, gene) that have been predicted; and specificity, the proportion of predicted features that is annotated. One problem with this approach is that, until recently, very few large genomic sequences were well annotated and only the coordinates of the coding exons within a gene could be considered. Moreover, because methods did not exist to predict alternative splicing, the test sets used to evaluate computational gene predictions consisted of a few hundred short sequences encoding single genes from which alternatively spliced isoforms had been removed. This led to an oversimplification of the problem and, in turn, to an overestimation of the real accuracy of the programs <abbrgrp><abbr bid="B25">25</abbr></abbrgrp>. Furthermore, many programs were developed in-house and were, therefore, not accessible for independent evaluation.</p>
            <p>To address the problem of independent, objective assessment of the state-of-the-art in automated tools and techniques for annotating large contiguous genomic DNA regions and eventually complete genomes, a first Genome Annotation Assessment Project (GASP1) was organized in 1999 <abbrgrp><abbr bid="B27">27</abbr></abbrgrp>. In many ways, GASP1 was set up similarly to CASP (Critical Assessment of Techniques for Protein Structure Prediction) <abbrgrp><abbr bid="B28">28</abbr></abbrgrp>. In short, at GASP1, a genomic region in <it>Drosophila melanogaster</it>, including auxiliary training data, was provided to the community and gene finding experts were invited to send the annotation files they had generated to the organizers before a fixed deadline. Then, a set of standards was developed to evaluate submissions against the later published annotations <abbrgrp><abbr bid="B29">29</abbr></abbrgrp>, which had been withheld until after the submission stage. Next, the evaluation results were assessed by an independent advisory team and publicly presented at a workshop at the Intelligent Systems in Molecular Biology (ISMB) 1999 meeting. This community experiment was then published as a collection of methods and evaluation papers in <it>Genome Research </it><abbrgrp><abbr bid="B27">27</abbr></abbrgrp>.</p>
         </sec>
         <sec>
            <st>
               <p>The ENCODE Genome Annotation Assessment Project</p>
            </st>
            <p>Inspired by GASP1, and within the context of the ENCyclopedia Of DNA Elements (ENCODE) project, we organized the ENCODE GASP (EGASP) community experiment, which followed closely the model of its predecessor, GASP1 <abbrgrp><abbr bid="B27">27</abbr></abbrgrp>. The ENCODE project was launched two years ago by the National Human Genome Research Institute (NHGRI) with the aim of identifying all functional elements in the genome sequence through the collaborative effort of computational and laboratory-based scientists <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>. The pilot phase of the project is focused on a selected 30 Mb of sequence within 44 selected regions (Table <tblr tid="T1">1</tblr>) across the human genome, which represents approximately 1% of the genome sequence.</p>
            <tbl id="T1">
               <title>
                  <p>Table 1</p>
               </title>
               <caption>
                  <p>The 44 selected sequences within the ENCODE region</p>
               </caption>
               <tblbdy cols="6">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c cspan="3" ca="center">
                        <p>Random picks Mouse homology</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c cspan="3">
                        <hr/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Sequence Set</p>
                     </c>
                     <c ca="center">
                        <p>Manual picks</p>
                     </c>
                     <c ca="center">
                        <p>Low</p>
                     </c>
                     <c ca="center">
                        <p>Medium</p>
                     </c>
                     <c ca="center">
                        <p>High</p>
                     </c>
                     <c ca="center">
                        <p>Gene density</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="6">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Training</p>
                     </c>
                     <c ca="center">
                        <p>ENm006</p>
                     </c>
                     <c ca="center">
                        <p>ENr132</p>
                     </c>
                     <c ca="center">
                        <p>ENr231</p>
                     </c>
                     <c ca="center">
                        <p>ENr333</p>
                     </c>
                     <c ca="center">
                        <p>High</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr232</p>
                     </c>
                     <c ca="center">
                        <p>ENr334</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm004</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>ENr222</p>
                     </c>
                     <c ca="center">
                        <p>ENr323</p>
                     </c>
                     <c ca="center">
                        <p>Medium</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr223</p>
                     </c>
                     <c ca="center">
                        <p>ENr324</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>ENr111</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>Low</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr114</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Test</p>
                     </c>
                     <c ca="center">
                        <p>ENm002</p>
                     </c>
                     <c ca="center">
                        <p>ENr131</p>
                     </c>
                     <c ca="center">
                        <p>ENr233</p>
                     </c>
                     <c ca="center">
                        <p>ENr331</p>
                     </c>
                     <c ca="center">
                        <p>High</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm005</p>
                     </c>
                     <c ca="center">
                        <p>ENr133</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr332</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm007</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm008</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm009</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm010</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm011</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm001</p>
                     </c>
                     <c ca="center">
                        <p>ENr121</p>
                     </c>
                     <c ca="center">
                        <p>ENr221</p>
                     </c>
                     <c ca="center">
                        <p>ENr321</p>
                     </c>
                     <c ca="center">
                        <p>Medium</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm003</p>
                     </c>
                     <c ca="center">
                        <p>ENr122</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr322</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm012</p>
                     </c>
                     <c ca="center">
                        <p>ENr123</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm013</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENm014</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>-</p>
                     </c>
                     <c ca="center">
                        <p>ENr112</p>
                     </c>
                     <c ca="center">
                        <p>ENr211</p>
                     </c>
                     <c ca="center">
                        <p>ENr311</p>
                     </c>
                     <c ca="center">
                        <p>Low</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr113</p>
                     </c>
                     <c ca="center">
                        <p>ENr212</p>
                     </c>
                     <c ca="center">
                        <p>ENr312</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>ENr213</p>
                     </c>
                     <c ca="center">
                        <p>ENr313</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>ENCODE sequences were assigned to either the training or the test set based on annotation data availability (see the section 'The EGASP experiment'). For the performance evaluation, only the test set sequences were used. The numeric code for the randomly picked sequence names corresponds to the non-exonic conservation with the mouse genome, the density of previously identified genes, and the sequence number, respectively; numbers vary from 1 (low) to 3 (high). Manually selected sequences range in size from 500 kbp to 2 Mbp, while random regions are 500 kbp. The selection and stratification criteria for all the sequences are described at the ENCODE project web site [34].</p>
               </tblfn>
            </tbl>
            <p>Within the ENCODE project, the GENCODE consortium <abbrgrp><abbr bid="B31">31</abbr></abbrgrp> was set up. This group, in collaboration with the HAVANA team <abbrgrp><abbr bid="B32">32</abbr></abbrgrp> at the Sanger Institute, has produced a high quality annotation of the gene content of the ENCODE regions through a combined manual, computational and experimental strategy <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>. The EGASP experiment was organized with the main goal of evaluating how well automatic methods are able to reproduce this annotation produced by GENCODE. A second goal of EGASP was to assess the completeness of the GENCODE annotation and, in this regard, EGASP was designed such that, in a follow-up step, a number of computational gene predictions not included in GENCODE were tested experimentally.</p>
            <p>In what follows, we first describe the organization and structure of the EGASP experiment. We then present the results of the evaluation of the submitted predictions against the GENCODE annotation, and finally we present the results of the experimental verification of the novel predictions.</p>
         </sec>
         <sec>
            <st>
               <p>The EGASP experiment</p>
            </st>
            <sec>
               <st>
                  <p>Data: the benchmark sequence of 44 selected ENCODE regions</p>
               </st>
               <sec>
                  <st>
                     <p>Description of the sequence</p>
                  </st>
                  <p>The 44 ENCODE regions represent 30 Mb (approximately 1%) of the human genome <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>. Approximately half of the sequence corresponds to a set of 14 manually selected regions including well-studied genes and for which a significant amount of prior comparative sequence data was available. The remaining 30 genomic regions were chosen by stratified random sampling based on two measures: gene density (from previous annotations) and non-exonic conservation with the mouse genome sequence. Briefly, each portion of the human genome sequence was classified as high, medium, or low if it fell in the top 20%, the middle 30%, or the bottom 50%, respectively, of the above two measures. Several 500 kb sequences were chosen from each of the nine classifications created by this stratification procedure.</p>
                  <p>Table <tblr tid="T1">1</tblr> lists the 44 selected sequences within the ENCODE region and classifies them based on random/manual selection, previously known gene density and non-exonic conservation to the mouse genome. It also describes the size differences between the sequences. Information about the criteria used to select the regions and their characteristics can be found on the ENCODE website <abbrgrp><abbr bid="B34">34</abbr></abbrgrp>. The sequences of the ENCODE regions (as well as multiple functional annotations) can be downloaded from the UCSC ENCODE browser <abbrgrp><abbr bid="B35">35</abbr></abbrgrp>.</p>
                  <p>We defined the sequences used for the EGASP experimental evaluation by taking advantage of the prior work of the HAVANA team <abbrgrp><abbr bid="B32">32</abbr></abbrgrp>, which had previously comprehensively annotated and released annotation for several human chromosomes <abbrgrp><abbr bid="B36">36</abbr><abbr bid="B37">37</abbr><abbr bid="B38">38</abbr><abbr bid="B39">39</abbr><abbr bid="B40">40</abbr><abbr bid="B41">41</abbr><abbr bid="B42">42</abbr></abbrgrp>. Updated annotation for the 13 ENCODE regions on these chromosomes was released in January 2005 as a 'training' set for the EGASP experiment. The manual annotation of the other 31 ENCODE regions was held back from release until after the automated gene predictions had been received. The 31 EGASP test regions represent a total of 21.6 million base-pairs (bp) of sequence. Further information is available at the GENCODE website <abbrgrp><abbr bid="B31">31</abbr></abbrgrp>.</p>
               </sec>
               <sec>
                  <st>
                     <p>The reference gene set: the GENCODE annotations</p>
                  </st>
                  <p>The ENCODE regions had been subjected to an exhaustive annotation strategy prior to EGASP by the HAVANA team. In short, the annotators initially built coding transcripts manually based on alignments of known mRNA, EST and protein sequences to the human genome. The initial gene map delineated in this way was then experimentally refined through reverse transcription (RT)-PCR and rapid amplification of cDNA ends (RACE), which essentially confirmed the existence of the mRNA sequences of the hypothesized genes. Finally, the initial annotation was refined by the annotators based on these experimental results. While the initial annotation by the HAVANA team is augmented by some experimentally verified <it>ab initio </it>and dual-genome gene predictions without <it>a priori </it>transcript sequence support, these constitute a marginal fraction of the entire GENCODE annotation set. The strategy is described in detail elsewhere in this issue <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>. We used this final annotation as the reference set for EGASP, and refer to it as the GENCODE annotation.</p>
                  <p>The protein coding GENCODE annotation for all 44 ENCODE regions consists of 2,471 total transcripts representing 434 unique protein coding gene loci. There are 1,097 coding transcripts that code for 993 unique proteins. The annotation identifies 5.7 total transcripts per locus, with an average of 2.52 coding transcripts. Of the 434 coding loci, 393 contain multi-exon transcripts. In line with earlier estimates <abbrgrp><abbr bid="B43">43</abbr></abbrgrp>, 86% of the multi-exon loci exhibit alternative splicing in either the coding or non-coding transcripts. Sixty percent of multi-exon loci have alternative coding transcripts. See <abbrgrp><abbr bid="B33">33</abbr></abbrgrp> in this issue for additional details.</p>
               </sec>
               <sec>
                  <st>
                     <p>Incomplete annotation</p>
                  </st>
                  <p>The GENCODE annotation includes incomplete genes and transcripts. These are caused both by the truncation of some features at the end of the ENCODE regions and by transcript annotations that may be incomplete due to lack of evidence. In the rare case that an exon crossed an ENCODE region boundary, the exon was truncated at the ENCODE region boundary in both the annotations and the predictions to ensure that the nucleotide level evaluation statistics were computed correctly (see Materials and methods).</p>
               </sec>
            </sec>
            <sec>
               <st>
                  <p>EGASP: a community experiment</p>
               </st>
               <p>To determine an automatic method's ability to reproduce the GENCODE annotation, we organized EGASP in the following way: In January 2005, the GENCODE annotation for 13 of the 44 ENCODE regions (the 'training regions' defined above) was publicly released. With the release of this annotation, EGASP was officially announced: gene and other DNA feature prediction groups world-wide were asked to submit genome annotations on the remaining 31 ENCODE regions, for which the GENCODE annotations would not be released until the deadline for submission expired. Participating groups had access to the annotation of the 13 training regions, as well as to the sequences and all additional publicly available data for all 44 ENCODE regions. No other pre-defined and pre-selected auxiliary data, such as cDNA databases, EST sequences or other genome alignments, were given to the submitters. However, many of the 31 test regions had been previously and extensively annotated by other groups. For example, <it>ENm001</it>, the greater cystic fibrosis transmembrane conductance regulator (CFTR) region, has been extensively studied <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>.</p>
               <p>Participants were asked to submit their genome annotations on the 31 ENCODE test regions, using whatever methods and data were available to them. To be able to better compare different DNA feature prediction methods, we predefined the following prediction categories and asked the submitters to indicate in which category they were submitting: methods using any type of available information; single-genome <it>ab initio </it>methods; EST-, mRNA-, and protein-based methods; dual- or multiple-genome based methods; methods predicting unusual genes (non-canonical splicing, short intron-less genes, and so on); and exon-only predictions.</p>
               <p>Finally, we allowed an extra category (category 7) for methods predicting other annotation features, including pseudogenes and promoters. Bajic <it>et al</it>. <abbrgrp><abbr bid="B45">45</abbr></abbrgrp> have conducted a comprehensive evaluation of the promoter predictions; see Zheng and Gerstein <abbrgrp><abbr bid="B46">46</abbr></abbrgrp> for a paper on pseudogenes.</p>
               <p>A web server (Figure <figr fid="F1">1</figr>) <abbrgrp><abbr bid="B47">47</abbr></abbrgrp> was set up to collect all the submissions and each group was able to submit predictions for more than one category. The submitted predictions, as well as the GENCODE annotations for the test sequence set, were kept confidential until the submission deadline on 15 April 2005. The format for submissions was the Gene Transfer Format (GTF) <abbrgrp><abbr bid="B48">48</abbr></abbrgrp>. An advisory committee (Table <tblr tid="T2">2</tblr>) was formed to oversee the submission and evaluation processes and provide advice for the evaluation.</p>
               <fig id="F1">
                  <title>
                     <p>Figure 1</p>
                  </title>
                  <caption>
                     <p>A screenshot of the EGASP submission server [47]</p>
                  </caption>
                  <text>
                     <p>A screenshot of the EGASP submission server [47]. The server was user-authenticated in order to keep the submitted predictions private before the EGASP workshop. Initially, there were eight suggested submission categories. However, after the workshop, category 5 was not used at all and was removed. Promoter and pseudogene predictions from category 8 were then kept as a new category 7, which is not analyzed in this paper (see [45] instead).</p>
                  </text>
                  <graphic file="gb-2006-7-s1-s2-1"/>
               </fig>
               <tbl id="T2">
                  <title>
                     <p>Table 2</p>
                  </title>
                  <caption>
                     <p>EGASP organizing and advisory committees</p>
                  </caption>
                  <tblbdy cols="2">
                     <r>
                        <c ca="left">
                           <p>Organizers</p>
                        </c>
                        <c ca="left">
                           <p>Advisory board</p>
                        </c>
                     </r>
                     <r>
                        <c cspan="2">
                           <hr/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>Jennifer Ashurst (Wellcome Trust Sanger Institute)</p>
                        </c>
                        <c ca="left">
                           <p>Michael Ashburner (Cambridge University)</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>Ewan Birney (European Bioinformatics Institute)</p>
                        </c>
                        <c ca="left">
                           <p>Vladimir B Bajic (Institute for Infocomm Research)</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>Peter Good (National Human Genome Research Institute)</p>
                        </c>
                        <c ca="left">
                           <p>Tom Gingeras (Affymetrix, Inc.)</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>Roderic Guig&#243; (Institut Municipal d'Investigaci&#243; M&#232;dica)</p>
                        </c>
                        <c ca="left">
                           <p>Suzanna Lewis (Berkeley)</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>Tim Hubbard (Wellcome Trust Sanger Institute)</p>
                        </c>
                        <c ca="left">
                           <p>Martin Reese (Omicia, Inc.)</p>
                        </c>
                     </r>
                  </tblbdy>
               </tbl>
               <p>By the submission deadline on 15 April 2005, 18 groups had submitted 30 prediction sets (Table <tblr tid="T3">3</tblr>). All the submitted predictions together with the annotations are available through the GencodeDB Genome Browser (Figure <figr fid="F2">2</figr>) <abbrgrp><abbr bid="B49">49</abbr></abbrgrp>, as well as through the UCSC Genome Browser ('EGASP' tracks). They can also be downloaded from the ftp server as plain text GTF files <abbrgrp><abbr bid="B50">50</abbr></abbrgrp>.</p>
               <tbl id="T3">
                  <title>
                     <p>Table 3</p>
                  </title>
                  <caption>
                     <p>Summary of programs used to determine predictions submitted for each EGASP category</p>
                  </caption>
                  <tblbdy cols="4">
                     <r>
                        <c ca="left">
                           <p>Submission category</p>
                        </c>
                        <c ca="center">
                           <p>Program</p>
                        </c>
                        <c ca="center">
                           <p>Affiliation</p>
                        </c>
                        <c ca="center">
                           <p>Reference</p>
                        </c>
                     </r>
                     <r>
                        <c cspan="4">
                           <hr/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1 (AUGUSTUS-any)</p>
                        </c>
                        <c ca="center">
                           <p>AUGUSTUS</p>
                        </c>
                        <c ca="center">
                           <p>Georg-August-Universit&#228;t, G&#246;ttingen</p>
                        </c>
                        <c ca="center">
                           <p>[58]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>2 (AUGUSTUS-abinit)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3 (AUGUSTUS-EST)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4 (AUGUSTUS-dual)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1</p>
                        </c>
                        <c ca="center">
                           <p>FGENESH++</p>
                        </c>
                        <c ca="center">
                           <p>Softberry Inc.</p>
                        </c>
                        <c ca="center">
                           <p>[56]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1</p>
                        </c>
                        <c ca="center">
                           <p>JIGSAW</p>
                        </c>
                        <c ca="center">
                           <p>The Institute for Genomic Research (TIGR)</p>
                        </c>
                        <c ca="center">
                           <p>[59]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1 (PAIRAGON-any)</p>
                        </c>
                        <c ca="center">
                           <p>PAIRAGON and NSCAN_EST</p>
                        </c>
                        <c ca="center">
                           <p>Washington University, Saint Louis (WUSTL)</p>
                        </c>
                        <c ca="center">
                           <p>[57]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3 (PAIRAGON+NSCAN_EST)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>2</p>
                        </c>
                        <c ca="center">
                           <p>GENEMARK.hmm</p>
                        </c>
                        <c ca="center">
                           <p>Georgia Institute of Technology</p>
                        </c>
                        <c ca="center">
                           <p>[60]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>2</p>
                        </c>
                        <c ca="center">
                           <p>GENEZILLA</p>
                        </c>
                        <c ca="center">
                           <p>TIGR</p>
                        </c>
                        <c ca="center">
                           <p>[81]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>ACEVIEW</p>
                        </c>
                        <c ca="center">
                           <p>National Center for Biotechnology Information (NCBI)</p>
                        </c>
                        <c ca="center">
                           <p>[52]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>ENSEMBL</p>
                        </c>
                        <c ca="center">
                           <p>The Wellcome Trust Sanger Institute (WTSI) and</p>
                        </c>
                        <c ca="center">
                           <p>[64]</p>
                        </c>
                     </r>
                     <r>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>European Bioinformatics Institute (EBI)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>EXOGEAN</p>
                        </c>
                        <c ca="center">
                           <p>Ecole Normale Superieure, Paris</p>
                        </c>
                        <c ca="center">
                           <p>[62]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>EXONHUNTER</p>
                        </c>
                        <c ca="center">
                           <p>University of Waterloo</p>
                        </c>
                        <c ca="center">
                           <p>[63]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>ACESCAN*</p>
                        </c>
                        <c ca="center">
                           <p>Salk Institute</p>
                        </c>
                        <c ca="center">
                           <p>[82]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>DOGFISH-C</p>
                        </c>
                        <c ca="center">
                           <p>WTSI</p>
                        </c>
                        <c ca="center">
                           <p>[67]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>NSCAN</p>
                        </c>
                        <c ca="center">
                           <p>WUSTL</p>
                        </c>
                        <c ca="center">
                           <p>[57]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>SAGA</p>
                        </c>
                        <c ca="center">
                           <p>University of California at Berkeley</p>
                        </c>
                        <c ca="center">
                           <p>[66]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>MARS</p>
                        </c>
                        <c ca="center">
                           <p>WUSTL - EBI</p>
                        </c>
                        <c ca="center">
                           <p>[65]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>5</p>
                        </c>
                        <c ca="center">
                           <p>GENEID-U12</p>
                        </c>
                        <c ca="center">
                           <p>Institut Municipal d'Investigaci&#243;</p>
                        </c>
                        <c ca="center">
                           <p>-</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>5</p>
                        </c>
                        <c ca="center">
                           <p>SGP2-U12</p>
                        </c>
                        <c ca="center">
                           <p>M&#232;dica, Barcelona</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6</p>
                        </c>
                        <c ca="center">
                           <p>ASPIC<sup>&#8224;</sup></p>
                        </c>
                        <c ca="center">
                           <p>Universit&#224; degli Studi di Milano</p>
                        </c>
                        <c ca="center">
                           <p>[83]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6 (AUGUSTUS-exon)</p>
                        </c>
                        <c ca="center">
                           <p>AUGUSTUS</p>
                        </c>
                        <c ca="center">
                           <p>Georg-August-Universit&#228;t, G&#246;ttingen</p>
                        </c>
                        <c ca="center">
                           <p>[58]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6</p>
                        </c>
                        <c ca="center">
                           <p>CSTMINER<sup>&#8225;</sup></p>
                        </c>
                        <c ca="center">
                           <p>Universit&#224; degli Studi di Milano</p>
                        </c>
                        <c ca="center">
                           <p>[84]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6</p>
                        </c>
                        <c ca="center">
                           <p>DOGFISH-C-E<sup>&#167;</sup></p>
                        </c>
                        <c ca="center">
                           <p>WTSI</p>
                        </c>
                        <c ca="center">
                           <p>[67]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6</p>
                        </c>
                        <c ca="center">
                           <p>SPIDA</p>
                        </c>
                        <c ca="center">
                           <p>EBI</p>
                        </c>
                        <c ca="center">
                           <p>[85]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>6</p>
                        </c>
                        <c ca="center">
                           <p>UNCOVER<sup>&#167;</sup></p>
                        </c>
                        <c ca="center">
                           <p>Duke University</p>
                        </c>
                        <c ca="center">
                           <p>[86]</p>
                        </c>
                     </r>
                     <r>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1</p>
                        </c>
                        <c ca="center">
                           <p>CCDSGene</p>
                        </c>
                        <c ca="center">
                           <p>UCSC tracks [7]</p>
                        </c>
                        <c ca="center">
                           <p>[55]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1</p>
                        </c>
                        <c ca="center">
                           <p>KNOWNGene</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[54]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>1</p>
                        </c>
                        <c ca="center">
                           <p>REFSEQ (REFGene)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[4]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>2</p>
                        </c>
                        <c ca="center">
                           <p>GENEID</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[19]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>2</p>
                        </c>
                        <c ca="center">
                           <p>GENSCAN</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[18]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>ACEMBLY</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[52]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>ECGene</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[53]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>ENSEMBL (ENSGene)</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[6]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>3</p>
                        </c>
                        <c ca="center">
                           <p>MGCGene</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[5]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>SGP2</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[9]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>4</p>
                        </c>
                        <c ca="center">
                           <p>TWINSCAN</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c ca="center">
                           <p>[12,13]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>-</p>
                        </c>
                        <c ca="center">
                           <p>CODING 20050607</p>
                        </c>
                        <c ca="center">
                           <p>GENCODE annotation</p>
                        </c>
                        <c ca="center">
                           <p>[33]</p>
                        </c>
                     </r>
                     <r>
                        <c ca="left">
                           <p>-</p>
                        </c>
                        <c ca="center">
                           <p>GENES 20050607</p>
                        </c>
                        <c>
                           <p/>
                        </c>
                        <c>
                           <p/>
                        </c>
                     </r>
                  </tblbdy>
                  <tblfn>
                     <p>A complete listing of the number of features for each sequence obtained by each method is available at the Supplementary material web page [51]. *The ACESCAN group submitted results only for the training set and, therefore, has not been evaluated. <sup>&#8224;</sup>ASPIC only provided results for the training regions and, therefore, has not been evaluated. Moreover, ASPIC submitted only intron annotations and should be considered in category 6. <sup>&#8225;</sup>CSTMINER predicts coding regions but does not provide strand information. <sup>&#167;</sup>DOGFISH-C-E and UNCOVER predict only novel exons; this makes it difficult to compare these methods with the others in the same category.</p>
                  </tblfn>
               </tbl>
               <fig id="F2">
                  <title>
                     <p>Figure 2</p>
                  </title>
                  <caption>
                     <p>The GencodeDB Genome Browser</p>
                  </caption>
                  <text>
                     <p>The GencodeDB Genome Browser. A screenshot of the GencodeDB Genome Browser [49], displaying the annotation features on 100 Kbp from the ENm001 region (chr7: 116,074,892-116,174,891). The annotations along with the predicted genes by each submitted method were made publicly available together with further experimental evidence, such as TARs/transfrags.</p>
                  </text>
                  <graphic file="gb-2006-7-s1-s2-2"/>
               </fig>
               <p>Predictions were compared with the reference set GENCODE annotations and assessed by members of the advisory and organizing committees (Table <tblr tid="T2">2</tblr>), all selected as independent experts in this field. The results of this assessment were presented at a workshop that took place at the Wellcome Trust Genome Campus in Hinxton, UK, on 6 and 7 May 2005. The advisory and organizing committees met on 4 May for a pre-evaluation of the predictions, and to determine a number of summary statistics. Each of the submitting groups was invited to present their methods and submissions at the workshop with a focus on what went right and what went wrong. In total, 16 groups were represented at the workshop. The final prediction evaluation results from the workshop are discussed in the next section.</p>
            </sec>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Results</p>
         </st>
         <sec>
            <st>
               <p>The evaluation of the predictions against the annotation</p>
            </st>
            <sec>
               <st>
                  <p>The protocol to evaluate the predictions</p>
               </st>
               <p>The main goal of the EGASP experiment was to evaluate the ability of automatic methods of genome annotation to reproduce the manual and experimental annotation of the ENCODE regions described above. By this standard, a perfect prediction strategy would produce annotation completely consistent with the GENCODE annotation.</p>
               <p>For the purposes of evaluating the submitted predictions, we considered only the results for the 31 test ENCODE regions, which were the 'blinded' regions for which no GENCODE annotations were available during the submission phase. Potential biases introduced by this restriction will be addressed below. The statistics reported are computed globally for the test regions, which means that the total numbers of prediction successes and failures for all 31 regions are compared directly to the total numbers of annotated exons, transcripts and genes for all 31 regions.</p>
               <p>We evaluated each set of submitted predictions at four distinct levels: nucleotide accuracy, exon accuracy, transcript accuracy, and gene accuracy. At the earlier GASP1 workshop, transcript accuracy levels were not assessed due to the limited transcript information and the lower levels of alternatively spliced transcripts in <it>Drosophila melanogaster </it><abbrgrp><abbr bid="B27">27</abbr></abbrgrp>. For this study we also made a distinction between the statistics calculated for the coding portions of the mRNA transcripts (coding sequence (CDS) evaluations) and the mRNA transcripts as a whole (mRNA evaluations).</p>
               <p>For each of the four levels, we calculated the sensitivity and specificity of the predictions as defined below. In some cases, we have also computed other standard measures previously used in the gene finding literature (see <abbrgrp><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr><abbr bid="B25">25</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp>). Many additional measures of accuracy have been computed on the EGASP predictions, and they are available through the Supplementary Material web page <abbrgrp><abbr bid="B51">51</abbr></abbrgrp>.</p>
            </sec>
            <sec>
               <st>
                  <p>Non-EGASP entries</p>
               </st>
               <p>To compare the EGASP results to existing community standards, we also evaluated the performance of 11 gene annotation tracks published in the UCSC Browser <abbrgrp><abbr bid="B7">7</abbr></abbrgrp> just before the start of the EGASP workshop. These tracks included two single genome <it>ab initio </it>prediction methods (GENSCAN <abbrgrp><abbr bid="B18">18</abbr></abbrgrp> and GENEID <abbrgrp><abbr bid="B19">19</abbr></abbrgrp>) and two dual-genome prediction methods (TWINSCAN <abbrgrp><abbr bid="B12">12</abbr><abbr bid="B13">13</abbr></abbrgrp> and SGP2 <abbrgrp><abbr bid="B9">9</abbr></abbrgrp>). We also considered four methods we classified as using expressed sequence (ENSGENE <abbrgrp><abbr bid="B6">6</abbr></abbrgrp>, ACEMBLY <abbrgrp><abbr bid="B52">52</abbr></abbrgrp>, MGCGENES <abbrgrp><abbr bid="B5">5</abbr></abbrgrp>, and ECGENE <abbrgrp><abbr bid="B53">53</abbr></abbrgrp>) and three we classified as using any information (UCSC 'KNOWN' genes <abbrgrp><abbr bid="B54">54</abbr></abbrgrp>, REFSEQ genes <abbrgrp><abbr bid="B4">4</abbr></abbrgrp>, and CCDSGENES <abbrgrp><abbr bid="B55">55</abbr></abbrgrp>).</p>
            </sec>
            <sec>
               <st>
                  <p>Measures used for evaluating predictions: definitions</p>
               </st>
               <p>Nucleotide level accuracy is a comparison of the annotated nucleotides with the predicted nucleotides. Individual nucleotides appearing in more than one transcript in either the annotation or the predictions are considered only once for the nucleotide level statistics (Figure <figr fid="F3">3a</figr>). Nucleotide predictions must be on the same strand as the annotations to be counted as correct. At the nucleotide level, sensitivity (Sn) is the proportion of annotated nucleotides (as being coding or part of an mRNA molecule) that is correctly predicted, and specificity (Sp) the proportion of predicted nucleotides (as being coding or part of an mRNA molecule) that is so annotated. As a summary measure, we have computed either the simple average of these two measures, or the correlation coefficient between the annotated and the predicted nucleotides (see <abbrgrp><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr><abbr bid="B25">25</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp>).</p>
               <fig id="F3">
                  <title>
                     <p>Figure 3</p>
                  </title>
                  <caption>
                     <p>Gene Feature Projection for evaluation</p>
                  </caption>
                  <text>
                     <p>Gene Feature Projection for evaluation. The process of projecting genic features into unique nucleotide and exon coordinates in order to compute the accuracy values (see text for details).</p>
                  </text>
                  <graphic file="gb-2006-7-s1-s2-3"/>
               </fig>
               <p>The exon level accuracy is calculated with the requirement that an exon in the prediction must have the same start and end coordinates as an exon in the annotation to be counted correct. Only the unique exons in each set are considered (see Figure <figr fid="F3">3b</figr> for a graphical example of how unique exons are collected from both the annotation and prediction sets; also see <abbrgrp><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr><abbr bid="B25">25</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp> for more details on these definitions). At the exon level, sensitivity is computed as the proportion of annotated exons correctly predicted, and specificity as the proportion of predicted exons that is annotated. As a summary measure, we have computed the average of these two measures. In addition, we have computed 'missing exons' (MEs), the proportion of annotated exons totally missed by the predictions (that is, no predicted exon overlaps them by at least 1 bp), and 'wrong exons' (WEs), the proportion of predicted exons not overlapping annotated exons by at least 1 bp. A subset of predicted exons falling in regions annotated as intergenic has been tested experimentally (see the section 'The experimental test of unannotated predictions' below for details). Nucleotide and exon level accuracy are calculated for the CDS evaluation and for the mRNA evaluation. Comparison of the results of these evaluation strategies highlights the differences for those programs that attempt to predict untranslated regions (UTRs) of genes.</p>
               <p>The transcript and gene level accuracy measures are more stringent. We consider a transcript accurately predicted for the CDS evaluation if the beginning and end of translation are correctly annotated and each of the 5' and 3' splice sites for the coding exons are correct. Similarly, for the mRNA evaluation, a transcript is counted correct if all of the exons from the start of transcription to the end of transcription are correctly predicted. Thus, at the transcript level, sensitivity is the proportion of annotated transcripts that is correctly predicted, and specificity is the proportion of predicted transcripts that is correct. A gene is counted correct if at least one transcript in the locus is correct as defined above, and sensitivity and specificity are defined accordingly. Using these definitions, transcript accuracy is the most stringent measure for both the CDS evaluation and for the mRNA evaluation (Figure <figr fid="F4">4</figr>).</p>
               <fig id="F4">
                  <title>
                     <p>Figure 4</p>
                  </title>
                  <caption>
                     <p>Gene transcript evaluation</p>
                  </caption>
                  <text>
                     <p>Gene transcript evaluation. Computing sensitivity and specificity at transcript level: <b>(a) </b>complete transcript annotation; <b>(b) </b>incomplete transcript annotation. Transcripts marked with an asterisk are considered 'consistent with the annotation' and will be scored as correct.</p>
                  </text>
                  <graphic file="gb-2006-7-s1-s2-4"/>
               </fig>
               <p>The accuracy of the prediction methods must be considered in the context of the annotation, which contains a significant fraction of incomplete transcripts. In the case of an incomplete transcript, we made the distinction that if a prediction is completely consistent with the annotation, it will be counted correct. For example, if the annotation contains an incomplete transcript with three exons and a prediction method includes a transcript with these exons plus an additional exon, we consider the prediction to be completely consistent with the annotation and count it as a correct prediction. For the CDS evaluation, if the annotation contains a complete coding transcript, it must be predicted correctly and no additional exons are allowed (Figure <figr fid="F4">4</figr>).</p>
            </sec>
         </sec>
         <sec>
            <st>
               <p>Global results and trends</p>
            </st>
            <p>The evaluation statistics discussed above for the CDS evaluation are provided in Tables <tblr tid="T4">4</tblr> and <tblr tid="T5">5</tblr> and for the mRNA evaluation in Table <tblr tid="T6">6</tblr>, which only lists methods that predict full mRNA transcripts. Figures <figr fid="F5">5</figr>, <figr fid="F6">6</figr>, <figr fid="F7">7</figr>, <figr fid="F8">8</figr> display the results for the CDS evaluation at the nucleotide, exon, transcript and gene levels. Values are given for programs in categories 1 to 4 (see previous section and Table <tblr tid="T3">3</tblr>), which constitute the bulk of the submitted predictions. The accuracies of the programs in other categories are often not strictly comparable and, therefore, not shown in these figures. They are, however, given in the Supplementary material <abbrgrp><abbr bid="B51">51</abbr></abbrgrp>. The top panel in Figures <figr fid="F5">5</figr>, <figr fid="F6">6</figr>, <figr fid="F7">7</figr>, <figr fid="F8">8</figr> is a dotplot of sensitivity versus specificity, where each dot represents the performance of one program. The bottom panel includes a boxplot for each program displaying the average of sensitivity and specificity (that is, (Sn + Sp)/2) for the given program on each of 27 test sequences (see Materials and methods). Four test sequences (ENr112, ENr113, ENr311, ENr313) were removed from the original set of 31 because they did not contain any annotated protein coding genes and, therefore, sensitivity and specificity could not be computed for them. The dotplot is intended to capture the global balance between sensitivity and specificity for each program, while the boxplots provide the dispersion of the accuracy of each program's predictions across test sequences. At similar average accuracies, programs providing more consistent predictions across sequences may be preferable since their behavior can be better anticipated.</p>
            <tbl id="T4">
               <title>
                  <p>Table 4</p>
               </title>
               <caption>
                  <p>CDS assessment: summary of accuracy measures for CDS features at the nucleotide and exon levels</p>
               </caption>
               <tblbdy cols="8">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="3" ca="center">
                        <p>Nucleotide</p>
                     </c>
                     <c cspan="4" ca="center">
                        <p>Exon</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="3">
                        <hr/>
                     </c>
                     <c cspan="4">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>N<it>Sn</it></p>
                     </c>
                     <c ca="center">
                        <p>N<it>Sp</it></p>
                     </c>
                     <c ca="center">
                        <p>N <it>CC</it></p>
                     </c>
                     <c ca="center">
                        <p>E<it>Sn</it></p>
                     </c>
                     <c ca="center">
                        <p>E<it>Sp</it></p>
                     </c>
                     <c ca="center">
                        <p>ME</p>
                     </c>
                     <c ca="center">
                        <p>WE</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="8">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 1</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-any</p>
                     </c>
                     <c ca="center">
                        <p>94.42%</p>
                     </c>
                     <c ca="center">
                        <p>82.43%</p>
                     </c>
                     <c ca="center">
                        <p>0.88</p>
                     </c>
                     <c ca="center">
                        <p>74.67%</p>
                     </c>
                     <c ca="center">
                        <p>76.76%</p>
                     </c>
                     <c ca="center">
                        <p>8.25%</p>
                     </c>
                     <c ca="center">
                        <p>16.29%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>FGENESH++</p>
                     </c>
                     <c ca="center">
                        <p>91.09%</p>
                     </c>
                     <c ca="center">
                        <p>76.89%</p>
                     </c>
                     <c ca="center">
                        <p>0.83</p>
                     </c>
                     <c ca="center">
                        <p>75.18%</p>
                     </c>
                     <c ca="center">
                        <p>69.31%</p>
                     </c>
                     <c ca="center">
                        <p>9.73%</p>
                     </c>
                     <c ca="center">
                        <p>24.64%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>JIGSAW</p>
                     </c>
                     <c ca="center">
                        <p>94.56%</p>
                     </c>
                     <c ca="center">
                        <p>92.19%</p>
                     </c>
                     <c ca="center">
                        <p>0.93</p>
                     </c>
                     <c ca="center">
                        <p>80.61%</p>
                     </c>
                     <c ca="center">
                        <p>89.33%</p>
                     </c>
                     <c ca="center">
                        <p>6.22%</p>
                     </c>
                     <c ca="center">
                        <p>7.78%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>PAIRAGON-any</p>
                     </c>
                     <c ca="center">
                        <p>87.77%</p>
                     </c>
                     <c ca="center">
                        <p>92.78%</p>
                     </c>
                     <c ca="center">
                        <p>0.90</p>
                     </c>
                     <c ca="center">
                        <p>76.85%</p>
                     </c>
                     <c ca="center">
                        <p>88.91%</p>
                     </c>
                     <c ca="center">
                        <p>11.18%</p>
                     </c>
                     <c ca="center">
                        <p>6.82%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 2</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-abinit</p>
                     </c>
                     <c ca="center">
                        <p>78.65%</p>
                     </c>
                     <c ca="center">
                        <p>75.29%</p>
                     </c>
                     <c ca="center">
                        <p>0.76</p>
                     </c>
                     <c ca="center">
                        <p>52.39%</p>
                     </c>
                     <c ca="center">
                        <p>62.93%</p>
                     </c>
                     <c ca="center">
                        <p>29.09%</p>
                     </c>
                     <c ca="center">
                        <p>24.82%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEMARK.hmm-A</p>
                     </c>
                     <c ca="center">
                        <p>78.43%</p>
                     </c>
                     <c ca="center">
                        <p>37.97%</p>
                     </c>
                     <c ca="center">
                        <p>0.53</p>
                     </c>
                     <c ca="center">
                        <p>50.58%</p>
                     </c>
                     <c ca="center">
                        <p>29.01%</p>
                     </c>
                     <c ca="center">
                        <p>27.86%</p>
                     </c>
                     <c ca="center">
                        <p>63.27%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEMARK.hmm-B</p>
                     </c>
                     <c ca="center">
                        <p>76.09%</p>
                     </c>
                     <c ca="center">
                        <p>62.94%</p>
                     </c>
                     <c ca="center">
                        <p>0.69</p>
                     </c>
                     <c ca="center">
                        <p>48.15%</p>
                     </c>
                     <c ca="center">
                        <p>47.25%</p>
                     </c>
                     <c ca="center">
                        <p>31.77%</p>
                     </c>
                     <c ca="center">
                        <p>40.68%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEZILLA</p>
                     </c>
                     <c ca="center">
                        <p>87.56%</p>
                     </c>
                     <c ca="center">
                        <p>50.93%</p>
                     </c>
                     <c ca="center">
                        <p>0.66</p>
                     </c>
                     <c ca="center">
                        <p>62.08%</p>
                     </c>
                     <c ca="center">
                        <p>50.25%</p>
                     </c>
                     <c ca="center">
                        <p>19.14%</p>
                     </c>
                     <c ca="center">
                        <p>41.93%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 3</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>ACEVIEW</p>
                     </c>
                     <c ca="center">
                        <p>90.94%</p>
                     </c>
                     <c ca="center">
                        <p>79.14%</p>
                     </c>
                     <c ca="center">
                        <p>0.84</p>
                     </c>
                     <c ca="center">
                        <p>85.75%</p>
                     </c>
                     <c ca="center">
                        <p>56.98%</p>
                     </c>
                     <c ca="center">
                        <p>4.38%</p>
                     </c>
                     <c ca="center">
                        <p>16.69%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-EST</p>
                     </c>
                     <c ca="center">
                        <p>92.62%</p>
                     </c>
                     <c ca="center">
                        <p>83.45%</p>
                     </c>
                     <c ca="center">
                        <p>0.88</p>
                     </c>
                     <c ca="center">
                        <p>74.10%</p>
                     </c>
                     <c ca="center">
                        <p>77.40%</p>
                     </c>
                     <c ca="center">
                        <p>9.01%</p>
                     </c>
                     <c ca="center">
                        <p>15.61%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>ENSEMBL</p>
                     </c>
                     <c ca="center">
                        <p>90.18%</p>
                     </c>
                     <c ca="center">
                        <p>92.02%</p>
                     </c>
                     <c ca="center">
                        <p>0.91</p>
                     </c>
                     <c ca="center">
                        <p>77.53%</p>
                     </c>
                     <c ca="center">
                        <p>82.65%</p>
                     </c>
                     <c ca="center">
                        <p>9.99%</p>
                     </c>
                     <c ca="center">
                        <p>9.22%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>EXOGEAN</p>
                     </c>
                     <c ca="center">
                        <p>84.18%</p>
                     </c>
                     <c ca="center">
                        <p>94.33%</p>
                     </c>
                     <c ca="center">
                        <p>0.89</p>
                     </c>
                     <c ca="center">
                        <p>79.34%</p>
                     </c>
                     <c ca="center">
                        <p>83.45%</p>
                     </c>
                     <c ca="center">
                        <p>9.88%</p>
                     </c>
                     <c ca="center">
                        <p>5.06%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>EXONHUNTER</p>
                     </c>
                     <c ca="center">
                        <p>90.46%</p>
                     </c>
                     <c ca="center">
                        <p>59.67%</p>
                     </c>
                     <c ca="center">
                        <p>0.73</p>
                     </c>
                     <c ca="center">
                        <p>64.44%</p>
                     </c>
                     <c ca="center">
                        <p>41.77%</p>
                     </c>
                     <c ca="center">
                        <p>14.29%</p>
                     </c>
                     <c ca="center">
                        <p>50.94%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>PAIRAGON+NSCAN_EST</p>
                     </c>
                     <c ca="center">
                        <p>87.56%</p>
                     </c>
                     <c ca="center">
                        <p>92.77%</p>
                     </c>
                     <c ca="center">
                        <p>0.90</p>
                     </c>
                     <c ca="center">
                        <p>76.63%</p>
                     </c>
                     <c ca="center">
                        <p>88.95%</p>
                     </c>
                     <c ca="center">
                        <p>11.51%</p>
                     </c>
                     <c ca="center">
                        <p>6.85%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 4</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-dual</p>
                     </c>
                     <c ca="center">
                        <p>88.86%</p>
                     </c>
                     <c ca="center">
                        <p>80.15%</p>
                     </c>
                     <c ca="center">
                        <p>0.84</p>
                     </c>
                     <c ca="center">
                        <p>63.06%</p>
                     </c>
                     <c ca="center">
                        <p>69.14%</p>
                     </c>
                     <c ca="center">
                        <p>16.82%</p>
                     </c>
                     <c ca="center">
                        <p>19.60%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>DOGFISH</p>
                     </c>
                     <c ca="center">
                        <p>64.81%</p>
                     </c>
                     <c ca="center">
                        <p>88.24%</p>
                     </c>
                     <c ca="center">
                        <p>0.74</p>
                     </c>
                     <c ca="center">
                        <p>53.11%</p>
                     </c>
                     <c ca="center">
                        <p>77.34%</p>
                     </c>
                     <c ca="center">
                        <p>32.67%</p>
                     </c>
                     <c ca="center">
                        <p>11.70%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>MARS</p>
                     </c>
                     <c ca="center">
                        <p>84.25%</p>
                     </c>
                     <c ca="center">
                        <p>74.13%</p>
                     </c>
                     <c ca="center">
                        <p>0.78</p>
                     </c>
                     <c ca="center">
                        <p>65.56%</p>
                     </c>
                     <c ca="center">
                        <p>61.65%</p>
                     </c>
                     <c ca="center">
                        <p>20.26%</p>
                     </c>
                     <c ca="center">
                        <p>26.10%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>NSCAN</p>
                     </c>
                     <c ca="center">
                        <p>85.38%</p>
                     </c>
                     <c ca="center">
                        <p>89.02%</p>
                     </c>
                     <c ca="center">
                        <p>0.87</p>
                     </c>
                     <c ca="center">
                        <p>67.66%</p>
                     </c>
                     <c ca="center">
                        <p>82.05%</p>
                     </c>
                     <c ca="center">
                        <p>17.11%</p>
                     </c>
                     <c ca="center">
                        <p>10.93%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>SAGA</p>
                     </c>
                     <c ca="center">
                        <p>52.54%</p>
                     </c>
                     <c ca="center">
                        <p>81.39%</p>
                     </c>
                     <c ca="center">
                        <p>0.65</p>
                     </c>
                     <c ca="center">
                        <p>38.82%</p>
                     </c>
                     <c ca="center">
                        <p>50.73%</p>
                     </c>
                     <c ca="center">
                        <p>40.48%</p>
                     </c>
                     <c ca="center">
                        <p>27.85%</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>UCSC Tracks</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>ACEMBLY</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>96.43%</p>
                     </c>
                     <c ca="center">
                        <p>58.47%</p>
                     </c>
                     <c ca="center">
                        <p>0.74</p>
                     </c>
                     <c ca="center">
                        <p>84.66%</p>
                     </c>
                     <c ca="center">
                        <p>38.32%</p>
                     </c>
                     <c ca="center">
                        <p>2.71%</p>
                     </c>
                     <c ca="center">
                        <p>28.55%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>CCDSgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>56.87%</p>
                     </c>
                     <c ca="center">
                        <p>99.52%</p>
                     </c>
                     <c ca="center">
                        <p>0.75</p>
                     </c>
                     <c ca="center">
                        <p>51.95%</p>
                     </c>
                     <c ca="center">
                        <p>97.75%</p>
                     </c>
                     <c ca="center">
                        <p>40.38%</p>
                     </c>
                     <c ca="center">
                        <p>0.27%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>ECgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>96.36%</p>
                     </c>
                     <c ca="center">
                        <p>47.30%</p>
                     </c>
                     <c ca="center">
                        <p>0.66</p>
                     </c>
                     <c ca="center">
                        <p>86.22%</p>
                     </c>
                     <c ca="center">
                        <p>35.08%</p>
                     </c>
                     <c ca="center">
                        <p>2.64%</p>
                     </c>
                     <c ca="center">
                        <p>45.92%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>ENSgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>91.39%</p>
                     </c>
                     <c ca="center">
                        <p>91.92%</p>
                     </c>
                     <c ca="center">
                        <p>0.92</p>
                     </c>
                     <c ca="center">
                        <p>77.71%</p>
                     </c>
                     <c ca="center">
                        <p>82.39%</p>
                     </c>
                     <c ca="center">
                        <p>9.80%</p>
                     </c>
                     <c ca="center">
                        <p>9.21%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>GENEID</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>76.77%</p>
                     </c>
                     <c ca="center">
                        <p>76.48%</p>
                     </c>
                     <c ca="center">
                        <p>0.76</p>
                     </c>
                     <c ca="center">
                        <p>53.84%</p>
                     </c>
                     <c ca="center">
                        <p>61.08%</p>
                     </c>
                     <c ca="center">
                        <p>27.86%</p>
                     </c>
                     <c ca="center">
                        <p>27.26%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>GENSCAN</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>84.17%</p>
                     </c>
                     <c ca="center">
                        <p>60.60%</p>
                     </c>
                     <c ca="center">
                        <p>0.71</p>
                     </c>
                     <c ca="center">
                        <p>58.65%</p>
                     </c>
                     <c ca="center">
                        <p>46.37%</p>
                     </c>
                     <c ca="center">
                        <p>19.50%</p>
                     </c>
                     <c ca="center">
                        <p>42.91%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>KNOWNgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>89.10%</p>
                     </c>
                     <c ca="center">
                        <p>93.61%</p>
                     </c>
                     <c ca="center">
                        <p>0.91</p>
                     </c>
                     <c ca="center">
                        <p>78.11%</p>
                     </c>
                     <c ca="center">
                        <p>82.28%</p>
                     </c>
                     <c ca="center">
                        <p>10.27%</p>
                     </c>
                     <c ca="center">
                        <p>4.30%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>MGCgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>44.06%</p>
                     </c>
                     <c ca="center">
                        <p>97.56%</p>
                     </c>
                     <c ca="center">
                        <p>0.65</p>
                     </c>
                     <c ca="center">
                        <p>42.95%</p>
                     </c>
                     <c ca="center">
                        <p>93.61%</p>
                     </c>
                     <c ca="center">
                        <p>49.28%</p>
                     </c>
                     <c ca="center">
                        <p>2.68%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>REFgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>85.34%</p>
                     </c>
                     <c ca="center">
                        <p>98.50%</p>
                     </c>
                     <c ca="center">
                        <p>0.92</p>
                     </c>
                     <c ca="center">
                        <p>73.23%</p>
                     </c>
                     <c ca="center">
                        <p>94.67%</p>
                     </c>
                     <c ca="center">
                        <p>15.38%</p>
                     </c>
                     <c ca="center">
                        <p>1.22%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>SGPgene</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>82.81%</p>
                     </c>
                     <c ca="center">
                        <p>82.20%</p>
                     </c>
                     <c ca="center">
                        <p>0.82</p>
                     </c>
                     <c ca="center">
                        <p>60.56%</p>
                     </c>
                     <c ca="center">
                        <p>65.16%</p>
                     </c>
                     <c ca="center">
                        <p>19.36%</p>
                     </c>
                     <c ca="center">
                        <p>22.85%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>
                           <it>TWINSCAN</it>
                        </p>
                     </c>
                     <c ca="center">
                        <p>78.16%</p>
                     </c>
                     <c ca="center">
                        <p>84.59%</p>
                     </c>
                     <c ca="center">
                        <p>0.81</p>
                     </c>
                     <c ca="center">
                        <p>58.43%</p>
                     </c>
                     <c ca="center">
                        <p>73.11%</p>
                     </c>
                     <c ca="center">
                        <p>24.64%</p>
                     </c>
                     <c ca="center">
                        <p>16.30%</p>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>CC, correlation coefficient.</p>
               </tblfn>
            </tbl>
            <tbl id="T5">
               <title>
                  <p>Table 5</p>
               </title>
               <caption>
                  <p>CDS assessment at the transcript and gene levels</p>
               </caption>
               <tblbdy cols="6">
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Transcript</p>
                     </c>
                     <c cspan="2" ca="center">
                        <p>Gene</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                     <c cspan="2">
                        <hr/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="center">
                        <p>T<it>Sn</it></p>
                     </c>
                     <c ca="center">
                        <p>T<it>Sp</it></p>
                     </c>
                     <c ca="center">
                        <p>G<it>Sn</it></p>
                     </c>
                     <c ca="center">
                        <p>G<it>Sp</it></p>
                     </c>
                     <c ca="center">
                        <p>Ratio CDS/UTR</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="6">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 1</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-any</p>
                     </c>
                     <c ca="center">
                        <p>22.65%</p>
                     </c>
                     <c ca="center">
                        <p>35.59%</p>
                     </c>
                     <c ca="center">
                        <p>47.97%</p>
                     </c>
                     <c ca="center">
                        <p>35.59%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>FGENESH++</p>
                     </c>
                     <c ca="center">
                        <p>36.21%</p>
                     </c>
                     <c ca="center">
                        <p>41.61%</p>
                     </c>
                     <c ca="center">
                        <p>69.93%</p>
                     </c>
                     <c ca="center">
                        <p>42.09%</p>
                     </c>
                     <c ca="center">
                        <p>78.25%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>JIGSAW</p>
                     </c>
                     <c ca="center">
                        <p>34.05%</p>
                     </c>
                     <c ca="center">
                        <p>65.95%</p>
                     </c>
                     <c ca="center">
                        <p>72.64%</p>
                     </c>
                     <c ca="center">
                        <p>65.95%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>PAIRAGON-any</p>
                     </c>
                     <c ca="center">
                        <p>39.29%</p>
                     </c>
                     <c ca="center">
                        <p>60.34%</p>
                     </c>
                     <c ca="center">
                        <p>69.59%</p>
                     </c>
                     <c ca="center">
                        <p>61.32%</p>
                     </c>
                     <c ca="center">
                        <p>62.92%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 2</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-abinit</p>
                     </c>
                     <c ca="center">
                        <p>11.09%</p>
                     </c>
                     <c ca="center">
                        <p>17.22%</p>
                     </c>
                     <c ca="center">
                        <p>24.32%</p>
                     </c>
                     <c ca="center">
                        <p>17.22%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEMARK.hmm-A</p>
                     </c>
                     <c ca="center">
                        <p>6.93%</p>
                     </c>
                     <c ca="center">
                        <p>3.24%</p>
                     </c>
                     <c ca="center">
                        <p>15.20%</p>
                     </c>
                     <c ca="center">
                        <p>3.24%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEMARK.hmm-B</p>
                     </c>
                     <c ca="center">
                        <p>7.70%</p>
                     </c>
                     <c ca="center">
                        <p>7.91%</p>
                     </c>
                     <c ca="center">
                        <p>16.89%</p>
                     </c>
                     <c ca="center">
                        <p>7.91%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>GENEZILLA</p>
                     </c>
                     <c ca="center">
                        <p>9.09%</p>
                     </c>
                     <c ca="center">
                        <p>8.84%</p>
                     </c>
                     <c ca="center">
                        <p>19.59%</p>
                     </c>
                     <c ca="center">
                        <p>8.84%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Category 3</p>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                     <c>
                        <p/>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>ACEVIEW</p>
                     </c>
                     <c ca="center">
                        <p>44.68%</p>
                     </c>
                     <c ca="center">
                        <p>19.31%</p>
                     </c>
                     <c ca="center">
                        <p>63.51%</p>
                     </c>
                     <c ca="center">
                        <p>48.65%</p>
                     </c>
                     <c ca="center">
                        <p>49.15%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>AUGUSTUS-EST</p>
                     </c>
                     <c ca="center">
                        <p>22.50%</p>
                     </c>
                     <c ca="center">
                        <p>37.01%</p>
                     </c>
                     <c ca="center">
                        <p>47.64%</p>
                     </c>
                     <c ca="center">
                        <p>37.01%</p>
                     </c>
                     <c ca="center">
                        <p>100.00%</p>
                     </c>
                  </r>
                  <r>
                     <c indent="1" ca="left">
                        <p>ENSEMBL</p>
                     </c>
                     <c ca="center">
                        <p>39.75%</p>
                     </c>
                     <c ca="center">
                        <p>54.64%</p>
                     </c>
                     <c ca="center">
                        <p>71.62%</p>
                     </c>
  