<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
   <ui>gb-2008-9-s1-s2</ui>
   <ji>GBJ</ji>
   <fm>
      <dochead>Research</dochead>
      <bibl>
         <title>
            <p>A critical assessment of <it>Mus musculus </it>gene function prediction using integrated genomic evidence</p>
         </title>
         <aug>
            <au id="A1">
               <snm>Pe&#241;a-Castillo</snm>
               <fnm>Lourdes</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A2">
               <snm>Tasan</snm>
               <fnm>Murat</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A3">
               <snm>Myers</snm>
               <mi>L</mi>
               <fnm>Chad</fnm>
               <insr iid="I3"/>
            </au>
            <au id="A4">
               <snm>Lee</snm>
               <fnm>Hyunju</fnm>
               <insr iid="I4"/>
            </au>
            <au id="A5">
               <snm>Joshi</snm>
               <fnm>Trupti</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A6">
               <snm>Zhang</snm>
               <fnm>Chao</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A7">
               <snm>Guan</snm>
               <fnm>Yuanfang</fnm>
               <insr iid="I3"/>
            </au>
            <au id="A8">
               <snm>Leone</snm>
               <fnm>Michele</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A9">
               <snm>Pagnani</snm>
               <fnm>Andrea</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A10">
               <snm>Kim</snm>
               <mnm>Kyu</mnm>
               <fnm>Wan</fnm>
               <insr iid="I7"/>
            </au>
            <au id="A11">
               <snm>Krumpelman</snm>
               <fnm>Chase</fnm>
               <insr iid="I8"/>
            </au>
            <au id="A12">
               <snm>Tian</snm>
               <fnm>Weidong</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A13">
               <snm>Obozinski</snm>
               <fnm>Guillaume</fnm>
               <insr iid="I9"/>
            </au>
            <au id="A14">
               <snm>Qi</snm>
               <fnm>Yanjun</fnm>
               <insr iid="I10"/>
            </au>
            <au id="A15">
               <snm>Mostafavi</snm>
               <fnm>Sara</fnm>
               <insr iid="I11"/>
            </au>
            <au id="A16">
               <snm>Lin</snm>
               <mnm>Ning</mnm>
               <fnm>Guan</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A17">
               <snm>Berriz</snm>
               <mi>F</mi>
               <fnm>Gabriel</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A18">
               <snm>Gibbons</snm>
               <mi>D</mi>
               <fnm>Francis</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A19">
               <snm>Lanckriet</snm>
               <fnm>Gert</fnm>
               <insr iid="I12"/>
            </au>
            <au id="A20">
               <snm>Qiu</snm>
               <fnm>Jian</fnm>
               <insr iid="I13"/>
            </au>
            <au id="A21">
               <snm>Grant</snm>
               <fnm>Charles</fnm>
               <insr iid="I13"/>
            </au>
            <au id="A22">
               <snm>Barutcuoglu</snm>
               <fnm>Zafer</fnm>
               <insr iid="I14"/>
            </au>
            <au id="A23">
               <snm>Hill</snm>
               <mi>P</mi>
               <fnm>David</fnm>
               <insr iid="I15"/>
            </au>
            <au id="A24">
               <snm>Warde-Farley</snm>
               <fnm>David</fnm>
               <insr iid="I11"/>
            </au>
            <au id="A25">
               <snm>Grouios</snm>
               <fnm>Chris</fnm>
               <insr iid="I1"/>
            </au>
            <au id="A26">
               <snm>Ray</snm>
               <fnm>Debajyoti</fnm>
               <insr iid="I16"/>
            </au>
            <au id="A27">
               <snm>Blake</snm>
               <mi>A</mi>
               <fnm>Judith</fnm>
               <insr iid="I15"/>
            </au>
            <au id="A28">
               <snm>Deng</snm>
               <fnm>Minghua</fnm>
               <insr iid="I17"/>
            </au>
            <au id="A29">
               <snm>Jordan</snm>
               <mi>I</mi>
               <fnm>Michael</fnm>
               <insr iid="I18"/>
            </au>
            <au id="A30">
               <snm>Noble</snm>
               <mi>S</mi>
               <fnm>William</fnm>
               <insr iid="I19"/>
            </au>
            <au id="A31">
               <snm>Morris</snm>
               <fnm>Quaid</fnm>
               <insr iid="I1"/>
               <insr iid="I11"/>
               <insr iid="I20"/>
            </au>
            <au id="A32">
               <snm>Klein-Seetharaman</snm>
               <fnm>Judith</fnm>
               <insr iid="I21"/>
            </au>
            <au id="A33">
               <snm>Bar-Joseph</snm>
               <fnm>Ziv</fnm>
               <insr iid="I10"/>
            </au>
            <au id="A34">
               <snm>Chen</snm>
               <fnm>Ting</fnm>
               <insr iid="I22"/>
            </au>
            <au id="A35">
               <snm>Sun</snm>
               <fnm>Fengzhu</fnm>
               <insr iid="I22"/>
            </au>
            <au id="A36">
               <snm>Troyanskaya</snm>
               <mi>G</mi>
               <fnm>Olga</fnm>
               <insr iid="I3"/>
            </au>
            <au id="A37">
               <snm>Marcotte</snm>
               <mi>M</mi>
               <fnm>Edward</fnm>
               <insr iid="I7"/>
            </au>
            <au id="A38">
               <snm>Xu</snm>
               <fnm>Dong</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A39" ca="yes">
               <snm>Hughes</snm>
               <mi>R</mi>
               <fnm>Timothy</fnm>
               <insr iid="I1"/>
               <insr iid="I20"/>
               <email>t.hughes@utoronto.ca</email>
            </au>
            <au id="A40" ca="yes">
               <snm>Roth</snm>
               <mi>P</mi>
               <fnm>Frederick</fnm>
               <insr iid="I2"/>
               <insr iid="I23"/>
               <email>fritz_roth@hms.harvard.edu</email>
            </au>
         </aug>
         <insg>
            <ins id="I1">
               <p>Donnelly Centre for Cellular and Biomolecular Research, University of Toronto, Toronto, ON M5S 3E1, Canada</p>
            </ins>
            <ins id="I2">
               <p>Department of Biological Chemistry and Molecular Pharmacology, Harvard Medical School, Boston, MA 02115, USA</p>
            </ins>
            <ins id="I3">
               <p>Lewis-Sigler Institute for Integrative Genomics and Department of Molecular Biology, Princeton University, Princeton, NJ 08544, USA</p>
            </ins>
            <ins id="I4">
               <p>Department of Information and Communications, Gwangju Institute of Science and Technology, Gwangju, 500-712, Republic of Korea</p>
            </ins>
            <ins id="I5">
               <p>Digital Biology Laboratory, Computer Science Department and Christopher S Bond Life Sciences Center, University of Missouri, Columbia, MO 65211, USA</p>
            </ins>
            <ins id="I6">
               <p>ISI Foundation, Torino, 10133, Italy</p>
            </ins>
            <ins id="I7">
               <p>Center for Systems and Synthetic Biology, Institute for Cellular and Molecular Biology, University of Texas at Austin, Austin, TX 78712, USA</p>
            </ins>
            <ins id="I8">
               <p>Department of Electrical and Computer Engineering, Institute for Cellular and Molecular Biology, University of Texas at Austin, Austin, TX 78712, USA</p>
            </ins>
            <ins id="I9">
               <p>Department of Statistics, UC Berkeley, Berkeley, CA 94720-3860, USA</p>
            </ins>
            <ins id="I10">
               <p>School of Computer Science, Carnegie Mellon University, Pittsburgh, PA 15213, USA</p>
            </ins>
            <ins id="I11">
               <p>Department of Computer Science, University of Toronto, Toronto, ON M5S 3G4, Canada</p>
            </ins>
            <ins id="I12">
               <p>Department of Electrical and Computer Engineering, UC San Diego, La Jolla, CA 92093-0407, USA</p>
            </ins>
            <ins id="I13">
               <p>Department of Genome Sciences, University of Washington, Seattle, WA 98195-5065, USA</p>
            </ins>
            <ins id="I14">
               <p>Department of Computer Science, Princeton University, Princeton, NJ 08544, USA</p>
            </ins>
            <ins id="I15">
               <p>Bioinformatics and Computational Biology, The Jackson Laboratory, Bar Harbor, ME 04609, USA</p>
            </ins>
            <ins id="I16">
               <p>Gatsby Computational Neuroscience Unit, London, WC1N 3AR, UK</p>
            </ins>
            <ins id="I17">
               <p>School of Mathematical Sciences and Center for Theoretical Biology, Peking University, Beijing 100871, PRC</p>
            </ins>
            <ins id="I18">
               <p>Department of Electrical Engineering and Computer Science, and Department of Statistics, UC Berkeley, Berkeley, CA 94720-1776, USA</p>
            </ins>
            <ins id="I19">
               <p>Department of Genome Sciences, and Department of Computer Science and Engineering, University of Washington, Seattle, WA 98195, USA</p>
            </ins>
            <ins id="I20">
               <p>Banting and Best Department of Medical Research, University of Toronto, Toronto, ON M5S 3E1, Canada</p>
            </ins>
            <ins id="I21">
               <p>Department of Structural Biology, University of Pittsburgh School of Medicine, Pittsburgh, PA 15260, USA</p>
            </ins>
            <ins id="I22">
               <p>Molecular and Computational Biology Program, Department of Biological Sciences, University of Southern California, Los Angeles, CA 90089, USA</p>
            </ins>
            <ins id="I23">
               <p>Center for Cancer Systems Biology, Dana-Farber Cancer Institute, Boston, MA 02115, USA</p>
            </ins>
         </insg>
         <source>Genome Biology</source>
         <supplement>
            <title>
               <p>Quantitative inference of gene function from diverse large-scale datasets</p>
            </title>
            <editor>Timothy R Hughes and Frederick P Roth </editor>
            <note>Research</note>
         </supplement>
         <issn>1465-6906</issn>
         <pubdate>2008</pubdate>
         <volume>9</volume>
         <issue>Suppl 1</issue>
         <fpage>S2</fpage>
         <url>http://genomebiology.com/2008/9/S1/S2</url>
         <xrefbib>
            <pubidlist>
               <pubid idtype="pmpid">18613946</pubid>
               <pubid idtype="doi">10.1186/gb-2008-9-s1-s2</pubid>
            </pubidlist>
         </xrefbib>
      </bibl>
      <history>
         <pub>
            <date>
               <day>27</day>
               <month>6</month>
               <year>2008</year>
            </date>
         </pub>
      </history>
      <cpyrt>
         <year>2008</year>
         <collab>Pe&#241;a-Castillo et al; licensee BioMed Central Ltd.</collab>
         <note>This is an open access article distributed under the terms of the Creative Commons Attribution License (<url>http://creativecommons.org/licenses/by/2.0</url>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</note>
      </cpyrt>
      <abs>
         <sec>
            <st>
               <p>Abstract</p>
            </st>
            <sec>
               <st>
                  <p>Background:</p>
               </st>
               <p>Several years after sequencing the human genome and the mouse genome, much remains to be discovered about the functions of most human and mouse genes. Computational prediction of gene function promises to help focus limited experimental resources on the most likely hypotheses. Several algorithms using diverse genomic data have been applied to this task in model organisms; however, the performance of such approaches in mammals has not yet been evaluated.</p>
            </sec>
            <sec>
               <st>
                  <p>Results:</p>
               </st>
               <p>In this study, a standardized collection of mouse functional genomic data was assembled; nine bioinformatics teams used this data set to independently train classifiers and generate predictions of function, as defined by Gene Ontology (GO) terms, for 21,603 mouse genes; and the best performing submissions were combined in a single set of predictions. We identified strengths and weaknesses of current functional genomic data sets and compared the performance of function prediction algorithms. This analysis inferred functions for 76% of mouse genes, including 5,000 currently uncharacterized genes. At a recall rate of 20%, a unified set of predictions averaged 41% precision, with 26% of GO terms achieving a precision better than 90%.</p>
            </sec>
            <sec>
               <st>
                  <p>Conclusion:</p>
               </st>
               <p>We performed a systematic evaluation of diverse, independently developed computational approaches for predicting gene function from heterogeneous data sources in mammals. The results show that currently available data for mammals allow predictions with both breadth and accuracy. Importantly, many highly novel predictions emerge for the 38% of mouse genes that remain uncharacterized.</p>
            </sec>
         </sec>
      </abs>
   </fm>
   <bdy>
      <sec>
         <st>
            <p>Background</p>
         </st>
         <p>Determination of gene function is a central goal of modern biology, and is a starting point for detailed mechanistic studies. Computational approaches can provide predictions of gene function based on the integration of heterogeneous data sources <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr><abbr bid="B4">4</abbr><abbr bid="B5">5</abbr><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr><abbr bid="B8">8</abbr><abbr bid="B9">9</abbr><abbr bid="B10">10</abbr></abbrgrp>. These predictions can serve as a principled method of 'triage', focusing experimental resources on the hypotheses (predictions) that are more likely to be true. Moreover, predictions that are associated with measures of confidence allow experimental biologists to adjust the number of predictions they are willing to consider based on the trade-off between false positive rate, the importance of the biological question, and the cost of follow-up experiments. For example, mouse researchers have been faced for years with the problem of deciding which genes to mutate in reverse-genetic studies, and the problem of deciding which physiological and molecular phenotypes to assay for each mutant strain. Today, there are thousands of Gene Trap alleles <abbrgrp><abbr bid="B11">11</abbr></abbrgrp>, and within a few years investigators will have access to a virtually complete collection of engineered knockouts <abbrgrp><abbr bid="B12">12</abbr></abbrgrp>. Issues of both expense and ethics that are associated with model organism experiments motivate the thoughtful justification of planned experiments.</p>
         <p>Several algorithms have been applied to heterogeneous data sources to predict gene function <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr><abbr bid="B4">4</abbr><abbr bid="B5">5</abbr><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr><abbr bid="B8">8</abbr><abbr bid="B9">9</abbr><abbr bid="B10">10</abbr><abbr bid="B13">13</abbr></abbrgrp>, with the integration of these sources clearly improving prediction performance <abbrgrp><abbr bid="B14">14</abbr><abbr bid="B15">15</abbr></abbrgrp>. However, these studies have been primarily focused on the yeast <it>Saccharomyces cerevisiae </it>and other non-mammalian model organisms <abbrgrp><abbr bid="B16">16</abbr><abbr bid="B17">17</abbr><abbr bid="B18">18</abbr></abbrgrp>, and it has not been clear how well such algorithms will scale to the large genomes and networks of mammals, despite the basic genetic, biochemical and cellular organizational principles that are shared across the eukaryotic kingdom <abbrgrp><abbr bid="B19">19</abbr><abbr bid="B20">20</abbr><abbr bid="B21">21</abbr></abbrgrp>. Moreover, it is unclear whether accurate function predictions can be made given the amount and quality of genomic and function annotation data available for mammals. (Although genes with even a single annotation are often referred to as genes of 'known function', only a minority has been exhaustively studied. Therefore, most 'known function' genes are still incompletely annotated.) Although comparisons using standardized data sets and performance criteria are the best way to assess the strengths and weaknesses of the algorithms employed <abbrgrp><abbr bid="B22">22</abbr><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr></abbrgrp>, our ability to predict gene function using integrated genomic data has not been systematically compared in this way across multiple bioinformatics groups in any organism.</p>
         <p>We assembled a large collection of <it>Mus musculus </it>data, independently developed nine different computational methods using these data to predict gene functions, and compared the predictive performance of each submission using held-out genes, a prospective evaluation, and a focused literature-based assessment of the top novel predictions. We have provided confidence scores and estimates of prediction accuracy (precision) at different levels of sensitivity (recall), and combined the best submissions in a single set of predictions. We report thousands of predicted functions for mouse genes that are supported by multiple data types and algorithms, and share the results via a web resource that facilitates searching and browsing in the context of the underlying supporting evidence.</p>
         <p>This community effort has suggested new function assignments or refinements of previous annotations for the majority of mouse genes. Based on a prospective evaluation of entirely novel predictions, including many for uncharacterized (without any function annotations) genes, we expect that predictions provided here will productively guide further experimentation towards more likely hypotheses.</p>
      </sec>
      <sec>
         <st>
            <p>Results</p>
         </st>
         <sec>
            <st>
               <p>Organization of a community function prediction comparison</p>
            </st>
            <p>The overall structure of our study was to provide groups of investigators (participants) with a collection of data sets in which the gene identifiers were standardized and associated with known functional annotations. The participants then used their algorithms to assign a score reflecting confidence in whether each gene had each function. To enable evaluation of the results, and to calibrate confidence scores for novel predictions within each category, a subset of genes with known functions was 'held out' (that is, function annotations were not given to the participants).</p>
            <p>We therefore began by assembling an extensive collection of <it>M. musculus </it>data, including gene expression across multiple tissues, protein sequence pattern annotations, protein-protein interactions, phenotype annotations, disease associations (of human orthologs), gene function annotations, and phylogenetic profiles from a variety of publicly available sources. (Table <tblr tid="T1">1</tblr> summarizes the data sources; for a full description of the data see the references cited in Table <tblr tid="T1">1</tblr>.) These data sets were chosen because they encompass many genes, and have been shown to contain information reflecting gene function <abbrgrp><abbr bid="B7">7</abbr><abbr bid="B21">21</abbr><abbr bid="B25">25</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp>. Protein interaction data include 'interologs' transferred from other organisms via orthology <abbrgrp><abbr bid="B28">28</abbr><abbr bid="B29">29</abbr></abbrgrp>. To avoid circularity, the data collection did not directly include protein or DNA sequences, since homology was employed in establishing many of the annotations, but allowed sequence-based inference indirectly via phylogenetic profiles and matches to protein sequence patterns. The complete data collection is available from the MouseFunc I website <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>.</p>
            <tbl id="T1">
               <title>
                  <p>Table 1</p>
               </title>
               <caption>
                  <p>Data collection description: summary of the data sources</p>
               </caption>
               <tblbdy cols="3">
                  <r>
                     <c ca="left">
                        <p>Data type</p>
                     </c>
                     <c ca="left">
                        <p>Description</p>
                     </c>
                     <c ca="left">
                        <p>Representation</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="3">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Gene expression</p>
                     </c>
                     <c ca="left">
                        <p>Expression data from oligonucleotide arrays for 13,566 genes across 55 mouse tissues (Zhang <it>et al</it>. <abbrgrp><abbr bid="B21">21</abbr></abbrgrp>)</p>
                     </c>
                     <c ca="left">
                        <p>Median-subtracted, arcsinh intensity measurements</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="left">
                        <p>Expression data from Affymetrix arrays for 18,208 genes across 61 mouse tissues (Su <it>et al</it>. <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>)</p>
                     </c>
                     <c ca="left">
                        <p>gcRMA-condensed intensity measurements</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="left">
                        <p>Tag counts at quality 0.99 cut-off from 139 SAGE libraries for 16,726 genes <abbrgrp><abbr bid="B45">45</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>Average and total tag counts</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Sequence patterns</p>
                     </c>
                     <c ca="left">
                        <p>Protein sequence pattern annotations from Pfam-A (release 19) for 15,569 genes with 3,133 protein families <abbrgrp><abbr bid="B46">46</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>Binary annotation patterns</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="left">
                        <p>Protein sequence pattern annotations from InterPro (release 12.1) for 16,965 genes with 5,404 sequence patterns <abbrgrp><abbr bid="B47">47</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>Binary annotation patterns</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Protein interactions</p>
                     </c>
                     <c ca="left">
                        <p>Protein-protein interactions from OPHID for 7,125 genes <abbrgrp><abbr bid="B28">28</abbr></abbrgrp> (downloaded on 20 April 2006)</p>
                     </c>
                     <c ca="left">
                        <p>Binary interaction patterns and shortest path between genes</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Phenotypes</p>
                     </c>
                     <c ca="left">
                        <p>Phenotype annotations from MGI for 3,439 genes with 33 phenotypes <abbrgrp><abbr bid="B48">48</abbr></abbrgrp> (downloaded on 21 February 2006 from <abbrgrp><abbr bid="B49">49</abbr></abbrgrp>)</p>
                     </c>
                     <c ca="left">
                        <p>Binary annotation patterns</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Conservation profile</p>
                     </c>
                     <c ca="left">
                        <p>Conservation pattern from Ensembl (v38) for 15,939 genes across 18 species <abbrgrp><abbr bid="B50">50</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>Binary conservation patterns and conservation scores</p>
                     </c>
                  </r>
                  <r>
                     <c>
                        <p/>
                     </c>
                     <c ca="left">
                        <p>Conservation pattern from Inparanoid (v4.0) for 15,703 genes across 21 species <abbrgrp><abbr bid="B51">51</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>Binary conservation patterns and Inparanoid scores</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>Disease associations</p>
                     </c>
                     <c ca="left">
                        <p>Disease associations from OMIM for 1,938 genes to 2,488 diseases/phenotypes <abbrgrp><abbr bid="B52">52</abbr><abbr bid="B53">53</abbr></abbrgrp> (downloaded on 6 June 2006 from <abbrgrp><abbr bid="B54">54</abbr></abbrgrp>)</p>
                     </c>
                     <c ca="left">
                        <p>Binary annotation patterns</p>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>gcRMA, robust multi-array analysis with background adjustment for GC content of probes; OMIM, Online Mendelian Inheritance in Man; OPHID, Online Predicted Human Interaction Database; SAGE, serial analysis of gene expression.</p>
               </tblfn>
            </tbl>
            <p>To integrate these diverse data sets and associate them with functional annotations, we mapped the gene (or gene product) identifiers used in each data set to a common set of Mouse Genome Informatics (MGI) gene identifiers (as defined 21 February 2006), which are, in turn, associated with Gene Ontology (GO) terms curated by MGI <abbrgrp><abbr bid="B31">31</abbr><abbr bid="B32">32</abbr></abbrgrp>. Thus, annotations for each gene were the union of annotations made to the set of the gene products for that gene. We excluded GO annotations based solely on the 'inferred from electronic annotation' (IEA) evidence code, since many of these annotations are themselves computational predictions that have not been reviewed by a curator <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>. We also excluded GO terms with too few training examples, that is, those annotated to fewer than three genes in the training set, expecting that it would be extremely difficult for current classifiers to deal with such a limited number of positive training examples. To focus on predictions most likely to suggest specific follow-up experiments, we considered only GO terms associated with 300 or fewer mouse genes in the training set. (This threshold was chosen by manually examining GO terms ranked in descending order by the number of genes currently annotated to each term, and subjectively assessing whether predictions of that GO term would immediately suggest a follow-up validation experiment.) The final data collection contained information on 21,603 MGI genes, of which 8,506 were associated with at least one of the 2,815 individual GO terms we considered.</p>
            <p>An invitation to participate in this assessment was circulated among research groups known to work in gene function prediction. Nine groups ultimately participated by submitting predictions. (For a brief description of the methods used by each, see Table <tblr tid="T2">2</tblr>; for more details see Additional data files <supplr sid="S20">20</supplr> and <supplr sid="S21">21</supplr>.) The data and annotations were distributed in a form intended to prevent participants from using additional data sources, and to enable cross-validation. First, data were distributed to participants in an 'anonymized' form, with each MGI gene identifier replaced with a randomly generated identifier and presented to participants in permuted order. Thus, participants made predictions without knowing the gene identities or any gene information outside the training data. Second, annotations were omitted for a randomly selected 10% of genes (the 'held-out set').</p>
            <tbl id="T2">
               <title>
                  <p>Table 2</p>
               </title>
               <caption>
                  <p>Brief description of function prediction methods used</p>
               </caption>
               <tblbdy cols="4">
                  <r>
                     <c ca="left">
                        <p>Submission identifier</p>
                     </c>
                     <c ca="left">
                        <p>Approach</p>
                     </c>
                     <c ca="left">
                        <p>Name</p>
                     </c>
                     <c ca="left">
                        <p>Author initials</p>
                     </c>
                  </r>
                  <r>
                     <c cspan="4">
                        <hr/>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>A</p>
                     </c>
                     <c ca="left">
                        <p>Compute several kernel matrices (SVM) for each data matrix, train one GO term specific SVM per kernel, and map SVMs' discriminants to probabilities using logistic regression</p>
                     </c>
                     <c ca="left">
                        <p>Calibrated ensembles of SVMs</p>
                     </c>
                     <c ca="left">
                        <p>GO, GL, JQ, CG, MJ, and WSN</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>B</p>
                     </c>
                     <c ca="left">
                        <p>Four different kernels are used per data set. Integration of best kernels and data sources is done using the kernel logistic regression model</p>
                     </c>
                     <c ca="left">
                        <p>Kernel logistic regression <abbrgrp><abbr bid="B55">55</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>HL, MD, TC, and FS</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>C</p>
                     </c>
                     <c ca="left">
                        <p>Construct similarity kernels, assign a weight to each kernel using linear regression, combine the weighted kernels, and use a graph based algorithm to obtain the score vector</p>
                     </c>
                     <c ca="left">
                        <p>geneMANIA</p>
                     </c>
                     <c ca="left">
                        <p>SM, DW-F, CG, DR, and QM</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>D</p>
                     </c>
                     <c ca="left">
                        <p>Train SVM classifiers on each GO term and individual data sets, construct several Bayesian networks that incorporate diverse data sources and hierarchical relationships, and choose for each GO term the Bayes net or the SVM yielding the highest AUC</p>
                     </c>
                     <c ca="left">
                        <p>Multi-label hierarchical classification <abbrgrp><abbr bid="B56">56</abbr></abbrgrp> and Bayesian integration</p>
                     </c>
                     <c ca="left">
                        <p>YG, CLM, ZB, and OGT</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>E</p>
                     </c>
                     <c ca="left">
                        <p>Combination of an ensemble of classifiers (na&#239;ve Bayes, decision tree, and boosted tree) with guilt-by-association in a functional linkage network, choosing the maximum score</p>
                     </c>
                     <c ca="left">
                        <p>Combination of classifier ensemble and gene network</p>
                     </c>
                     <c ca="left">
                        <p>WKK, CK, and EMM</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>F</p>
                     </c>
                     <c ca="left">
                        <p>Code the relationship between functional similarity and the data into a functional linkage graph and predict gene functions using Boltzmann machine and simulated annealing</p>
                     </c>
                     <c ca="left">
                        <p>GeneFAS (gene function annotation system) <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>TJ, CZ, GNL, and DX</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>G</p>
                     </c>
                     <c ca="left">
                        <p>Two methods with scores combined by logistic regression: guilt-by-association using a weighted functional linkage graph generated by probabilistic decision trees; and random forests trained on all binary gene attributes</p>
                     </c>
                     <c ca="left">
                        <p>Funckenstein</p>
                     </c>
                     <c ca="left">
                        <p>WT, MT, FDG, and FPR</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>H</p>
                     </c>
                     <c ca="left">
                        <p>Pairwise similarity features for gene pairs were derived from the available data. A Random Forest classifier was trained using pairs of genes for each GO term. Predictions are based on similarity between the query gene and the positive examples for that GO term</p>
                     </c>
                     <c ca="left">
                        <p>Function prediction through query retrieval</p>
                     </c>
                     <c ca="left">
                        <p>YQ, JK, and ZB</p>
                     </c>
                  </r>
                  <r>
                     <c ca="left">
                        <p>I</p>
                     </c>
                     <c ca="left">
                        <p>Construct an interaction network per data set, merge data set graphs into a single graph, and apply a belief propagation algorithm to compute the probability for each protein to have a specific function given the functions assigned to the proteins in the rest of the graph</p>
                     </c>
                     <c ca="left">
                        <p>Function prediction with message passing algorithms <abbrgrp><abbr bid="B57">57</abbr></abbrgrp></p>
                     </c>
                     <c ca="left">
                        <p>ML and AP</p>
                     </c>
                  </r>
               </tblbdy>
               <tblfn>
                  <p>AUC, area under the receiver operating characteristic curve; GO, Gene Ontology; SVM, support vector machine.</p>
               </tblfn>
            </tbl>
            <p>Each group developed and implemented their prediction methodology independently. Each submission was required, for each gene-GO term combination, to include a score (ranging from 0 to 1) reflecting prediction confidence. The data collection was released in July 2006 (with GO annotations obtained from the GO website on 17 February 2006; version 1.612). Initial prediction results were submitted in October 2006, with seven groups submitting complete prediction sets. After viewing performance measures (but not gene identities or information on the veracity of any specific prediction), it was noted that some groups did not provide a complete set of predictions; also, one group withdrew their predictions upon discovering a coding error. In an effort to increase the number and quality of submitted predictions, all groups were given the opportunity to alter their methods and submit new predictions for a second deadline in December 2006, and five groups did so.</p>
         </sec>
         <sec>
            <st>
               <p>Performance evaluation</p>
            </st>
            <p>To evaluate each set of predictions, we first used the set of held-out genes. GO annotations are an evolving target (annotations are continuously added, deleted, and modified), which enabled us also to perform a prospective evaluation. For this purpose, we also identified the set of genes that had newly acquired an association to a GO term during the eight months since downloading of the version of MGI GO annotation used in training. The GO annotations used for prospective evaluation were obtained from the GO website on 20 October 2006 (version 1.641). To obtain a baseline performance against which to compare predictions from each approach, we employed a na&#239;ve Bayes 'straw man' approach. To train this 'straw man' classifier, we used the six sets of binary gene features that are natively in the (gene, property) form, and did not use feature selection (Additional data file <supplr sid="S21">21</supplr>). We assessed success for each GO term using area under the receiver operating characteristic (ROC) curve (AUC) <abbrgrp><abbr bid="B34">34</abbr></abbrgrp>; precision was assessed at several fixed recall values (all measures used are defined in Materials and methods). For evaluation purposes, we grouped GO terms in twelve evaluation categories corresponding to all combinations of the three GO branches - Biological process, Molecular function, or Cellular component - with four ranges of 'specificity', that is, the number of genes in the training set with which each term is annotated ({3-10}, {11-30}, {31-100}, and {101-300}).</p>
            <p>Figure <figr fid="F1">1</figr> shows some performance measures of the first round of submissions. Note that team I submitted partial results and was, therefore, not assessed for overall performance in each evaluation category. Team E's results for the prospective evaluation were based on a partial implementation of their algorithm (see details in Additional data file <supplr sid="S20">20</supplr>, Box 5). Figure <figr fid="F1">1a,b</figr> shows the mean AUC of GO terms within each evaluation category, evaluated using the held-out and newly annotated genes, respectively. Figure <figr fid="F1">1c,d</figr> shows for each submission how often its AUC value was significantly better (or worse) than the AUC value of another submission. We assessed significance of difference in AUC between two submissions for each GO term (&#945; = 0.05) using a Z-test <abbrgrp><abbr bid="B34">34</abbr></abbrgrp>.</p>
            <fig id="F1">
               <title>
                  <p>Figure 1</p>
               </title>
               <caption>
                  <p>Measures of performance for the initial round of GO term predictions</p>
               </caption>
               <text>
                  <p>Measures of performance for the initial round of GO term predictions. <b>(a) </b>Mean area under the receiver operating characteristic curve (AUC) within each evaluation category, evaluated using the held-out genes. Gene Ontology Biological process (GO-BP), Cellular component (GO-CC), and Molecular function (GO-MF) branches are indicated on the x-axis, grouped by specificity (indicated by the minimum number of genes in the training set associated with each GO term in a given category). Upper case letters associated with the color code correspond to submission identifier. <b>(b) </b>Mean AUC within each evaluation category, evaluated prospectively using newly annotated genes. <b>(c) </b>For each pair of submissions X and Y, we test for difference in AUC value for every GO term (evaluated using held-out genes). Color bars indicate fraction of pairwise comparisons for which X's AUC is significantly higher (blue), not significantly different (beige), and significantly lower (maroon). <b>(d) </b>As (c), except evaluated using the newly annotated genes. <b>(e) </b>The fraction of GO terms exceeding the indicated precision at 20% recall (P20R) value, evaluated using held-out genes. The black line corresponds to the fraction of GO terms for which the 'straw man' approach achieved the indicated precision. <b>(f) </b>As (e), except with P20R values derived prospectively from newly annotated genes.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-1"/>
            </fig>
            <p>In this analysis, most submissions beat the 'straw man' in all categories (both by mean AUC and by number of wins and losses); however, the overall differences among groups were not dramatic. (See Additional data file <supplr sid="S1">1</supplr> for a summary of the number of significant wins and losses per evaluation category.) The complete set of performance measures evaluated with the held-out gene set may be found in Additional data file <supplr sid="S7">7</supplr> (initial predictions) and Additional data file <supplr sid="S9">9</supplr> (revised predictions), while the corresponding prospective evaluation results may be found in Additional data files <supplr sid="S8">8</supplr> and <supplr sid="S10">10</supplr>. Performance measures reported here are conservative in the sense that false positive predictions (genes predicted as having a GO term that were not currently annotated with that GO term) may actually be correct but not yet annotated as such.</p>
            <p>In contrast to AUC, the precision at fixed recall values was dramatically higher for all submissions than for the 'straw man'; Figure <figr fid="F1">1e,f</figr> shows the proportion of GO annotations reaching various precision values at 20% recall (a threshold selected as 'midrange' for display). Additional data file <supplr sid="S2">2</supplr> shows the mean precision at 20% recall for GO terms within each evaluation category, evaluated using both held-out and newly annotated genes. Due to the small number of positives (genes carrying a given annotation) relative to negatives (genes that do not carry the annotation), this characteristic would usually be reflected only in the very left part of the ROC, and is not generally captured by the more commonly used AUC measure. However, precision is a more relevant measure to many end users, since it reflects the proportion of validation experiments for top-scoring predictions that would prove successful.</p>
            <p>Performance of all submissions differed markedly depending on whether evaluation was on the held-out genes or on newly annotated genes (Figure <figr fid="F1">1a,c,e</figr> compared with Figure <figr fid="F1">1b,d,f</figr>), suggesting that emerging annotations are qualitatively different from a random sample of previously existing annotations - a variable that is only rarely considered in large-scale predictions of gene function.</p>
            <p>In fact, the main type of evidence supporting the annotations differs between the new and the held-out annotations; while 50% and 2.5% of newly acquired annotations were derived from sequence or structural similarity (ISS) and reviewed computational analysis (RCA), respectively, the corresponding proportions for held-out annotations were 9% and 31% (Additional data file <supplr sid="S3">3</supplr>).</p>
            <p>Figure <figr fid="F2">2</figr> shows the performance of the second round of submissions (Additional data file <supplr sid="S2">2</supplr>). In most cases, revised predictions slightly outperform the original ones. All subsequent analyses described here used only one submission per group, choosing the most recent where there were two submissions. The complete evaluation results are available from the MouseFunc I website <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>.</p>
            <fig id="F2">
               <title>
                  <p>Figure 2</p>
               </title>
               <caption>
                  <p>Measures of performance for the second round of GO term predictions</p>
               </caption>
               <text>
                  <p>Measures of performance for the second round of GO term predictions. <b>(a, b) </b>As described in Figure 1a, b, except that the gray color area indicates performance in the first set of submissions. <b>(c-f) </b>As described in Figure 1c-f, except that asterisks in (c) and (d) indicate second-round submissions and dashed lines in (e) and (f) indicate the performance of an earlier submission by the same group. GO, Gene Ontology.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-2"/>
            </fig>
         </sec>
         <sec>
            <st>
               <p>Factors affecting prediction performance</p>
            </st>
            <p>To ask whether some data sets were more useful than others, and how their value might vary among evaluation categories, we applied a simple guilt-by-association approach similar to a previously described method <abbrgrp><abbr bid="B35">35</abbr></abbrgrp>. The confidence score for gene X and GO term Y is simply the number of 'neighbors' of X that are currently annotated with Y (see Materials and methods). We evaluated performance after applying this method to only one data set at a time. Figure <figr fid="F3">3a</figr> shows precision at 20% recall (P20R) values obtained by each submission on every GO term, and by using each one of the data types as input to the guilt-by-association approach. A striking observation is that protein sequence pattern annotations are the most predictive data type overall and are especially useful for predicting Molecular function GO terms. Expression data, and phenotype and disease associations are important contributors for more general Cellular component and Biological process GO terms. Moreover, interaction data comprise a remarkably useful evidence source, considering that only a small proportion of protein interactions in mammals is known. Figure <figr fid="F3">3a</figr> also indicates that hard-to-learn GO terms are the ones where there is an absence of predictive power in all data types. This is especially clear in the specificity range {3-10} in all GO branches. We also examined maximum coverage (number of genes present in a given data set with at least one annotated 'neighbor' when using the simple guilt-by-association method), noting that this coverage allowed functional associations for at most 30% of the 21,603 genes to be predicted given any single data set (Figure <figr fid="F3">3b</figr>).</p>
            <fig id="F3">
               <title>
                  <p>Figure 3</p>
               </title>
               <caption>
                  <p>Factors affecting prediction performance</p>
               </caption>
               <text>
                  <p>Factors affecting prediction performance. <b>(a) </b>Precision at 20% recall (P20R) values evaluated using held-out annotations on all Gene Ontology (GO) terms (vertical axis) within each of the 12 evaluation categories for each submission (left panel) and for a simple guilt-by-association approach using each data set in turn as its sole evidence source (right panel). The number of genes in each evaluation category is shown in parentheses. GO-BP, GO Biological process; GO-CC, GO Cellular component; GO-MF, GO Molecular function; NB, na&#239;ve Bayes. Data sets are described in Table 1. <b>(b) </b>Fraction of the 21,603 genes in the data collection with at least one annotated neighbor per data set. <b>(c) </b>Analysis of variance (ANOVA), exploring the effects of various factors on P20R values. <b>(d) </b>Fraction of total variance in P20R values that is explained by each effect. Asterisks in (c, d) indicate interaction between two factors.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-3"/>
            </fig>
            <p>Analysis of variance (ANOVA; Additional data file <supplr sid="S11">11</supplr>) verified what is clear from Figures <figr fid="F1">1a,b</figr>, <figr fid="F2">2a,b</figr> and <figr fid="F3">3a</figr>: the branch of the ontology is the main factor explaining variance in performance, as shown in Figure <figr fid="F3">3c,d</figr>. Biological process GO terms, which reflect what biologists would typically consider to be physiological function of genes and most related to phenotypes, are apparently more difficult to predict than Molecular function or Cellular component terms. As expected, more specific GO terms in each evaluation category were more difficult to predict.</p>
            <p>To explore whether there were commonalities in pattern of performance among the submissions, we examined the correlations among P20R values and grouped the submissions using hierarchical clustering (Additional data file <supplr sid="S4">4</supplr>). We identified three pairs of submissions that were grouped together by several correlation measures (data not shown). These pairs of submissions were ('F', 'G'), ('A', 'B'), and ('C', 'D'). Submissions 'F' and 'G' both employ functional linkage, while submissions 'A' and 'B' are mainly kernel-based methods. (Despite the fact that submissions 'E' and 'I' also used functional linkage, their results were uncorrelated with 'F' and 'G'.) Submissions 'A', 'B', 'C', and 'D' each used weighted combinations of diverse data sets, but neither 'A' nor 'B' gave highly correlated results with 'C' or 'D'. Since all participant methods combine several algorithms, require the use of multiple parameters, and vary the procedure for feature design and selection, it is not surprising that differences in results cannot be simply attributed to any one algorithmic choice.</p>
            <p>To assess the stability of the prediction performance, we measured the performance variability in five randomly chosen subsets of the training data and measured the standard deviations of AUC and P20R performance measures within each evaluation category. The median standard deviations of AUC and P20R across all evaluation categories were 0.01 and 0.02, respectively, suggesting that our performance measures were robustly determined (Additional data file <supplr sid="S12">12</supplr>).</p>
            <p>One of the major challenges in training a classifier is overfitting, that is, generating models that precisely fit the idiosyncrasies of training data at the expense of their accuracy when applied to new data. We assessed overfitting using a standard approach - examining the extent to which performance estimates are exaggerated when one calculates them based on the training data rather than on the held-out test set (Additional data file <supplr sid="S12">12</supplr>). For example, Biological process GO terms with specificity {31-100} had a mean P20R value that was increased by a factor of 1.3 (averaged over all submissions) when it was calculated based on the training data rather than the held-out gene set.</p>
            <p>We note that submissions 'C', 'D' and 'G' are among the top performers on most evaluation categories by various measures. The performance of submission 'C' was particularly strong with respect to AUC. Submission 'D' performs stably across the range of the number of genes annotated to each GO term and its performance was especially good for prospective predictions. Submission 'G' has a strong performance in precision across a range of recalls (Additional data files <supplr sid="S5">5</supplr> and <supplr sid="S6">6</supplr>). Submissions 'E' and 'H' perform better for the most specific evaluation categories. Thus, different methods had different strengths and no prediction method was clearly superior by every criterion.</p>
         </sec>
         <sec>
            <st>
               <p>Integration of submissions in a single set of predictions</p>
            </st>
            <p>To simplify subsequent analyses for ourselves and other investigators, we derived a single set of prediction scores from the set of submitted scores. We unified the independent submissions for each evaluation category by adopting the scores from the submission with the best P20R value for that evaluation category (evaluated using held-out genes). The combined predictions averaged 41% precision at 20% recall with 26% of GO terms having a P20R value greater than 90%. Figure <figr fid="F4">4</figr> indicates the proportion of GO terms at different precision and recall values. (Also see Additional data file <supplr sid="S19">19</supplr>; Additional data file <supplr sid="S13">13</supplr> lists the precision achieved by the unified predictions at several recall values for each GO term.) To put this prediction performance into perspective, random predictions for a GO term with 30 genes left to be identified would be expected to yield a P20R value of 0.15%. In addition, these precision estimates are conservative since many predictions may ultimately prove correct despite not being currently annotated.</p>
            <fig id="F4">
               <title>
                  <p>Figure 4</p>
               </title>
               <caption>
                  <p>Distribution of GO terms at several precision/recall performance points</p>
               </caption>
               <text>
                  <p>Distribution of GO terms at several precision/recall performance points. Proportion of Gene Ontology (GO) terms per evaluation category with a precision/recall performance point that is both above and to the right of a given precision/recall point in the contour plots. GO-BP, GO Biological process; GO-CC, GO Cellular component; GO-MF, GO Molecular function.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-4"/>
            </fig>
         </sec>
         <sec>
            <st>
               <p>Impact of predictions among GO terms for which precision can be well estimated</p>
            </st>
            <p>To gain insight into the potential impact of predictions on the current state of gene function annotation, we more closely examined a subset of GO terms in the unified set of predictions. For each GO term, we established the lowest score at which a precision of 30% or better was achieved while recovering at least 10 true positives within the held-out test set (allowing precision to be well estimated). There were 71 GO terms with predictions meeting this criterion (tending to be the less specific GO terms due to the number of required positive genes in the training set). Figure <figr fid="F5">5</figr> shows the number of currently annotated and predicted genes for each GO term, including 9,429, 2,087, and 19,849 predictions in the Biological process, Cellular component, and Molecular function branches, respectively. (The maximum number of predictions displayed was limited to 1,000.) This figure illustrates the potential future impact of these predictions on the state of function annotation should the expected 30% or more of these predictions prove true.</p>
            <fig id="F5">
               <title>
                  <p>Figure 5</p>
               </title>
               <caption>
                  <p>Number of high-precision predictions among GO terms for which precision can be confidently estimated</p>
               </caption>
               <text>
                  <p>Number of high-precision predictions among GO terms for which precision can be confidently estimated. Number of currently annotated (green) versus predicted genes (orange, predictions expected to be correct; gray, predictions expected to be incorrect) for a subset of Gene Ontology (GO) terms for which 30% precision on held-out annotations was achieved while recovering at least 10 positives in the held-out set. The number of predicted genes displayed was limited to 1,000. GO terms were ordered according to similarity of prediction/annotation patterns. Terminal digits of GO term identifiers are shown in parentheses. GO-BP, GO Biological process; GO-CC, GO Cellular component; GO-MF, GO Molecular function.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-5"/>
            </fig>
            <p>While Figure <figr fid="F5">5</figr> shows the impact for more general GO terms, we note that performance for more specific GO terms was also quite good. For example, the mean P20R from the best-performing submission for the most specific {3-10} versus least specific {101-300} category was 21% versus 37%, 38% versus 50%, and 51% versus 53% for Biological process, Cellular component, and Molecular function branches, respectively. Thus, predictions for more specific GO terms offer a similarly high impact on current function annotation (and there are many more specific GO terms than general GO terms).</p>
            <p>Predictions have varying degrees of novelty, ranging from 're-predictions' and 'refinement predictions' to 'highly novel'. Re-predictions are cases in which the gene is currently annotated with that GO term based solely on IEA evidence; these are often unverified predictions made previously by others. Refinement predictions are cases in which the gene is currently annotated with an ancestor of the predicted GO term. We describe all other predictions as 'highly novel'. Among the number of predictions displayed in Figure <figr fid="F5">5</figr>, the percentages of refinements are 18%, 21%, and 17% for Biological process, Cellular component, and Molecular function branches, respectively, while the percentages of re-predictions are 43%, 37%, and 32%. Thus, 3,677 (39%), 877 (42%), and 10,123 (51%) predictions for Biological process, Cellular component, and Molecular function branches, respectively, were highly novel.</p>
         </sec>
         <sec>
            <st>
               <p>Literature evaluation for top-scoring predictions with a high degree of novelty</p>
            </st>
            <p>To gain intuition into the quality of those predictions with the highest degree of novelty, we performed a focused literature analysis on highly novel top-scoring predictions. For this, we identified the top three predictions from each of the twelve evaluation categories, excluding re-predictions and refinement predictions.</p>
            <p>To avoid over-weighting particular GO terms or genes, we also allowed only one prediction per evaluation category for any given gene or GO term. Investigators with extensive experience with literature curation and knowledge of mouse gene function (DPH and JAB) examined published literature relating to these 36 high-scoring highly novel predictions, and scored each prediction according to the nature of published evidence. Additional data file <supplr sid="S14">14</supplr> contains the list of highly novel predictions investigated.</p>
            <p>Out of the 36 high-scoring predictions examined, 21 (58%) were found to be true or likely to be true based on experimental data reported in the literature. Since six other cases could neither be confirmed nor refuted by current literature, we estimate that the true precision for top novel high-scoring predictions lies between 58% and 75%. Of the 21 found to be true, 9 (43%) were strongly supported but were not annotated simply because the literature had not yet been curated. For example, annotation of the gene encoding Slfn8 (schlafen 8) with the GO term 'negative regulation of cell proliferation' is supported <abbrgrp><abbr bid="B36">36</abbr></abbrgrp>, with evidence corresponding to the inferred from direct assay (IDA) evidence code <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>. This gene currently does not have any functional annotation in the MGI system, and thus exemplifies the novel assignment of function to unannotated genes.</p>
            <p>Other reasonable annotations identified in this set of 36 examples include 12 cases where the genes are members of characterized gene families. It is likely that the genes play at least a similar role as predicted, although the evidence is not strong enough to support the annotation using GO Consortium annotation policy. An example of this is the mouse gene 4930430D24Rik, which is predicted to be involved in biological process 'protein amino acid methylation'. This gene is defined solely by cDNA clone data and has no experimental information associated with it. However, it has sequence similarity with the gene encoding Btg1, which has been documented as interacting with protein methyl transferases.</p>
            <p>Another 6 cases (17%) of the 36 examined could be neither confirmed nor refuted by current literature. For example, the gene <it>Klhl12 </it>(encoding Kelch-like 12) was associated with the cellular component term 'stress fiber'. This gene is homologous to members of the kelch family of genes found in <it>Drosophila</it>. The <it>Drosophila </it>gene products are found in a variety of cellular locations. Although some members of this family regulate stress fiber formation through the Ras pathway, there is evidence that the human ortholog binds proteins in a variety of locations and that this protein functions in the context of the ubiquitin E3 ligase complex. As a result, we currently cannot infer cellular location of this gene product and thereby judge the prediction.</p>
            <p>The remaining 9 (25%) of the 36 predictions examined were considered to be incorrect based on current literature (see Additional data file <supplr sid="S14">14</supplr> for the list of predictions investigated). For example, the gene <it>Grm4 </it>(encoding the metabotropic glutamate receptor 4) is predicted to have the molecular function 'calcium channel regulator activity'. However, although other G protein-coupled receptors regulate calcium levels, there is no current evidence that this gene functions in this way.</p>
            <p>Taken together, these results suggest that high-scoring predictions based on large-scale data integration comprise a promising resource to guide both curators and experimentalists to correct hypotheses about gene function in mammals.</p>
         </sec>
         <sec>
            <st>
               <p>A resource for browsing predictions and underlying evidence</p>
            </st>
            <p>So that researchers may browse predictions and gain intuition about evidence that underlies predicted annotations, an online resource allowing browsing by GO term or gene is available <abbrgrp><abbr bid="B37">37</abbr></abbrgrp>. To facilitate follow-up experimental study, this resource contains links to existing Gene Trap alleles available as heterozygous mouse embryonic stem cell lines.</p>
         </sec>
         <sec>
            <st>
               <p>Illustration of the evidence underlying predictions for two GO terms</p>
            </st>
            <p>To gain insight into the prediction process and the nature of supporting evidence, we examined predictions for two specific GO terms in greater detail. Genes currently annotated with 'Cell adhesion' (Figure <figr fid="F6">6</figr>) and 'Mitochondrial part' (Figure <figr fid="F7">7</figr>) are shown together with genes newly predicted to have these GO terms, in the context of supporting evidence. These GO terms were chosen to illustrate different facets of biology and the utility of multiple data types. Based on the predictive power of each data source in isolation, protein sequence pattern annotations are the most useful source to predict genes involved in cell adhesion, while gene expression data are more relevant for predictions of mitochondrial part. (The value of each data set is based on precision of predictions at 20% recall based solely on that data set, considering genes present in each data set.)</p>
            <fig id="F6">
               <title>
                  <p>Figure 6</p>
               </title>
               <caption>
                  <p>Illustration of evidence underlying predictions for the GO term 'Cell adhesion'</p>
               </caption>
               <text>
                  <p>Illustration of evidence underlying predictions for the GO term 'Cell adhesion'. As an assessment of predictive usefulness, the precision at 20% recall (P20R) value based on each single data source is shown in parentheses. <b>(a) </b>Expression levels of annotated genes (dark green) and predictions (orange), grouped by Pearson correlation and complete-linkage hierarchical clustering. <b>(b) </b>Protein domains in common among predictions and annotated genes. <b>(c) </b>Largest protein-protein interaction network among predictions and annotated genes. OPHID, Online Predicted Human Interaction Database. <b>(d) </b>Disease and <b>(e) </b>phenotype annotations in common between predictions and annotated genes. Terminal digits of identifiers are shown in parentheses. OMIM, Online Mendelian Inheritance in Man.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-6"/>
            </fig>
            <fig id="F7">
               <title>
                  <p>Figure 7</p>
               </title>
               <caption>
                  <p>Illustration of evidence underlying predictions for the GO term 'Mitochondrial part'</p>
               </caption>
               <text>
                  <p>Illustration of evidence underlying predictions for the GO term 'Mitochondrial part'. <b>(a-e) </b>As described in Figure 6a-e. GO, Gene Ontology.</p>
               </text>
               <graphic file="gb-2008-9-s1-s2-7"/>
            </fig>
            <p>To further validate mitochondrial part predictions, we asked if mitochondrially localized proteins (according to <abbrgrp><abbr bid="B38">38</abbr></abbrgrp>) were enriched among mitochondrial part predictions. Indeed, out of 108 mitochondrial part predictions with available data <abbrgrp><abbr bid="B38">38</abbr></abbrgrp>, 83 were mitochondrially localized (<it>P </it>= 2.3 &#215; 10<sup>-7</sup>; cumulative hypergeometric test). Additional data file <supplr sid="S15">15</supplr> contains mitochondrial part predictions with available mitochondrial localization data <abbrgrp><abbr bid="B38">38</abbr></abbrgrp>.</p>
            <p>Figures <figr fid="F6">6</figr> and <figr fid="F7">7</figr> illustrate that, as intuitively expected, the patterns of expression and other data types among genes annotated and predicted in these categories are quite similar. In addition, the graph formed by protein interactions among annotated and predicted genes contains a connected component (that is, a subset of nodes that are mutually connected by some path) that is larger than expected by chance (<it>P </it>&lt; 0.0001; based on a permutation test of 10,000 random networks). Collectively, these figures illustrate the origin of predictions within diverse genomic and proteomic evidence (see Additional data files <supplr sid="S16">16</supplr> and <supplr sid="S17">17</supplr> for the data underlying Figures <figr fid="F6">6</figr> and <figr fid="F7">7</figr>).</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Discussion</p>
         </st>
         <p>Prediction confidence scores fall along a continuum from 0 (predicted not to be true) to 1 (predicted to be true). Whether a score between 0 and 1 should be treated as a prediction for or against the annotation (or as a non-prediction) depends on the user's application-dependent trade-off between precision and recall, and an expert biologist may wish to filter the list further based on their knowledge and intuition before proceeding to carry out experiments. Users performing medium-scale genomic experiments may favor recall over precision and select predictions using a higher recall threshold where the search space (and costs) will be reduced without losing recall. Alternatively, users requiring higher precision can take only the top few predictions.</p>
         <p>The performance differences among the methods examined here could have a substantial practical impact. For example, suppose a user plans to order ten mouse mutant strains at a cost of $10,000 each to assay a physiological phenotype caused by 20 unidentified genes. Since the combined predictions averaged 41% precision at 20% recall, the user may expect to see four mutants showing the expected phenotype at a cost of $25,000 per successful experiment; on the other hand, if a simple guilt-by-association approach having only one source of evidence as input (with average precision at 20% recall of 10%) is used to select the genes to assay, the user may expect to see only one mutant with the desired phenotype at a cost of $100,000 per successful experiment.</p>
         <p>Annotation efforts such as FANTOM <abbrgrp><abbr bid="B39">39</abbr></abbrgrp> have populated a high-quality reference database of function assignments in which each annotation is highly likely to be true. This encyclopedic approach is valuable, but necessarily discards partial information, or 'medium-confidence' predictions. A full spectrum of confidence measures can serve as a form of principled triage, in which experimentalists are guided towards those hypotheses that are more likely to prove true but which have not yet been proven. Furthermore, quantitative function prediction should also prove useful as a resource to assist more qualitative encyclopedic efforts.</p>
         <p>Variation in performance between submissions is more substantial when the evaluation criterion is precision at a given recall, rather than AUC, as shown in Figure <figr fid="F3">3</figr>. The variation in performance between groups and between first and second submissions from the same group indicates that, as a community, we have not yet converged on an asymptotic limit to performance. Also, ANOVA results indicate that GO branch is a greater contributor to variation in performance than the prediction method used. The difficulty of predicting GO terms is highest in the Biological process branch followed by the Cellular component and then Molecular function branches. Also, the difficulty decreases as the number of genes currently annotated to that GO term increases.</p>
         <p>Our assessment indicates that many submissions were more successful in predicting for held-out genes than for the newly annotated set of genes. This suggests the problem of predicting novel annotations may be qualitatively different from the problem of predicting previously known but held-out annotations. Approximately 50% of new annotations were made on the basis of sequence or structural similarity (evidence code ISS; Additional data file <supplr sid="S3">3</supplr>), as opposed to 9% for held-out annotations. This indicates that a greater proportion of recent annotations has been made by transfer of annotation from other species via homology.</p>
         <p>Although we considered homology to proteins in other species through phylogenetic profiling and use of protein domain matches, we did not allow transfer of functions from other species via orthology for several reasons. First, function transfer by orthology is the most mature method for function prediction and we consider that the need is greatest to improve methods that integrate and analyze newer large-scale experimental data types. Second, use of GO annotation from other species would have rendered our cross-validation performance estimates uninterpretable by allowing circular predictions. For example, a held-out mouse GO annotation that had previously been transferred by homology from a mouse gene to a human gene might then be transferred back to mouse as a 'prediction'. Third, a function determined in a single organism can quickly spread via orthology to many organisms so that a single piece of evidence might be overcounted as an independent fact in multiple organisms. The second and third issues might be circumvented by only considering annotation from other species based on experiments carried out in that organism. While some evidence codes in GO annotations indicate within-organism support (for example, IDA, IMP [inferred from mutant phenotype], IEP [inferred from expression pattern], IPI [inferred from physical interaction]), other evidence codes such as TAS [traceable author statement], NAS [non-traceable author statement], ISS, and RCA are ambiguous <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>. Careful curation of the organism from which function annotation evidence has been derived would greatly facilitate the use of orthology-based function transfer in future integrative studies.</p>
         <p>We found that submissions from every group were subject to overfitting in most GO categories. While the presence of overfitting is not surprising given the paucity of available training data, it does suggest that future performance gains will come from classifier training methodology that further limits overfitting. Another future improvement to predictions might be a unified score based on all submissions, via an ensemble or 'combination of experts' method <abbrgrp><abbr bid="B40">40</abbr></abbrgrp>. In addition, to facilitate interpretation, scores might be transformed to accurately reflect the probability that a prediction is correct. Another possible improvement would be the use of a more refined subset of GO terms as gold standard. For example, predictions could be judged according to a reduced subset of GO terms that are relatively independent of one another and each specific enough to suggest a follow-up experiment <abbrgrp><abbr bid="B24">24</abbr></abbrgrp>. Furthermore, to improve prediction accuracy in future function prediction efforts, data sources containing additional evolutionary, structural, enzymatic and sequence similarity information might be integrated. It would also be interesting to perform a factorial analysis on variations of the classifiers that performed best here, in order to obtain biological intuition or insight into why these classifiers performed well. Our prediction effort was focused on identifying 'errors of omission' in GO annotation. It would also be worthwhile to explore whether low prediction scores for current annotations (apparently 'false negatives') could be useful in recognizing erroneous functional annotations ('errors of commission').</p>
         <p>A major implication of our analysis is that protein sequence patterns from Pfam and InterPro are extremely useful evidence sources not only for Molecular function GO terms (as expected, since these primarily reflect biochemical activities) but also for inference of Cellular component and Biological process terms. This trend may be due, in part, to the incorporation of biochemical terms in the Biological process ontology (for example, 'protein amino acid phosphorylation' is listed as a Biological process, and its known members overlap with 'protein kinase activity', which is a Molecular function) as well as the fact that protein sequence patterns do relate to substrates associated with specific physiological processes and cellular compartments (for example, DNA-binding proteins are primarily found in the nucleus). Nevertheless, we note that the proportion of genes with protein sequence pattern annotations is much lower in the 8,851 unannotated genes (62%; this includes genes with annotations based solely on IEA evidence) than it is among the 12,752 annotated mouse genes (90%) in the data collection. This indicates that sequence features may be less useful in future predictions of function for currently uncharacterized genes. This is particularly true of Biological process terms, which are the least predictable using sequence features alone, and conceptually most closely related to phenotype. In future, it will be valuable to predict phenotypes as well as functions. Phenotype predictions are immediately testable, and phenotype data in mammalian organisms and cell culture models have a rapid rate of emergence that will permit prospective evaluation of predictions.</p>
      </sec>
      <sec>
         <st>
            <p>Conclusion</p>
         </st>
         <p>We performed a systematic evaluation of diverse, independently developed computational approaches for predicting gene function from heterogeneous data sources in mammals. The results show that currently available data for mammals allow predictions with both breadth and accuracy. At a recall rate of 20%, a unified set of predictions averaged 41% precision, with 26% of GO terms achieving a precision better than 90%. Predictions with comparable precision have been successfully used in yeast <abbrgrp><abbr bid="B41">41</abbr></abbrgrp>. A striking finding is that predictions for GO terms in the most specific evaluation category (ten or fewer annotated genes) have a precision comparable to that obtained in the more general evaluation categories. For Biological process GO terms, we achieved a mean precision at 20% recall for blinded predictions ranging from 28% to 46%, depending on evaluation category specificity. Corresponding performance for Cellular component and Molecular function terms was even higher, ranging from 38% to 58% and from 56% to 64%, respectively. Importantly, many highly novel function predictions emerge for the 38% of mouse genes that remain uncharacterized.</p>
      </sec>
      <sec>
         <st>
            <p>Materials and methods</p>
         </st>
         <sec>
            <st>
               <p>Performance statistics</p>
            </st>
            <p>To assess performance of function predictions by each method, we obtained the ROC curve and the AUC for each GO term using the trapezoidal rule <abbrgrp><abbr bid="B42">42</abbr></abbrgrp>. (The AUC corresponds to the probability that a random positive instance will be scored higher than a random negative instance.) For this assessment, GO annotations were up-propagated. That is, if a gene is associated with a GO term, then this gene is also associated with all the ancestor GO terms of that GO term. During evaluation, refinement predictions are considered false positives.</p>
            <p>We assessed whether observed differences in AUC between submissions X and Y were statistically significant <abbrgrp><abbr bid="B34">34</abbr></abbrgrp> and computed the precision at various recall rates as previously described <abbrgrp><abbr bid="B43">43</abbr></abbrgrp>. Precision is defined as the number of genes correctly classified as having a given GO term divided by the total number of genes classified as having that GO term <inline-formula><m:math name="gb-2008-9-S1-S2-i1" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:mrow><m:mo>(</m:mo><m:mrow><m:mfrac><m:mrow><m:mi>T</m:mi><m:mi>P</m:mi></m:mrow><m:mrow><m:mi>T</m:mi><m:mi>P</m:mi><m:mo>+</m:mo><m:mi>F</m:mi><m:mi>P</m:mi></m:mrow></m:mfrac></m:mrow><m:mo>)</m:mo></m:mrow></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaWaaeWaaKqbagaadaWcaaqaaiaadsfacaWGqbaabaGaamivaiaadcfacqGHRaWkcaWGgbGaamiuaaaaaOGaayjkaiaawMcaaaaa@37B2@</m:annotation></m:semantics></m:math></inline-formula>. Recall is defined as the percentage of genes annotated with a given GO term that were classified as having that GO term <inline-formula><m:math name="gb-2008-9-S1-S2-i2" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:mrow><m:mo>(</m:mo><m:mrow><m:mfrac><m:mrow><m:mi>T</m:mi><m:mi>P</m:mi></m:mrow><m:mrow><m:mi>T</m:mi><m:mi>P</m:mi><m:mo>+</m:mo><m:mi>F</m:mi><m:mi>N</m:mi></m:mrow></m:mfrac></m:mrow><m:mo>)</m:mo></m:mrow></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaWaaeWaaKqbagaadaWcaaqaaiaadsfacaWGqbaabaGaamivaiaadcfacqGHRaWkcaWGgbGaamOtaaaaaOGaayjkaiaawMcaaaaa@37B0@</m:annotation></m:semantics></m:math></inline-formula>. Other performance measures included the AUC up to the first 50 false positives, and the recall obtained at 1% false positive rate. False positive rate is defined as the fraction of genes not annotated with a given GO term that were classified as having that GO term <inline-formula><m:math name="gb-2008-9-S1-S2-i3" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:mrow><m:mo>(</m:mo><m:mrow><m:mfrac><m:mrow><m:mi>F</m:mi><m:mi>P</m:mi></m:mrow><m:mrow><m:mi>F</m:mi><m:mi>P</m:mi><m:mo>+</m:mo><m:mi>T</m:mi><m:mi>N</m:mi></m:mrow></m:mfrac></m:mrow><m:mo>)</m:mo></m:mrow></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaWaaeWaaKqbagaadaWcaaqaaiaadAeacaWGqbaabaGaamOraiaadcfacqGHRaWkcaWGubGaamOtaaaaaOGaayjkaiaawMcaaaaa@37A2@</m:annotation></m:semantics></m:math></inline-formula>. Tables with the median, mean and standard deviation of all performance measures over the GO terms in each evaluation category are provided for each submission (Additional data files <supplr sid="S7">7</supplr> to <supplr sid="S10">10</supplr>).</p>
         </sec>
         <sec>
            <st>
               <p>Assessing the predictive value of each data type</p>
            </st>
            <p>To determine the value of each data type in predicting function, we used the following simple guilt-by-association method. For protein-protein interaction data, we counted the number of times each GO term is annotated among direct interaction partners ('neighbors'). For data sets composed of binary gene features, we considered the neighbors of gene X to be those genes annotated to have the same specific feature, for example, a specific phenotype, disease association, or protein sequence pattern annotation. In the case of non-binary data, for example, expression or phylogenetic profile, neighbors are genes that correlate with X (Pearson correlation coefficient > 0.5). After determining the neighbors of each gene, we sum for each GO term, based on the type of data, either the correlation coefficients, or the number of shared features per neighbor, or the number of the neighbors annotated with that GO term. This value is then used as a score of the function prediction. The contribution of each data set is then assessed considering genes with at least one annotated neighbor in the data set. Tables with the median, mean, and standard deviation of the performance measures over GO terms in each evaluation category per data set are provided in Additional data file <supplr sid="S18">18</supplr>.</p>
         </sec>
         <sec>
            <st>
               <p>Score transformation</p>
            </st>
            <p>Since scores were not necessarily calibrated across GO terms, we developed a monotonic transformation to make scores for different GO terms more comparable. Letting <it>n </it>be the total number of genes considered, <it>t </it>be the number of existing positive annotations for the current GO term, and <it>s</it><sub><it>i </it></sub>be the un-calibrated score for the <it>i</it><sup>th </sup>gene, the calibrated score for the <it>i</it><sup>th </sup>gene <inline-formula><m:math name="gb-2008-9-S1-S2-i4" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:msubsup><m:mi>s</m:mi><m:mi>i</m:mi><m:mo>&#8727;</m:mo></m:msubsup></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaGaam4CamaaDaaaleaacaWGPbaabaGaey4fIOcaaaaa@32A5@</m:annotation></m:semantics></m:math></inline-formula> is defined as: <inline-formula><m:math name="gb-2008-9-S1-S2-i5" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:msubsup><m:mi>s</m:mi><m:mi>i</m:mi><m:mo>&#8727;</m:mo></m:msubsup><m:mo>=</m:mo><m:mfrac><m:mrow><m:mi>L</m:mi><m:mo>&#8901;</m:mo><m:msub><m:mi>s</m:mi><m:mi>i</m:mi></m:msub></m:mrow><m:mrow><m:mi>L</m:mi><m:mo>&#8901;</m:mo><m:msub><m:mi>s</m:mi><m:mi>i</m:mi></m:msub><m:mo>&#8722;</m:mo><m:msub><m:mi>s</m:mi><m:mi>i</m:mi></m:msub><m:mo>+</m:mo><m:mn>1</m:mn></m:mrow></m:mfrac></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaGaam4CamaaDaaaleaacaWGPbaabaGaey4fIOcaaOGaeyypa0tcfa4aaSaaaeaacaWGmbGaeyyXICTaam4CamaaBaaabaGaamyAaaqabaaabaGaamitaiabgwSixlaadohadaWgaaqaaiaadMgaaeqaaiabgkHiTiaadohadaWgaaqaaiaadMgaaeqaaiabgUcaRiaaigdaaaaaaa@4328@</m:annotation></m:semantics></m:math></inline-formula> where <it>L </it>is the free (non-negative) parameter chosen such that <inline-formula><m:math name="gb-2008-9-S1-S2-i6" xmlns:m="http://www.w3.org/1998/Math/MathML"><m:semantics><m:mrow><m:mstyle displaystyle="true"><m:msubsup><m:mo>&#8721;</m:mo><m:mrow><m:mi>i</m:mi><m:mo>=</m:mo><m:mn>1</m:mn></m:mrow><m:mi>n</m:mi></m:msubsup><m:mrow><m:msubsup><m:mi>s</m:mi><m:mi>i</m:mi><m:mo>&#8727;</m:mo></m:msubsup><m:mo>=</m:mo><m:mi>t</m:mi></m:mrow></m:mstyle></m:mrow><m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaafiart1ev1aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8viVeY=Nipec8Eeeu0xXdbba9frFj0xb9qqpG0dXdb9aspeI8k8fiI+fsY=rqGqVepae9pg0db9vqaiVgFr0xfr=xfr=xc9adbaqaaeGaciGaaiaabeqaaeqabiWaaaGcbaWaaabmaeaacaWGZbWaa0baaSqaaiaadMgaaeaacqGHxiIkaaGccqGH9aqpcaWG0baaleaacaWGPbGaeyypa0JaaGymaaqaaiaad6gaa0GaeyyeIuoaaaa@3A52@</m:annotation></m:semantics></m:math></inline-formula>. <it>L </it>is found separately for each GO term via a MATLAB optimization routine. After this transformation, the average score for each GO term is equal to the fraction of genes currently annotated with that GO term.</p>
         </sec>
         <sec>
            <st>
               <p>Generating a list of high scoring novel predictions for manual investigation</p>
            </st>
            <p>To evaluate the quality of top-scoring predictions more closely, we identified the set of submitted predictions that performed best within each of the 12 evaluation categories (according to the P20R measure on held-out genes). Within each of the 12 evaluation categories, gene/term pairs were pooled and ranked by calibrated scores (described above). All currently annotated gene/term pairs were removed, resulting in a ranked list of predictions that are considered classification errors according to current GO annotations, but may in fact be correct. To focus on the highly novel predictions, we also excluded re-predictions and refinement predictions from the list.</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Abbreviations</p>
         </st>
         <p>ANOVA, analysis of variance; AUC, area under the ROC curve; GO, Gene Ontology; IDA, inferred from direct assay; IEA, inferred from electronic annotation; ISS, inferred from sequence or structural similarity; MGI, Mouse Genome Informatics; P20R, precision at 20% recall; RCA, reviewed computational analysis; ROC, receiver operating characteristic.</p>
      </sec>
      <sec>
         <st>
            <p>Competing interests</p>
         </st>
         <p>The authors declare that they have no competing interests.</p>
      </sec>
      <sec>
         <st>
            <p>Authors' contributions</p>
         </st>
         <p>The study was designed and organized by LP-C, TRH, and FPR, with advice from many others. LP-C assembled the data set (with help from GFB), anonymized gene identifiers in isolation from all participants, and calculated performance measures. Team A analysis was performed by GO with contributions from GL, JQ, CG, and MJ, and design and supervision from WSN. Team B analysis was performed by HL with contributions from MD and design and supervision from TC and FS. Team C analysis was performed by SM with contributions from CG, DW-F, and DR, and design and supervision from QM. Team D analysis was performed by YG and CLM with contributions by ZB, and design and supervision from OGT. Team E analysis was performed by WKK and CK with design and supervision from EMM. Team F analysis was performed by TJ and CZ with contributions from GNL and design and supervision from DX. Team G analysis was performed by MT and WT with contributions from FDG, and design and supervision from FPR. Team H analysis was performed by YQ with design and supervision from JK and ZB. Team I analysis was designed and implemented by ML and AP. Post-submission analysis was performed by LP-C, except that CLM performed ANOVA on submission performance and MT generated 'straw man' predictions and classified prediction novelty. DPH and JAB performed literature evaluation. The manuscript was prepared by LP-C, TRH, and FPR and figures by LP-C. All authors read and approved the final manuscript.</p>
      </sec>
      <sec>
         <st>
            <p>Additional data files</p>
         </st>
         <p>The following additional data are available with the online version of this paper. Additional data file <supplr sid="S1">1</supplr> is a figure showing bar graphs of pairwise comparisons of AUC within each evaluation category. Additional data file <supplr sid="S2">2</supplr> is a figure showing bar graphs of mean P20R values within each evaluation category. Additional data file <supplr sid="S3">3</supplr> is a figure showing bar graphs comparing properties of GO annotations in the held-out gene set, in the newly annotated gene set, and in the training set. Additional data file <supplr sid="S4">4</supplr> is a figure showing a clustergram indicating Pearson correlation coefficients of the P20R performance measure among different submissions. Additional data file <supplr sid="S5">5</supplr> is a figure showing heatmaps of precision at several recall values evaluated using held-out annotations on all GO terms within each of the 12 evaluation categories for each submission. Additional data file <supplr sid="S6">6</supplr> is a figure showing a heatmap of median precision at several recall values evaluated using held-out annotations within each of the 12 evaluation categories per submission. Additional data file <supplr sid="S7">7</supplr> is a table listing performance measures for the initial round of GO term predictions within each evaluation category evaluated using held-out genes. Additional data file <supplr sid="S8">8</supplr> is a table listing performance measures for the initial round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation). Additional data file <supplr sid="S9">9</supplr> is a table listing performance measures for the second round of GO term predictions within each evaluation category evaluated using held-out genes. 
Additional data file <supplr sid="S10">10</supplr> is a table listing performance measures for the second round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation). Additional data file <supplr sid="S11">11</supplr> is a table listing the results of the analysis of variance in prediction performance. Additional data file <supplr sid="S12">12</supplr> is a table listing performance and variance on five subsets of the training data. Additional data file <supplr sid="S13">13</supplr> is a table listing performance measures of the unified predictions for each GO term. Additional data file <supplr sid="S14">14</supplr> is a table listing high-scoring predictions evaluated against existing literature. Additional data file <supplr sid="S15">15</supplr> is a table listing mitochondrial part predictions with data from a previous study <abbrgrp><abbr bid="B38">38</abbr></abbrgrp>. Additional data file <supplr sid="S16">16</supplr> is a table listing data underlying Figure <figr fid="F6">6</figr>. Additional data file <supplr sid="S17">17</supplr> is a table listing data underlying Figure <figr fid="F7">7</figr>. Additional data file <supplr sid="S18">18</supplr> is a table listing performance measures for various individual evidence sources within each evaluation category evaluated using held-out genes. Additional data file <supplr sid="S19">19</supplr> is a Flash animation showing a fraction of GO terms with higher precision and recall than a given precision/recall point for the unified predictions. Additional data file <supplr sid="S20">20</supplr> contains a 300 word description of the function prediction method used in each submission. Additional data file <supplr sid="S21">21</supplr> describes in detail the submission methods and the straw man classifier (57 pages in total).</p>
         <suppl id="S1">
            <title>
               <p>Additional data file 1</p>
            </title>
            <caption>
               <p>Bar graphs of pairwise comparisons of AUC within each evaluation category</p>
            </caption>
            <text>
               <p>Bar graphs of pairwise comparisons of AUC within each evaluation category.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S1.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S2">
            <title>
               <p>Additional data file 2</p>
            </title>
            <caption>
               <p>Bar graphs of mean P20R values within each evaluation category</p>
            </caption>
            <text>
               <p>Bar graphs of mean P20R values within each evaluation category.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S2.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S3">
            <title>
               <p>Additional data file 3</p>
            </title>
            <caption>
               <p>Bar graphs comparing properties of GO annotations in the held-out gene set, in the newly annotated gene set and in the training set</p>
            </caption>
            <text>
               <p>Bar graphs comparing properties of GO annotations in the held-out gene set, in the newly annotated gene set and in the training set.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S3.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S4">
            <title>
               <p>Additional data file 4</p>
            </title>
            <caption>
               <p>Clustergram indicating Pearson correlation coefficients of the P20R performance measure among different submissions</p>
            </caption>
            <text>
               <p>Clustergram indicating Pearson correlation coefficients of the P20R performance measure among different submissions.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S4.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S5">
            <title>
               <p>Additional data file 5</p>
            </title>
            <caption>
               <p>Heatmaps of precision at several recall values evaluated using held-out annotations on all GO terms within each of the 12 evaluation categories for each submission</p>
            </caption>
            <text>
               <p>Heatmaps of precision at several recall values evaluated using held-out annotations on all GO terms within each of the 12 evaluation categories for each submission.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S5.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S6">
            <title>
               <p>Additional data file 6</p>
            </title>
            <caption>
               <p>Heatmap of median precision at several recall values evaluated using held-out annotations within each of the 12 evaluation categories per submission</p>
            </caption>
            <text>
               <p>Heatmap of median precision at several recall values evaluated using held-out annotations within each of the 12 evaluation categories per submission.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S6.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S7">
            <title>
               <p>Additional data file 7</p>
            </title>
            <caption>
               <p>Performance measures for the initial round of GO term predictions within each evaluation category evaluated using held-out genes</p>
            </caption>
            <text>
               <p>Performance measures for the initial round of GO term predictions within each evaluation category evaluated using held-out genes.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S7.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S8">
            <title>
               <p>Additional data file 8</p>
            </title>
            <caption>
               <p>Performance measures for the initial round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation)</p>
            </caption>
            <text>
               <p>Performance measures for the initial round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation).</p>
            </text>
            <file name="gb-2008-9-s1-s2-S8.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S9">
            <title>
               <p>Additional data file 9</p>
            </title>
            <caption>
               <p>Performance measures for the second round of GO term predictions within each evaluation category evaluated using held-out genes</p>
            </caption>
            <text>
               <p>Performance measures for the second round of GO term predictions within each evaluation category evaluated using held-out genes.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S9.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S10">
            <title>
               <p>Additional data file 10</p>
            </title>
            <caption>
               <p>Performance measures for the second round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation)</p>
            </caption>
            <text>
               <p>Performance measures for the second round of GO term predictions within each evaluation category evaluated using the newly annotated genes (prospective evaluation).</p>
            </text>
            <file name="gb-2008-9-s1-s2-S10.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S11">
            <title>
               <p>Additional data file 11</p>
            </title>
            <caption>
               <p>Results of the analysis of variance in prediction performance</p>
            </caption>
            <text>
               <p>Results of the analysis of variance in prediction performance.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S11.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S12">
            <title>
               <p>Additional data file 12</p>
            </title>
            <caption>
               <p>Performance and variance on five subsets of the training data</p>
            </caption>
            <text>
               <p>Performance and variance on five subsets of the training data.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S12.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S13">
            <title>
               <p>Additional data file 13</p>
            </title>
            <caption>
               <p>Performance measures of the unified predictions for each GO term</p>
            </caption>
            <text>
               <p>Performance measures of the unified predictions for each GO term.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S13.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S14">
            <title>
               <p>Additional data file 14</p>
            </title>
            <caption>
               <p>High-scoring predictions evaluated against existing literature</p>
            </caption>
            <text>
               <p>High-scoring predictions evaluated against existing literature.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S14.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S15">
            <title>
               <p>Additional data file 15</p>
            </title>
            <caption>
               <p>Mitochondrial part predictions with data from a previous study <abbrgrp><abbr bid="B38">38</abbr></abbrgrp></p>
            </caption>
            <text>
               <p>Mitochondrial part predictions with data from a previous study <abbrgrp><abbr bid="B38">38</abbr></abbrgrp>.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S15.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S16">
            <title>
               <p>Additional data file 16</p>
            </title>
            <caption>
               <p>Data underlying Figure <figr fid="F6">6</figr></p>
            </caption>
            <text>
               <p>Data underlying Figure <figr fid="F6">6</figr>.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S16.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S17">
            <title>
               <p>Additional data file 17</p>
            </title>
            <caption>
               <p>Data underlying Figure <figr fid="F7">7</figr></p>
            </caption>
            <text>
               <p>Data underlying Figure <figr fid="F7">7</figr>.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S17.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S18">
            <title>
               <p>Additional data file 18</p>
            </title>
            <caption>
               <p>Performance measures for various individual evidence sources within each evaluation category evaluated using held-out genes</p>
            </caption>
            <text>
               <p>Performance measures for various individual evidence sources within each evaluation category evaluated using held-out genes.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S18.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S19">
            <title>
               <p>Additional data file 19</p>
            </title>
            <caption>
               <p>Fraction of GO terms with higher precision and recall than a given precision/recall point for the unified predictions</p>
            </caption>
            <text>
               <p>Fraction of GO terms with higher precision and recall than a given precision/recall point for the unified predictions.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S19.swf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S20">
            <title>
               <p>Additional data file 20</p>
            </title>
            <caption>
               <p>Description of the function prediction method used in each submission</p>
            </caption>
            <text>
               <p>Description of the function prediction method used in each submission.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S20.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S21">
            <title>
               <p>Additional data file 21</p>
            </title>
            <caption>
               <p>Detailed description of the submission methods and the straw man classifier</p>
            </caption>
            <text>
               <p>Detailed description of the submission methods and the straw man classifier.</p>
            </text>
            <file name="gb-2008-9-s1-s2-S21.pdf">
               <p>Click here for file</p>
            </file>
         </suppl>
      </sec>
   </bdy>
   <bm>
      <ack>
         <sec>
            <st>
               <p>Acknowledgements</p>
            </st>
            <p>Team A (GO, GL, JQ, CG, MJ, and WSN) was supported by NIH award R33 HG003070. Team B (HL, MD, TC, and FS) was supported by NIH/NSF joint mathematical biology initiative DMS-0241102 and NIH P50 HG 002790; HL is supported by the systems biology infrastructure establishment grant provided by Gwangju Institute of Science and Technology in 2008; MD is supported by the National Natural Science Foundation of China (No. 30570425), the National Key Basic Research Project of China (No. 2003CB715903), and Microsoft Research Asia (MSRA). Team C (SM, DW-F, CG, DR, and QM) was supported by an NSERC operating grant to QM as well as a Genome Canada grant administered by the Ontario Genomics Institute. Team D (YG, CLM, ZB, and OGT) was partially supported by NIH grant R01 GM071966 and NSF grant IIS-0513552 to OGT and NIGMS Center of Excellence grant P50 GM071508. Team E (WKK, CK, and EMM) was supported by grants from the NIH, the NSF, and the Packard and Welch Foundations. Team F (TJ, CZ, GNL, and DX) was supported by USDA/CSREES-2004-25604-14708 and NSF/ITR-IIS-0407204. Team G (MT, WT, FDG, GFB, and FPR) was supported by NIH grants (HG003224, HG0017115, HL81341, HG004233 and HG004098), by the Keck Foundation, and by NSF TeraGrid resources. Team H (YQ, JK, and ZB) was supported in part by National Science Foundation (NSF) grants EIA0225656, EIA0225636, CAREER CC044917 and National Institutes of Health (NIH) grant LM07994-01. Team I (ML and AP) warmly thanks A Vazquez for his support. DPH and JAB were supported by HG002273. LP-C and TRH were supported by a CIHR grant and thank O Meruvia for helping with the design of figures.</p>
            <p>This article has been published as part of <it>Genome Biology </it>Volume 9 Supplement 1, 2008: Quantitative inference of gene function from diverse large-scale datasets. The full contents of the supplement are available online at <url>http://genomebiology.com/supplements/9/S1</url>.</p>
         </sec>
      </ack>
      <refgrp>
         <bibl id="B1">
            <title>
               <p>Gene prioritization through genomic data fusion.</p>
            </title>
            <aug>
               <au>
                  <snm>Aerts</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Lambrechts</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Maity</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Van Loo</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Coessens</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>De Smet</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Tranchevent</snm>
                  <fnm>LC</fnm>
               </au>
               <au>
                  <snm>De Moor</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Marynen</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Hassan</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Carmeliet</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Moreau</snm>
                  <fnm>Y</fnm>
               </au>
            </aug>
            <source>Nat Biotechnol</source>
            <pubdate>2006</pubdate>
            <volume>24</volume>
            <fpage>537</fpage>
            <lpage>544</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">16680138</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B2">
            <title>
               <p>Global protein function annotation through mining genome-scale data in yeast <it>Saccharomyces cerevisiae</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Chen</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Xu</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2004</pubdate>
            <volume>32</volume>
            <fpage>6414</fpage>
            <lpage>6424</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">535686</pubid>
                  <pubid idtype="pmpid" link="fulltext">15585665</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B3">
            <title>
               <p>Genome-scale gene function prediction using multiple sources of high-throughput data in yeast <it>Saccharomyces cerevisiae</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Joshi</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Becker</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>Alexandrov</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Xu</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>OMICS</source>
            <pubdate>2004</pubdate>
            <volume>8</volume>
            <fpage>322</fpage>
            <lpage>333</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">15703479</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B4">
            <title>
               <p>Whole-genome annotation by using evidence integration in functional-linkage networks.</p>
            </title>
            <aug>
               <au>
                  <snm>Karaoz</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Murali</snm>
                  <fnm>TM</fnm>
               </au>
               <au>
                  <snm>Letovsky</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Zheng</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Ding</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Cantor</snm>
                  <fnm>CR</fnm>
               </au>
               <au>
                  <snm>Kasif</snm>
                  <fnm>S</fnm>
               </au>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>2004</pubdate>
            <volume>101</volume>
            <fpage>2888</fpage>
            <lpage>2893</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">365715</pubid>
                  <pubid idtype="pmpid" link="fulltext">14981259</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B5">
            <title>
               <p>A statistical framework for genomic data fusion.</p>
            </title>
            <aug>
               <au>
                  <snm>Lanckriet</snm>
                  <fnm>GR</fnm>
               </au>
               <au>
                  <snm>De Bie</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Cristianini</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Jordan</snm>
                  <fnm>MI</fnm>
               </au>
               <au>
                  <snm>Noble</snm>
                  <fnm>WS</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2004</pubdate>
            <volume>20</volume>
            <fpage>2626</fpage>
            <lpage>2635</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">15130933</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B6">
            <title>
               <p>VIRGO: computational prediction of gene functions.</p>
            </title>
            <aug>
               <au>
                  <snm>Massjouni</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Rivera</snm>
                  <fnm>CG</fnm>
               </au>
               <au>
                  <snm>Murali</snm>
                  <fnm>TM</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2006</pubdate>
            <volume>34</volume>
            <fpage>W340</fpage>
            <lpage>344</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1538839</pubid>
                  <pubid idtype="pmpid" link="fulltext">16845022</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B7">
            <title>
               <p>Discovery of biological networks from diverse functional genomic data.</p>
            </title>
            <aug>
               <au>
                  <snm>Myers</snm>
                  <fnm>CL</fnm>
               </au>
               <au>
                  <snm>Robson</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Wible</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Hibbs</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Chiriac</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Theesfeld</snm>
                  <fnm>CL</fnm>
               </au>
               <au>
                  <snm>Dolinski</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Troyanskaya</snm>
                  <fnm>OG</fnm>
               </au>
            </aug>
            <source>Genome Biol</source>
            <pubdate>2005</pubdate>
            <volume>6</volume>
            <fpage>R114</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1414113</pubid>
                  <pubid idtype="pmpid" link="fulltext">16420673</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B8">
            <title>
               <p>Connectionist approaches for predicting mouse gene function from gene expression.</p>
            </title>
            <aug>
               <au>
                  <snm>Shenouda</snm>
                  <fnm>EAMA</fnm>
               </au>
               <au>
                  <snm>Morris</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Bonner</snm>
                  <fnm>AJ</fnm>
               </au>
            </aug>
            <source>Neural Information Processing: 13th International Conference, ICONIP 2006, Hong Kong, China, October 3-6, 2006, Proceedings</source>
            <publisher>Heidelberg, Berlin; Springer</publisher>
            <editor>King I, Wang J, Chan L, Wang DL</editor>
            <pubdate>2006</pubdate>
            <fpage>280</fpage>
            <lpage>289</lpage>
         </bibl>
         <bibl id="B9">
            <title>
               <p>A Bayesian framework for combining heterogeneous data sources for gene function prediction (in <it>Saccharomyces cerevisiae</it>).</p>
            </title>
            <aug>
               <au>
                  <snm>Troyanskaya</snm>
                  <fnm>OG</fnm>
               </au>
               <au>
                  <snm>Dolinski</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Owen</snm>
                  <fnm>AB</fnm>
               </au>
               <au>
                  <snm>Altman</snm>
                  <fnm>RB</fnm>
               </au>
               <au>
                  <snm>Botstein</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>2003</pubdate>
            <volume>100</volume>
            <fpage>8348</fpage>
            <lpage>8353</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">166232</pubid>
                  <pubid idtype="pmpid" link="fulltext">12826619</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B10">
            <title>
               <p>A regression-based K nearest neighbor algorithm for gene function prediction from heterogeneous data.</p>
            </title>
            <aug>
               <au>
                  <snm>Yao</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Ruzzo</snm>
                  <fnm>WL</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2006</pubdate>
            <volume>7</volume>
            <issue>suppl 1</issue>
            <fpage>S11</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1810312</pubid>
                  <pubid idtype="pmpid" link="fulltext">16723004</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B11">
            <title>
               <p>Gene trap mutagenesis.</p>
            </title>
            <aug>
               <au>
                  <snm>Abuin</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Hansen</snm>
                  <fnm>GM</fnm>
               </au>
               <au>
                  <snm>Zambrowicz</snm>
                  <fnm>B</fnm>
               </au>
            </aug>
            <source>Handb Exp Pharmacol</source>
            <pubdate>2007</pubdate>
            <volume>178</volume>
            <fpage>129</fpage>
            <lpage>147</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">17203654</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B12">
            <title>
               <p>A mouse for all reasons.</p>
            </title>
            <aug>
               <au>
                  <snm>Collins</snm>
                  <fnm>FS</fnm>
               </au>
               <au>
                  <snm>Rossant</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Wurst</snm>
                  <fnm>W</fnm>
               </au>
            </aug>
            <source>Cell</source>
            <pubdate>2007</pubdate>
            <volume>128</volume>
            <fpage>9</fpage>
            <lpage>13</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">17218247</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B13">
            <title>
               <p>Predicting protein function from protein/protein interaction data: a probabilistic approach.</p>
            </title>
            <aug>
               <au>
                  <snm>Letovsky</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Kasif</snm>
                  <fnm>S</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2003</pubdate>
            <volume>19</volume>
            <issue>suppl 1</issue>
            <fpage>i197</fpage>
            <lpage>204</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12855458</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B14">
            <title>
               <p>Assessing the limits of genomic data integration for predicting protein networks.</p>
            </title>
            <aug>
               <au>
                  <snm>Lu</snm>
                  <fnm>LJ</fnm>
               </au>
               <au>
                  <snm>Xia</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Paccanaro</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Yu</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Gerstein</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2005</pubdate>
            <volume>15</volume>
            <fpage>945</fpage>
            <lpage>953</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1172038</pubid>
                  <pubid idtype="pmpid" link="fulltext">15998909</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B15">
            <title>
               <p>Integrative analysis of genome-wide experiments in the context of a large high-throughput data compendium.</p>
            </title>
            <aug>
               <au>
                  <snm>Tanay</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Steinfeld</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Kupiec</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Shamir</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Mol Syst Biol</source>
            <pubdate>2005</pubdate>
            <volume>1</volume>
            <fpage>2005.0002</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1681453</pubid>
                  <pubid idtype="pmpid" link="fulltext">16729037</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B16">
            <title>
               <p>Functional bioinformatics for <it>Arabidopsis thaliana</it>.</p>
            </title>
            <aug>
               <au>
                  <snm>Clare</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Karwath</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Ougham</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>King</snm>
                  <fnm>RD</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2006</pubdate>
            <volume>22</volume>
            <fpage>1130</fpage>
            <lpage>1136</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">16481336</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B17">
            <title>
               <p>Accurate prediction of protein functional class from sequence in the <it>Mycobacterium tuberculosis</it> and <it>Escherichia coli</it> genomes using data mining.</p>
            </title>
            <aug>
               <au>
                  <snm>King</snm>
                  <fnm>RD</fnm>
               </au>
               <au>
                  <snm>Karwath</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Clare</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Dehaspe</snm>
                  <fnm>L</fnm>
               </au>
            </aug>
            <source>Yeast</source>
            <pubdate>2000</pubdate>
            <volume>17</volume>
            <fpage>283</fpage>
            <lpage>293</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">11119305</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B18">
            <title>
               <p>Predicting gene function by conserved co-expression.</p>
            </title>
            <aug>
               <au>
                  <snm>van Noort</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Snel</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Huynen</snm>
                  <fnm>MA</fnm>
               </au>
            </aug>
            <source>Trends Genet</source>
            <pubdate>2003</pubdate>
            <volume>19</volume>
            <fpage>238</fpage>
            <lpage>242</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12711213</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B19">
            <title>
               <p>Network motifs: simple building blocks of complex networks.</p>
            </title>
            <aug>
               <au>
                  <snm>Milo</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Shen-Orr</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Itzkovitz</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Kashtan</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Chklovskii</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Alon</snm>
                  <fnm>U</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2002</pubdate>
            <volume>298</volume>
            <fpage>824</fpage>
            <lpage>827</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12399590</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B20">
            <title>
               <p>Systems biology. Life's complexity pyramid.</p>
            </title>
            <aug>
               <au>
                  <snm>Oltvai</snm>
                  <fnm>ZN</fnm>
               </au>
               <au>
                  <snm>Barabasi</snm>
                  <fnm>AL</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2002</pubdate>
            <volume>298</volume>
            <fpage>763</fpage>
            <lpage>764</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12399572</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B21">
            <title>
               <p>The functional landscape of mouse gene expression.</p>
            </title>
            <aug>
               <au>
                  <snm>Zhang</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Morris</snm>
                  <fnm>QD</fnm>
               </au>
               <au>
                  <snm>Chang</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Shai</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Bakowski</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Mitsakakis</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Mohammad</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Robinson</snm>
                  <fnm>MD</fnm>
               </au>
               <au>
                  <snm>Zirngibl</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Somogyi</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Laurin</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Eftekharpour</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Sat</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Grigull</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Pan</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Peng</snm>
                  <fnm>WT</fnm>
               </au>
               <au>
                  <snm>Krogan</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Greenblatt</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Fehlings</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>van der Kooy</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Aubin</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Bruneau</snm>
                  <fnm>BG</fnm>
               </au>
               <au>
                  <snm>Rossant</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Blencowe</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Frey</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Hughes</snm>
                  <fnm>TR</fnm>
               </au>
            </aug>
            <source>J Biol</source>
            <pubdate>2004</pubdate>
            <volume>3</volume>
            <fpage>21</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">549719</pubid>
                  <pubid idtype="pmpid" link="fulltext">15588312</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B22">
            <title>
               <p>Computational protein function prediction: Are we making progress?</p>
            </title>
            <aug>
               <au>
                  <snm>Godzik</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Jambon</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Friedberg</snm>
                  <fnm>I</fnm>
               </au>
            </aug>
            <source>Cell Mol Life Sci</source>
            <pubdate>2007</pubdate>
            <volume>64</volume>
            <fpage>2505</fpage>
            <lpage>2511</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">17611711</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B23">
            <title>
               <p>The art of gene function prediction.</p>
            </title>
            <aug>
               <au>
                  <snm>Murali</snm>
                  <fnm>TM</fnm>
               </au>
               <au>
                  <snm>Wu</snm>
                  <fnm>CJ</fnm>
               </au>
               <au>
                  <snm>Kasif</snm>
                  <fnm>S</fnm>
               </au>
            </aug>
            <source>Nat Biotechnol</source>
            <pubdate>2006</pubdate>
            <volume>24</volume>
            <fpage>1474</fpage>
            <lpage>1475</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">17160037</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B24">
            <title>
               <p>Finding function: evaluation methods for functional genomic data.</p>
            </title>
            <aug>
               <au>
                  <snm>Myers</snm>
                  <fnm>CL</fnm>
               </au>
               <au>
                  <snm>Barrett</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Hibbs</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Huttenhower</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Troyanskaya</snm>
                  <fnm>OG</fnm>
               </au>
            </aug>
            <source>BMC Genomics</source>
            <pubdate>2006</pubdate>
            <volume>7</volume>
            <fpage>187</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1560386</pubid>
                  <pubid idtype="pmpid" link="fulltext">16869964</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B25">
            <title>
               <p>Computational analyses of high-throughput protein-protein interaction data.</p>
            </title>
            <aug>
               <au>
                  <snm>Chen</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Xu</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Curr Protein Pept Sci</source>
            <pubdate>2003</pubdate>
            <volume>4</volume>
            <fpage>159</fpage>
            <lpage>181</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12769716</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B26">
            <title>
               <p>Treasures and traps in genome-wide data sets: case examples from yeast.</p>
            </title>
            <aug>
               <au>
                  <snm>Grunenfelder</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Winzeler</snm>
                  <fnm>EA</fnm>
               </au>
            </aug>
            <source>Nat Rev Genet</source>
            <pubdate>2002</pubdate>
            <volume>3</volume>
            <fpage>653</fpage>
            <lpage>661</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12209140</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B27">
            <title>
               <p>Gene function on a genomic scale.</p>
            </title>
            <aug>
               <au>
                  <snm>Steinmetz</snm>
                  <fnm>LM</fnm>
               </au>
               <au>
                  <snm>Deutschbauer</snm>
                  <fnm>AM</fnm>
               </au>
            </aug>
            <source>J Chromatogr B Analyt Technol Biomed Life Sci</source>
            <pubdate>2002</pubdate>
            <volume>782</volume>
            <fpage>151</fpage>
            <lpage>163</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12458004</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B28">
            <title>
               <p>Online Predicted Human Interaction Database.</p>
            </title>
            <aug>
               <au>
                  <snm>Brown</snm>
                  <fnm>KR</fnm>
               </au>
               <au>
                  <snm>Jurisica</snm>
                  <fnm>I</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2005</pubdate>
            <volume>21</volume>
            <fpage>2076</fpage>
            <lpage>2082</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">15657099</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B29">
            <title>
               <p>Annotation transfer between genomes: protein-protein interologs and protein-DNA regulogs.</p>
            </title>
            <aug>
               <au>
                  <snm>Yu</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Luscombe</snm>
                  <fnm>NM</fnm>
               </au>
               <au>
                  <snm>Lu</snm>
                  <fnm>HX</fnm>
               </au>
               <au>
                  <snm>Zhu</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Xia</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Han</snm>
                  <fnm>JD</fnm>
               </au>
               <au>
                  <snm>Bertin</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Chung</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Vidal</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Gerstein</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>1107</fpage>
            <lpage>1118</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">419789</pubid>
                  <pubid idtype="pmpid" link="fulltext">15173116</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B30">
            <title>
               <p>MouseFunc I</p>
            </title>
            <url>http://hugheslab.med.utoronto.ca/supplementary-data/mouseFunc_I/</url>
         </bibl>
         <bibl id="B31">
            <title>
               <p>Gene ontology: tool for the unification of biology.</p>
            </title>
            <aug>
               <au>
                  <cnm>The Gene Ontology Consortium</cnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2000</pubdate>
            <volume>25</volume>
            <fpage>25</fpage>
            <lpage>29</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">10802651</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B32">
            <title>
               <p>The Jackson Laboratory Mouse Genome Database (MGD), Mouse Genome Informatics Web Site</p>
            </title>
            <url>http://www.informatics.jax.org</url>
         </bibl>
         <bibl id="B33">
            <title>
               <p>Guide to GO Evidence Codes</p>
            </title>
            <url>http://www.geneontology.org/GO.evidence.shtml</url>
         </bibl>
         <bibl id="B34">
            <title>
               <p>A method of comparing the areas under receiver operating characteristic curves derived from the same cases.</p>
            </title>
            <aug>
               <au>
                  <snm>Hanley</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>McNeil</snm>
                  <fnm>BJ</fnm>
               </au>
            </aug>
            <source>Radiology</source>
            <pubdate>1983</pubdate>
            <volume>148</volume>
            <fpage>839</fpage>
            <lpage>843</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">6878708</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B35">
            <title>
               <p>A combined algorithm for genome-wide prediction of protein function.</p>
            </title>
            <aug>
               <au>
                  <snm>Marcotte</snm>
                  <fnm>EM</fnm>
               </au>
               <au>
                  <snm>Pellegrini</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Thompson</snm>
                  <fnm>MJ</fnm>
               </au>
               <au>
                  <snm>Yeates</snm>
                  <fnm>TO</fnm>
               </au>
               <au>
                  <snm>Eisenberg</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Nature</source>
            <pubdate>1999</pubdate>
            <volume>402</volume>
            <fpage>83</fpage>
            <lpage>86</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">10573421</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B36">
            <title>
               <p>Modulation of T cell development and activation by novel members of the Schlafen (slfn) gene family harbouring an RNA helicase-like motif.</p>
            </title>
            <aug>
               <au>
                  <snm>Geserick</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Kaiser</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Klemm</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Kaufmann</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Zerrahn</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Int Immunol</source>
            <pubdate>2004</pubdate>
            <volume>16</volume>
            <fpage>1535</fpage>
            <lpage>1548</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">15351786</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B37">
            <title>
               <p>MFunc: Predictions of Gene Function</p>
            </title>
            <url>http://llama.med.harvard.edu/cgi/Mfunc/mfunc.py</url>
         </bibl>
         <bibl id="B38">
            <title>
               <p>Global survey of organ and organelle protein expression in mouse: combined proteomic and transcriptomic profiling.</p>
            </title>
            <aug>
               <au>
                  <snm>Kislinger</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Cox</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Kannan</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Chung</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Hu</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Ignatchenko</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Scott</snm>
                  <fnm>MS</fnm>
               </au>
               <au>
                  <snm>Gramolini</snm>
                  <fnm>AO</fnm>
               </au>
               <au>
                  <snm>Morris</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Hallett</snm>
                  <fnm>MT</fnm>
               </au>
               <au>
                  <snm>Rossant</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hughes</snm>
                  <fnm>TR</fnm>
               </au>
               <au>
                  <snm>Frey</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Emili</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Cell</source>
            <pubdate>2006</pubdate>
            <volume>125</volume>
            <fpage>173</fpage>
            <lpage>186</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">16615898</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B39">
            <title>
               <p>Transcript annotation in FANTOM3: mouse gene catalog based on physical cDNAs.</p>
            </title>
            <aug>
               <au>
                  <snm>Maeda</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Kasukawa</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Oyama</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Gough</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Frith</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Engstrom</snm>
                  <fnm>PG</fnm>
               </au>
               <au>
                  <snm>Lenhard</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Aturaliya</snm>
                  <fnm>RN</fnm>
               </au>
               <au>
                  <snm>Batalov</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Beisel</snm>
                  <fnm>KW</fnm>
               </au>
               <au>
                  <snm>Bult</snm>
                  <fnm>CJ</fnm>
               </au>
               <au>
                  <snm>Fletcher</snm>
                  <fnm>CF</fnm>
               </au>
               <au>
                  <snm>Forrest</snm>
                  <fnm>AR</fnm>
               </au>
               <au>
                  <snm>Furuno</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Hill</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Itoh</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kanamori-Katayama</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Katayama</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Katoh</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kawashima</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Quackenbush</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Ravasi</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Ring</snm>
                  <fnm>BZ</fnm>
               </au>
               <au>
                  <snm>Shibata</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Sugiura</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Takenaka</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Teasdale</snm>
                  <fnm>RD</fnm>
               </au>
               <au>
                  <snm>Wells</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>Zhu</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Kai</snm>
                  <fnm>C</fnm>
               </au>
               <etal/>
            </aug>
            <source>PLoS Genet</source>
            <pubdate>2006</pubdate>
            <volume>2</volume>
            <fpage>e62</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1449903</pubid>
                  <pubid idtype="pmpid" link="fulltext">16683036</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B40">
            <title>
               <p>On combining classifiers.</p>
            </title>
            <aug>
               <au>
                  <snm>Kittler</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hatef</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Duin</snm>
                  <fnm>RPW</fnm>
               </au>
               <au>
                  <snm>Matas</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>IEEE Trans Pattern Anal Mach Intell</source>
            <pubdate>1998</pubdate>
            <volume>20</volume>
            <fpage>226</fpage>
            <lpage>239</lpage>
         </bibl>
         <bibl id="B41">
            <title>
               <p>A panoramic view of yeast noncoding RNA processing.</p>
            </title>
            <aug>
               <au>
                  <snm>Peng</snm>
                  <fnm>WT</fnm>
               </au>
               <au>
                  <snm>Robinson</snm>
                  <fnm>MD</fnm>
               </au>
               <au>
                  <snm>Mnaimneh</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Krogan</snm>
                  <fnm>NJ</fnm>
               </au>
               <au>
                  <snm>Cagney</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Morris</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Davierwala</snm>
                  <fnm>AP</fnm>
               </au>
               <au>
                  <snm>Grigull</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Yang</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Mitsakakis</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Ryan</snm>
                  <fnm>OW</fnm>
               </au>
               <au>
                  <snm>Datta</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Jojic</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Pal</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Canadien</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Richards</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Beattie</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Wu</snm>
                  <fnm>LF</fnm>
               </au>
               <au>
                  <snm>Altschuler</snm>
                  <fnm>SJ</fnm>
               </au>
               <au>
                  <snm>Roweis</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Frey</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Emili</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Greenblatt</snm>
                  <fnm>JF</fnm>
               </au>
               <au>
                  <snm>Hughes</snm>
                  <fnm>TR</fnm>
               </au>
            </aug>
            <source>Cell</source>
            <pubdate>2003</pubdate>
            <volume>113</volume>
            <fpage>919</fpage>
            <lpage>933</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">12837249</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B42">
            <title>
               <p>An introduction to ROC analysis.</p>
            </title>
            <aug>
               <au>
                  <snm>Fawcett</snm>
                  <fnm>T</fnm>
               </au>
            </aug>
            <source>Pattern Recognit Lett</source>
            <pubdate>2006</pubdate>
            <volume>27</volume>
            <fpage>861</fpage>
            <lpage>874</lpage>
         </bibl>
         <bibl id="B43">
            <title>
               <p>The relationship between Precision-Recall and ROC curves.</p>
            </title>
            <aug>
               <au>
                  <snm>Davis</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Goadrich</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Proceedings of the 23rd International Conference on Machine Learning: June 25-29, 2006; Pittsburgh, Pennsylvania</source>
            <publisher>New York: ACM Press</publisher>
            <editor>Cohen WW, Moore A</editor>
            <pubdate>2006</pubdate>
            <fpage>233</fpage>
            <lpage>240</lpage>
         </bibl>
         <bibl id="B44">
            <title>
               <p>A gene atlas of the mouse and human protein-encoding transcriptomes.</p>
            </title>
            <aug>
               <au>
                  <snm>Su</snm>
                  <fnm>AI</fnm>
               </au>
               <au>
                  <snm>Wiltshire</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Batalov</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Lapp</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Ching</snm>
                  <fnm>KA</fnm>
               </au>
               <au>
                  <snm>Block</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Zhang</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Soden</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Hayakawa</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kreiman</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Cooke</snm>
                  <fnm>MP</fnm>
               </au>
               <au>
                  <snm>Walker</snm>
                  <fnm>JR</fnm>
               </au>
               <au>
                  <snm>Hogenesch</snm>
                  <fnm>JB</fnm>
               </au>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>2004</pubdate>
            <volume>101</volume>
            <fpage>6062</fpage>
            <lpage>6067</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">395923</pubid>
                  <pubid idtype="pmpid" link="fulltext">15075390</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B45">
            <title>
               <p>A mouse atlas of gene expression: large-scale digital gene-expression profiles from precisely defined developing C57BL/6J mouse tissues and cells.</p>
            </title>
            <aug>
               <au>
                  <snm>Siddiqui</snm>
                  <fnm>AS</fnm>
               </au>
               <au>
                  <snm>Khattra</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Delaney</snm>
                  <fnm>AD</fnm>
               </au>
               <au>
                  <snm>Zhao</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Astell</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Asano</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Babakaiff</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Barber</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Beland</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Bohacec</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Brown-John</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Chand</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Charest</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Charters</snm>
                  <fnm>AM</fnm>
               </au>
               <au>
                  <snm>Cullum</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Dhalla</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Featherstone</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Gerhard</snm>
                  <fnm>DS</fnm>
               </au>
               <au>
                  <snm>Hoffman</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Holt</snm>
                  <fnm>RA</fnm>
               </au>
               <au>
                  <snm>Hou</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Kuo</snm>
                  <fnm>BY</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>LL</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Leung</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Ma</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Matsuo</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Mayo</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>McDonald</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Prabhu</snm>
                  <fnm>AL</fnm>
               </au>
               <etal/>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>2005</pubdate>
            <volume>102</volume>
            <fpage>18485</fpage>
            <lpage>18490</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1311911</pubid>
                  <pubid idtype="pmpid" link="fulltext">16352711</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B46">
            <title>
               <p>Pfam: clans, web tools and services.</p>
            </title>
            <aug>
               <au>
                  <snm>Finn</snm>
                  <fnm>RD</fnm>
               </au>
               <au>
                  <snm>Mistry</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Schuster-B&#246;ckler</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Griffiths-Jones</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Hollich</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Lassmann</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Moxon</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Marshall</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Khanna</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Durbin</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Eddy</snm>
                  <fnm>SR</fnm>
               </au>
               <au>
                  <snm>Sonnhammer</snm>
                  <fnm>EL</fnm>
               </au>
               <au>
                  <snm>Bateman</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2006</pubdate>
            <volume>34</volume>
            <fpage>D247</fpage>
            <lpage>D251</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1347511</pubid>
                  <pubid idtype="pmpid" link="fulltext">16381856</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B47">
            <title>
               <p>InterPro, progress and status in 2005.</p>
            </title>
            <aug>
               <au>
                  <snm>Mulder</snm>
                  <fnm>NJ</fnm>
               </au>
               <au>
                  <snm>Apweiler</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Attwood</snm>
                  <fnm>TK</fnm>
               </au>
               <au>
                  <snm>Bairoch</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Bateman</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Binns</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Bradley</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Bork</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Bucher</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Cerutti</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Copley</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Courcelle</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Das</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Durbin</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Fleischmann</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Gough</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Haft</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Harte</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Hulo</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Kahn</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Kanapin</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Krestyaninova</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Lonsdale</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Lopez</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Letunic</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Madera</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Maslen</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>McDowall</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Mitchell</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Nikolskaya</snm>
                  <fnm>AN</fnm>
               </au>
               <etal/>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>D201</fpage>
            <lpage>D205</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">540060</pubid>
                  <pubid idtype="pmpid" link="fulltext">15608177</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B48">
            <title>
               <p>The mouse genome database (MGD): new features facilitating a model system.</p>
            </title>
            <aug>
               <au>
                  <snm>Eppig</snm>
                  <fnm>JT</fnm>
               </au>
               <au>
                  <snm>Blake</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>Bult</snm>
                  <fnm>CJ</fnm>
               </au>
               <au>
                  <snm>Kadin</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>Richardson</snm>
                  <fnm>JE</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2007</pubdate>
            <volume>35</volume>
            <fpage>D630</fpage>
            <lpage>D637</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1751527</pubid>
                  <pubid idtype="pmpid" link="fulltext">17135206</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B49">
            <title>
               <p>Phenotype Annotations from MGI</p>
            </title>
            <url>http://ftp.informatics.jax.org/pub/reports</url>
         </bibl>
         <bibl id="B50">
            <title>
               <p>EnsMart: a generic system for fast and flexible access to biological data.</p>
            </title>
            <aug>
               <au>
                  <snm>Kasprzyk</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Keefe</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Smedley</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>London</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Spooner</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Melsopp</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Hammond</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Rocca-Serra</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Cox</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Birney</snm>
                  <fnm>E</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>160</fpage>
            <lpage>169</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">314293</pubid>
                  <pubid idtype="pmpid" link="fulltext">14707178</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B51">
            <title>
               <p>Inparanoid: a comprehensive database of eukaryotic orthologs.</p>
            </title>
            <aug>
               <au>
                  <snm>O'Brien</snm>
                  <fnm>KP</fnm>
               </au>
               <au>
                  <snm>Remm</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Sonnhammer</snm>
                  <fnm>EL</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>D476</fpage>
            <lpage>D480</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">540061</pubid>
                  <pubid idtype="pmpid" link="fulltext">15608241</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B52">
            <title>
               <p>Database resources of the National Center for Biotechnology Information.</p>
            </title>
            <aug>
               <au>
                  <snm>Wheeler</snm>
                  <fnm>DL</fnm>
               </au>
               <au>
                  <snm>Barrett</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Benson</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Bryant</snm>
                  <fnm>SH</fnm>
               </au>
               <au>
                  <snm>Canese</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Chetvernin</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Church</snm>
                  <fnm>DM</fnm>
               </au>
               <au>
                  <snm>DiCuccio</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Edgar</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Federhen</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Geer</snm>
                  <fnm>LY</fnm>
               </au>
               <au>
                  <snm>Kapustin</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Khovayko</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Landsman</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Lipman</snm>
                  <fnm>DJ</fnm>
               </au>
               <au>
                  <snm>Madden</snm>
                  <fnm>TL</fnm>
               </au>
               <au>
                  <snm>Maglott</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Ostell</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Miller</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Pruitt</snm>
                  <fnm>KD</fnm>
               </au>
               <au>
                  <snm>Schuler</snm>
                  <fnm>GD</fnm>
               </au>
               <au>
                  <snm>Sequeira</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Sherry</snm>
                  <fnm>ST</fnm>
               </au>
               <au>
                  <snm>Sirotkin</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Souvorov</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Starchenko</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Tatusov</snm>
                  <fnm>RL</fnm>
               </au>
               <au>
                  <snm>Tatusova</snm>
                  <fnm>TA</fnm>
               </au>
               <au>
                  <snm>Wagner</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Yaschenko</snm>
                  <fnm>E</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2007</pubdate>
            <volume>35</volume>
            <fpage>D5</fpage>
            <lpage>D12</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1781113</pubid>
                  <pubid idtype="pmpid" link="fulltext">17170002</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B53">
            <title>
               <p>Online Mendelian Inheritance in Man (OMIM), a knowledgebase of human genes and genetic disorders.</p>
            </title>
            <aug>
               <au>
                  <snm>Hamosh</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Scott</snm>
                  <fnm>AF</fnm>
               </au>
               <au>
                  <snm>Amberger</snm>
                  <fnm>JS</fnm>
               </au>
               <au>
                  <snm>Bocchini</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>McKusick</snm>
                  <fnm>VA</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>D514</fpage>
            <lpage>D517</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">539987</pubid>
                  <pubid idtype="pmpid" link="fulltext">15608251</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B54">
            <title>
               <p>Disease Associations from OMIM</p>
            </title>
            <url>http://ftp.ncbi.nih.gov/repository/OMIM/</url>
         </bibl>
         <bibl id="B55">
            <title>
               <p>Diffusion kernel-based logistic regression models for protein function prediction.</p>
            </title>
            <aug>
               <au>
                  <snm>Lee</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Tu</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Deng</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Sun</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>T</fnm>
               </au>
            </aug>
            <source>OMICS</source>
            <pubdate>2006</pubdate>
            <volume>10</volume>
            <fpage>40</fpage>
            <lpage>55</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">16584317</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B56">
            <title>
               <p>Hierarchical multi-label prediction of gene function.</p>
            </title>
            <aug>
               <au>
                  <snm>Barutcuoglu</snm>
                  <fnm>Z</fnm>
               </au>
               <au>
                  <snm>Schapire</snm>
                  <fnm>RE</fnm>
               </au>
               <au>
                  <snm>Troyanskaya</snm>
                  <fnm>OG</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2006</pubdate>
            <volume>22</volume>
            <fpage>830</fpage>
            <lpage>836</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">16410319</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B57">
            <title>
               <p>Predicting protein functions with message passing algorithms.</p>
            </title>
            <aug>
               <au>
                  <snm>Leone</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Pagnani</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2005</pubdate>
            <volume>21</volume>
            <fpage>239</fpage>
            <lpage>247</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">15377508</pubid>
            </xrefbib>
         </bibl>
      </refgrp>
   </bm>
</art>
