<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
   <ui>1471-2105-9-S10-O8</ui>
   <ji>1471-2105</ji>
   <fm>
      <dochead>Oral presentation</dochead>
      <bibl>
         <title>
            <p>Revealing sequence variation patterns in rice with machine learning methods</p>
         </title>
         <aug>
            <au id="A1" ca="yes">
               <snm>Bohnert</snm>
               <fnm>Regina</fnm>
               <insr iid="I1"/>
               <email>Regina.Bohnert@tuebingen.mpg.de</email>
            </au>
            <au id="A2">
               <snm>Zeller</snm>
               <fnm>Georg</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
            </au>
            <au id="A3">
               <snm>Clark</snm>
               <mi>M</mi>
               <fnm>Richard</fnm>
               <insr iid="I2"/>
               <insr iid="I3"/>
            </au>
            <au id="A4">
               <snm>Childs</snm>
               <mi>L</mi>
               <fnm>Kevin</fnm>
               <insr iid="I4"/>
            </au>
            <au id="A5">
               <snm>Ulat</snm>
               <fnm>Victor</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A6">
               <snm>Stokowski</snm>
               <fnm>Renee</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A7">
               <snm>Ballinger</snm>
               <fnm>Dennis</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A8">
               <snm>Frazer</snm>
               <fnm>Kelly</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A9">
               <snm>Cox</snm>
               <fnm>David</fnm>
               <insr iid="I6"/>
            </au>
            <au id="A10">
               <snm>Bruskiewich</snm>
               <fnm>Richard</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A11">
               <snm>Buell</snm>
               <fnm>C Robin</fnm>
               <insr iid="I4"/>
            </au>
            <au id="A12">
               <snm>Leach</snm>
               <fnm>Jan</fnm>
               <insr iid="I7"/>
            </au>
            <au id="A13">
               <snm>Leung</snm>
               <fnm>Hei</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A14">
               <snm>McNally</snm>
               <mi>L</mi>
               <fnm>Kenneth</fnm>
               <insr iid="I5"/>
            </au>
            <au id="A15">
               <snm>Weigel</snm>
               <fnm>Detlef</fnm>
               <insr iid="I2"/>
            </au>
            <au id="A16">
               <snm>R&#228;tsch</snm>
               <fnm>Gunnar</fnm>
               <insr iid="I1"/>
            </au>
         </aug>
         <insg>
            <ins id="I1">
               <p>Friedrich Miescher Laboratory, Max Planck Society, 72076 T&#252;bingen, Germany</p>
            </ins>
            <ins id="I2">
               <p>Department of Molecular Biology, Max Planck Institute for Developmental Biology, 72076 T&#252;bingen, Germany</p>
            </ins>
            <ins id="I3">
               <p>Department of Biology, University of Utah, Salt Lake City, UT 84112, USA</p>
            </ins>
            <ins id="I4">
               <p>Department of Plant Biology, Michigan State University, East Lansing, MI 48824, USA</p>
            </ins>
            <ins id="I5">
               <p>International Rice Research Institute, Metro Manila, The Philippines</p>
            </ins>
            <ins id="I6">
               <p>Perlegen Sciences, Inc., Mountain View, California, CA 94043, USA</p>
            </ins>
            <ins id="I7">
               <p>Bioagricultural Sciences and Pest Management, Colorado State University, Colorado, CO 80523, USA</p>
            </ins>
         </insg>
         <source>BMC Bioinformatics</source>
         <supplement>
            <title>
               <p>Highlights from the Fourth International Society for Computational Biology (ISCB) Student Council Symposium</p>
            </title>
            <editor>Lucia Peixoto, Nils Gehlenborg and Sarath Chandra Janga</editor>
            <note>Meeting abstracts &#8211; A single PDF containing all abstracts in this Supplement is available <a href="http://www.biomedcentral.com/content/pdf/1471-2105-9-S10-full.pdf">here</a>.</note>
            <url>http://www.biomedcentral.com/content/pdf/1471-2105-9-S10-info.pdf</url>
         </supplement>
         <conference>
            <title>
               <p>Fourth International Society for Computational Biology (ISCB) Student Council Symposium</p>
            </title>
            <location>Toronto, Canada</location>
            <date-range>18 July 2008</date-range>
            <url>http://www.iscbsc.org</url>
         </conference>
         <issn>1471-2105</issn>
         <pubdate>2008</pubdate>
         <volume>9</volume>
         <issue>Suppl 10</issue>
         <fpage>O8</fpage>
         <url>http://www.biomedcentral.com/1471-2105/9/S10/O8</url>
         <xrefbib>
            <pubid idtype="doi">10.1186/1471-2105-9-S10-O8</pubid>
         </xrefbib>
      </bibl>
      <history>
         <pub>
            <date>
               <day>30</day>
               <month>10</month>
               <year>2008</year>
            </date>
         </pub>
      </history>
      <cpyrt>
         <year>2008</year>
         <collab>Bohnert et al; licensee BioMed Central Ltd</collab>
      </cpyrt>
   </fm>
   <bdy>
      <sec>
         <st>
            <p>Motivation</p>
         </st>
         <p>The major breakthrough at the turn of the millennium was the completion of genome sequences for individuals from many species, including human, worm and rice. More recently, it has also been important to describe sequence variation within one species, providing the first step towards the linkage of genetic variation to traits.</p>
         <p>Today, rice is the most important source of human caloric intake, making up 20% of the calorie supply and feeding millions of people daily. A more detailed understanding of the molecular assembly of phenotypic rice varieties will therefore be essential for future improvement in rice cultivation and breeding. In order to reveal patterns of sequence variation in <it>Oryza sativa </it>(rice), the non-repetitive portion of the genomes of 20 diverse rice cultivars was resequenced, in collaboration with Perlegen Sciences, Inc., using a high-density oligonucleotide microarray technology.</p>
      </sec>
      <sec>
         <st>
            <p>Methods</p>
         </st>
         <p>Based on experience gained in polymorphism studies for <it>Arabidopsis thaliana </it><abbrgrp><abbr bid="B1">1</abbr></abbrgrp>, we developed a method for identifying single nucleotide polymorphisms (SNPs) from the array data using Support Vector Machines (SVMs). In a two-layered approach, we trained SVMs to discriminate between SNP and non-SNP positions using information from each cultivar and, in a second step, across all cultivars.</p>
         <p>Wherever several SNPs or deletion/insertion polymorphisms occur in close vicinity, the hybridisation is suppressed and SNP calling in these regions becomes infeasible. We therefore adapted a machine learning method for sequence segmentation <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr></abbrgrp> to predict <it>highly polymorphic </it>regions in <it>O. sativa </it>(cf. Figure <figr fid="F1">1</figr>). These regions can then be analysed in more detail using alternative experimental techniques.</p>
         <fig id="F1">
            <title>
               <p>Figure 1</p>
            </title>
            <caption>
               <p>Log<sub>2 </sub>intensities for the maximally hybridising oligonucleotide at each tiled position are shown for the reference and a target cultivar together with the known and predicted polymorphisms for the target cultivar (based on data from the <it>Arabidopsis </it>project <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr></abbrgrp>)</p>
            </caption>
            <text>
               <p>Log<sub>2 </sub>intensities for the maximally hybridising oligonucleotide at each tiled position are shown for the reference and a target cultivar together with the known and predicted polymorphisms for the target cultivar (based on data from the <it>Arabidopsis </it>project <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr></abbrgrp>). Most of the 21 known polymorphisms in the target cultivar could not be predicted with the SNP calling methods.</p>
            </text>
            <graphic file="1471-2105-9-S10-O8-1"/>
         </fig>
         <p>For training and evaluation we compiled a set of reference polymorphisms obtained by dideoxy sequencing of more than 3,500 fragments from the 20 cultivars.</p>
      </sec>
      <sec>
         <st>
            <p>Results</p>
         </st>
         <p>Across all cultivars, we discovered 1,349,341 SNPs with the machine learning (ML) method at 316,373 non-redundant positions. In comparison to a model-based (MB) SNP calling approach implemented by Perlegen Sciences, Inc. <abbrgrp><abbr bid="B4">4</abbr></abbrgrp>, the ML method was found to be much more sensitive by recovering 20.9% of all known SNPs at a precision of 91.7%, compared to 14.4% and 90.9%, respectively, for the MB approach (cf. Figure <figr fid="F2">2A</figr>). The intersection of MB and ML predictions contained 761,606 SNP predictions at 159,879 non-repetitive positions, constituting a set of markedly higher quality with a precision of 97.1%.</p>
         <fig id="F2">
            <title>
               <p>Figure 2</p>
            </title>
            <caption>
               <p><b>A</b>. Comparison of the proposed SNP prediction (ML) method based on array intensities and additional information with a previously proposed one (MB)</p>
            </caption>
            <text>
               <p><b>A</b>. Comparison of the proposed SNP prediction (ML) method based on array intensities and additional information with a previously proposed one (MB). <b>B</b>. Accuracy of the polymorphic region predictions using the machine learning based segmentation algorithm.</p>
            </text>
            <graphic file="1471-2105-9-S10-O8-2"/>
         </fig>
         <p>In addition to SNP predictions, our polymorphic region predictor discovered a substantial additional proportion of polymorphic regions, resulting in between ~65,000 and ~203,000 polymorphic regions per cultivar (cf. Figure <figr fid="F2">2B</figr>).</p>
      </sec>
      <sec>
         <st>
            <p>Conclusion</p>
         </st>
         <p>We identified hundreds of thousands of polymorphisms on a genome-wide scale, providing the first whole-genome set of polymorphisms for the world's most important crop plant. This polymorphism data represents a valuable resource for further functional studies and modern breeding of rice.</p>
         <p>Based on the SNP data, high-density genotyping arrays will be designed to investigate genomic variation in many more rice cultivars. The polymorphic region (PR) predictions will be helpful, for example, to constrain primer design to conserved regions and thus increase PCR success rates.</p>
      </sec>
   </bdy>
   <bm>
      <refgrp>
         <bibl id="B1">
            <title>
               <p>Common Sequence Polymorphisms Shaping Genetic Diversity in <it>Arabidopsis thaliana</it></p>
            </title>
            <aug>
               <au>
                  <snm>Clark</snm>
                  <fnm>RM</fnm>
               </au>
               <au>
                  <snm>Schweikert</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Toomajian</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Ossowski</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Zeller</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Shinn</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Warthmann</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Hu</snm>
                  <fnm>TT</fnm>
               </au>
               <au>
                  <snm>Fu</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Hinds</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Frazer</snm>
                  <fnm>KA</fnm>
               </au>
               <au>
                  <snm>Huson</snm>
                  <fnm>DH</fnm>
               </au>
               <au>
                  <snm>Sch&#246;lkopf</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Nordborg</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>R&#228;tsch</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Ecker</snm>
                  <fnm>JR</fnm>
               </au>
               <au>
                  <snm>Weigel</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2007</pubdate>
            <volume>317</volume>
            <fpage>338</fpage>
            <lpage>42</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1138632</pubid>
                  <pubid idtype="pmpid" link="fulltext">17641193</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B2">
            <title>
               <p>Detecting Polymorphic Regions in the <it>Arabidopsis thaliana </it>Genome with Resequencing Microarrays</p>
            </title>
            <aug>
               <au>
                  <snm>Zeller</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Clark</snm>
                  <fnm>RM</fnm>
               </au>
               <au>
                  <snm>Schneeberger</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Bohlen</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Weigel</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>R&#228;tsch</snm>
                  <fnm>G</fnm>
               </au>
            </aug>
            <source>Genome Research</source>
            <pubdate>2008</pubdate>
            <volume>18</volume>
            <fpage>918</fpage>
            <lpage>29</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2413159</pubid>
                  <pubid idtype="pmpid" link="fulltext">18323538</pubid>
                  <pubid idtype="doi">10.1101/gr.070169.107</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B3">
            <title>
               <p>Large Margin Methods for Structured and Interdependent Output Variables</p>
            </title>
            <aug>
               <au>
                  <snm>Tsochantaridis</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Joachims</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Hofmann</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Altun</snm>
                  <fnm>Y</fnm>
               </au>
            </aug>
            <source>Journal of Machine Learning Research</source>
            <pubdate>2005</pubdate>
            <volume>6</volume>
            <fpage>1453</fpage>
            <lpage>1484</lpage>
         </bibl>
         <bibl id="B4">
            <title>
               <p>Whole-genome Patterns of Common DNA Variation in Three Human Populations</p>
            </title>
            <aug>
               <au>
                  <snm>Hinds</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Stuve</snm>
                  <fnm>LL</fnm>
               </au>
               <au>
                  <snm>Nilsen</snm>
                  <fnm>GB</fnm>
               </au>
               <au>
                  <snm>Halperin</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Eskin</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Ballinger</snm>
                  <fnm>DG</fnm>
               </au>
               <au>
                  <snm>Frazer</snm>
                  <fnm>KA</fnm>
               </au>
               <au>
                  <snm>Cox</snm>
                  <fnm>DR</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2005</pubdate>
            <volume>307</volume>
            <fpage>1072</fpage>
            <lpage>9</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1105436</pubid>
                  <pubid idtype="pmpid" link="fulltext">15718463</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
      </refgrp>
   </bm>
</art>

