Archive Ensembl HomeArchive Ensembl Home
CAFESpeciesTree.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::CAFESpeciesTree
00024 
00025 =head1 SYNOPSIS
00026 
00027 =head1 DESCRIPTION
00028 
00029 This RunnableDB builds a CAFE-compliant species tree (binary & ultrametric with time units).
00030 
00031 =head1 INHERITANCE TREE
00032 
00033 Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00034 
00035 =head1 APPENDIX
00036 
00037 The rest of the documentation details each of the object methods.
00038 Internal methods are usually preceded with an underscore (_)
00039 
00040 =cut
00041 
00042 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::CAFESpeciesTree;
00043 
00044 use strict;
00045 use Data::Dumper;
00046 use Scalar::Util qw(looks_like_number);
00047 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00048 
00049 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00050 
00051 sub param_defaults {
00052     return {
00053             'tree_fmt' => '%{n}%{":"d}',
00054            };
00055 }
00056 
00057 
00058 =head2 fetch_input
00059 
00060     Title     : fetch_input
00061     Usage     : $self->fetch_input
00062     Function  : Fetches input data from database
00063     Returns   : none
00064     Args      : none
00065 
00066 =cut
00067 
00068 sub fetch_input {
00069     my ($self) = @_;
00070 
00071     my $genomeDB_Adaptor = $self->compara_dba->get_GenomeDBAdaptor();
00072     $self->param('genomeDB_Adaptor', $genomeDB_Adaptor);
00073 
00074     my $NCBItaxon_Adaptor = $self->compara_dba->get_NCBITaxon(); # Adaptor??
00075     $self->param('NCBItaxon_Adaptor', $NCBItaxon_Adaptor);
00076 
00077     my $species_tree_meta_key = $self->param('species_tree_meta_key');
00078 
00079     my $full_species_tree = $self->get_species_tree_string($species_tree_meta_key);
00080     $self->param('full_species_tree', $full_species_tree);
00081 
00082     $self->param('tree_fmt', '%{n}%{":"d}'); # format for the tree
00083 
00084     my $cafe_species = $self->param('cafe_species');
00085     if (scalar $cafe_species == 0) {  # No species for the tree. Make a full tree
00086 #        die "No species for the CAFE tree";
00087         print STDERR "No species provided for the CAFE tree. I will take them all\n" if ($self->debug());
00088     }
00089 
00090     return;
00091 }
00092 
00093 sub run {
00094     my ($self) = @_;
00095     my $species_tree_string = $self->param('full_species_tree');
00096     my $species = $self->param('cafe_species');
00097     my $fmt = $self->param('tree_fmt');
00098     print Dumper $species if ($self->debug());
00099     my $eval_species_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00100 
00101     $self->include_distance_to_parent($eval_species_tree);
00102     $self->ensembl_timetree_mya_to_distance_to_parent($eval_species_tree);
00103     $self->include_names($eval_species_tree);
00104     $self->ultrametrize($eval_species_tree);
00105     my $binTree = $self->binarize($eval_species_tree);
00106     $self->fix_zeros($binTree);
00107     my $cafeTree;
00108     if (defined $species) {
00109         $cafeTree = $self->prune_tree($binTree, $species);
00110     } else {
00111         $cafeTree = $binTree;
00112     }
00113 #     if ($self->debug) {
00114 #         $self->check_tree($cafeTree);
00115 #     }
00116     my $cafeTreeStr = $cafeTree->newick_format('ryo', $fmt);
00117 
00118     my $cafe_tree_string_meta_key = 'cafe_tree_string';
00119     print STDERR "$cafeTreeStr\n" if ($self->debug());
00120     print STDERR "cafe_tree_string_meta_key => $cafe_tree_string_meta_key\n" if ($self->debug());
00121     my $sql = "INSERT into meta (meta_key, meta_value) values (?,?);";
00122     my $sth = $self->compara_dba->dbc->prepare($sql);
00123     $sth->execute($cafe_tree_string_meta_key, $cafeTreeStr);
00124     $sth->finish();
00125 
00126     $self->param('cafe_tree_string_meta_key', $cafe_tree_string_meta_key);
00127 }
00128 
00129 sub write_output {
00130     my ($self) = @_;
00131     my $cafe_tree_string = $self->param('cafe_tree_string');
00132     # To CAFETable
00133     $self->dataflow_output_id (
00134                                {
00135                                 'cafe_tree_string', $self->param('cafe_tree_string'),
00136                                }, 1
00137                               );
00138 }
00139 
00140 
00141 #############################
00142 ## Internal methods #########
00143 #############################
00144 
00145 sub get_species_tree_string {
00146     my ($self, $species_tree_meta_key) = @_;
00147 
00148     my $table_name = 'meta';
00149     my $table_key = 'meta_key';
00150     my $table_column = 'meta_value';
00151     my $table_value = $species_tree_meta_key;
00152 
00153     my $sth = $self->dbc->prepare( "select $table_column from $table_name where $table_key=?" );
00154     $sth->execute($table_value);
00155     my ($species_tree_string) = $sth->fetchrow_array;
00156     $sth->finish;
00157     return $species_tree_string;
00158 }
00159 
00160 
00161 # Not used for now
00162 sub get_tree_from_db {
00163     my ($self) = @_;
00164     my $sql1 = "select value from gene_tree_root_tag where tag='CAFE_species_tree_string'";
00165     my $sth1 = $self->compara_dba->dbc->prepare($sql1);
00166     $sth1->execute();
00167     my $species_tree_string = $sth1->fetchrow_hashref;
00168     $sth1->finish;
00169 
00170     $self->param('species_tree', $species_tree_string->{value});
00171     return $species_tree_string->{value};
00172 }
00173 
00174 sub get_taxon_id_from_dbID {
00175     my ($self, $dbID) = @_;
00176     my $genomeDB_Adaptor = $self->param('genomeDB_Adaptor');
00177     my $genomeDB = $genomeDB_Adaptor->fetch_by_dbID($dbID);
00178     return $genomeDB->taxon_id();
00179 }
00180 
00181 
00182 sub is_in {
00183     my ($item, $arref) = @_;
00184     for my $elem (@$arref) {
00185         if ($item eq $elem) {
00186             return 1;
00187         }
00188     }
00189     return 0;
00190 }
00191 
00192 sub include_distance_to_parent {
00193     my ($self, $tree) = @_;
00194     my $NCBItaxon_Adaptor = $self->param('NCBItaxon_Adaptor');
00195 
00196     my $nodes = $tree->get_all_nodes();
00197     for my $node (@$nodes) {
00198         unless ($node->is_leaf) {
00199             my $taxon_id = $node->name();
00200             my $ncbiTaxon = $NCBItaxon_Adaptor->fetch_node_by_taxon_id($taxon_id);
00201             my $mya = $ncbiTaxon->get_tagvalue('ensembl timetree mya');
00202             for my $child (@{$node->children}) {
00203                 if ($mya) {
00204                     $child->distance_to_parent(int($mya));
00205                 } else {
00206                     print STDERR "++ taxon_id " . $child->name() . " doesn't have 'ensembl timetree mya' tag (defaulting to 0)\n" if ($self->debug);
00207                     $child->distance_to_parent(0);
00208                 }
00209             }
00210         }
00211     }
00212 }
00213 
00214 sub fix_ensembl_timetree_mya {
00215     my ($self, $tree) = @_;
00216     my $leaves = $tree->get_all_leaves();
00217     for my $leaf (@$leaves) {
00218         fix_path($leaf);
00219     }
00220 }
00221 
00222 sub fix_path {
00223     my ($node) = @_;
00224     for (;;) {
00225         if ($node->has_parent()) {
00226             if ($node->parent->distance_to_parent() == 0) {
00227                 $node = $node->parent;
00228                 next;
00229             }
00230             if ($node->parent()->distance_to_parent() < $node->distance_to_parent()) {
00231                 $node->distance_to_parent($node->parent()->distance_to_parent());
00232             }
00233         } else {
00234             return
00235         }
00236         $node = $node->parent();
00237     }
00238 }
00239 
00240 sub ensembl_timetree_mya_to_distance_to_parent {
00241     my ($self, $tree) = @_;
00242     my $leaves = $tree->get_all_leaves();
00243     for my $leaf (@$leaves) {
00244         mya_to_dtp_1path($leaf);
00245     }
00246 }
00247 
00248 sub mya_to_dtp_1path {
00249     my ($node) = @_;
00250     my $d = 0;
00251     for (;;) {
00252         my $dtp = 0;
00253         if ($node->get_tagvalue('revised') eq 1) {
00254             if ($node->has_parent()) {
00255                 $node = $node->parent();
00256                 next;
00257             } else {
00258                 return;
00259             }
00260         }
00261         if ($node->distance_to_parent != 0) {
00262             $dtp = $node->distance_to_parent - $d;
00263         }
00264         $node->distance_to_parent($dtp);
00265         $node->add_tag("revised", "1");
00266         $d += $dtp;
00267         if ($node->has_parent()) {
00268             $node = $node->parent();
00269         } else {
00270             return;
00271         }
00272     }
00273 }
00274 
00275 sub include_names {
00276     my ($self, $tree) = @_;
00277     my $genomeDB_Adaptor = $self->param('genomeDB_Adaptor');
00278     my $leaves = $tree->get_all_leaves();
00279     for my $leaf ( @$leaves ) {
00280         my $taxon_id = $leaf->name();
00281         $taxon_id =~ s/\*//g;
00282         my $genomeDB = $genomeDB_Adaptor->fetch_by_taxon_id($taxon_id);
00283         my $name = $genomeDB->name();
00284         $name =~ s/_/\./;
00285         $leaf->name($name);
00286     }
00287 }
00288 
00289 sub ultrametrize {
00290     my ($self, $tree) = @_;
00291     my $longest_path = get_longest_path($tree);
00292     my $leaves = $tree->get_all_leaves();
00293     for my $leaf (@$leaves) {
00294         my $path = path_length($leaf);
00295         $leaf->distance_to_parent($leaf->distance_to_parent() + ($longest_path-$path));
00296     }
00297 }
00298 
00299 sub get_longest_path {
00300     my ($tree) = @_;
00301     my $leaves = $tree->get_all_leaves();
00302     my @paths;
00303     my $longest = -1;
00304     for my $leaf(@$leaves) {
00305         my $newpath = path_length($leaf);
00306         if ($newpath > $longest) {
00307             $longest = $newpath;
00308         }
00309     }
00310     return $longest;
00311 }
00312 
00313 sub binarize {
00314     my ($self, $orig_tree) = @_;
00315     my $newTree = Bio::EnsEMBL::Compara::NestedSet->new();
00316     $newTree->name('root');
00317     $newTree->node_id('0');
00318     _binarize($orig_tree, $newTree);
00319     return $newTree;
00320 }
00321 
00322 sub _binarize {
00323     my ($origTree, $binTree) = @_;
00324     my $children = $origTree->children();
00325     for my $child (@$children) {
00326         my $newNode = Bio::EnsEMBL::Compara::NestedSet->new();
00327         $newNode->name($child->name());
00328         $newNode->node_id($child->node_id());
00329         $newNode->distance_to_parent($child->distance_to_parent()); # no parent!!
00330         if (scalar @{$binTree->children()} > 1) {
00331             $child->disavow_parent();
00332             my $newBranch = Bio::EnsEMBL::Compara::NestedSet->new();
00333             for my $c (@{$binTree->children()}) {
00334                 $c->distance_to_parent(0);
00335                 $newBranch->add_child($c);
00336             }
00337             $binTree->add_child($newBranch);
00338             $newBranch->name($newBranch->parent()->name());
00339         }
00340         $binTree->add_child($newNode);
00341         _binarize($child, $newNode);
00342     }
00343 }
00344 
00345 sub fix_zeros {
00346     my ($self, $tree) = @_;
00347     my $leaves = $tree->get_all_leaves();
00348     for my $leaf (@$leaves) {
00349         fix_zeros_1($leaf);
00350     }
00351 }
00352 
00353 sub fix_zeros_1 {
00354     my ($node) = @_;
00355     my $to_add = 0;
00356     for (;;) {
00357         return unless ($node->has_parent());
00358         my $dtp = $node->distance_to_parent();
00359         if ($dtp == 0) {
00360             $to_add++;
00361             $node->distance_to_parent(1);
00362         }
00363         my $siblings = siblings($node);
00364         die "too many siblings" if (scalar @$siblings > 1);
00365         $siblings->[0]->distance_to_parent($siblings->[0]->distance_to_parent() + $to_add);
00366         $node = $node->parent();
00367     }
00368 }
00369 
00370 sub prune_tree {
00371     my ($self, $tree, $species_to_keep) = @_;
00372     my $leaves = $tree->get_all_leaves();
00373     my %species_to_remove;
00374     for my $leaf (@$leaves) {
00375         my $name = $leaf->name();
00376         $species_to_remove{$name} = 1;
00377     }
00378     for my $sp (@$species_to_keep) {
00379         delete $species_to_remove{$sp};
00380     }
00381     my $newTree = remove_nodes($tree, [keys %species_to_remove]);
00382     return $newTree;
00383 }
00384 
00385 sub remove_nodes {
00386     my ($tree, $nodes) = @_;
00387     my $leaves = $tree->get_all_leaves();
00388     for my $node (@$leaves) {
00389         if (is_in($node->name, $nodes)) {
00390             if ($node->has_parent()) {
00391                 my $parent = $node->parent();
00392                 my $siblings = siblings($node);
00393                 if (scalar @$siblings > 1) {
00394                     die "The tree is not binary";
00395                 }
00396                 $node->disavow_parent();
00397                 if ($parent->has_parent) {
00398                     my $grandpa = $parent->parent();
00399                     my $dtg = $parent->distance_to_parent();
00400                     $parent->disavow_parent();
00401                     my $newsdtp = $siblings->[0]->distance_to_parent() + $dtg;
00402                     $grandpa->add_child($siblings->[0], $newsdtp);
00403                 } else {
00404                     $siblings->[0]->disavow_parent();
00405                     $tree=$siblings->[0];
00406                 }
00407             }
00408         }
00409     }
00410     return $tree;
00411 }
00412 
00413 sub siblings {
00414     my ($node) = @_;
00415     return undef unless ($node->has_parent());
00416     my $parent = $node->parent();
00417     my $children = $parent->children();
00418     my @siblings = ();
00419     for my $child (@$children) {
00420         if ($child != $node) {
00421             push @siblings, $child;
00422         }
00423     }
00424     return [@siblings];
00425 }
00426 
00427 sub check_tree {
00428   my ($self, $tree) = @_;
00429   if (is_ultrametric($tree)) {
00430       if ($self->debug()) {
00431           print STDERR "The tree is ultrametric\n";
00432       }
00433   } else {
00434       die "The tree is NOT ultrametric\n";
00435   }
00436 
00437   eval (is_binary($tree));
00438   if ($@) {
00439     die $@;
00440   } else {
00441       if ($self->debug()) {
00442           print STDERR "The tree is binary\n";
00443       }
00444   }
00445 }
00446 
00447 sub is_binary {
00448   my ($node) = @_;
00449   if ($node->is_leaf()) {
00450     return 0
00451   }
00452   my $children = $node->children();
00453   if (scalar @$children != 2) {
00454     my $name = $node->name();
00455     die "Not binary in node $name\n";
00456   }
00457   for my $child (@$children) {
00458     is_binary($child);
00459   }
00460 }
00461 
00462 sub is_ultrametric {
00463   my ($tree) = @_;
00464   my $leaves = $tree->get_all_leaves();
00465   my $path = -1;
00466   for my $leaf (@$leaves) {
00467     my $newpath = path_length($leaf);
00468     if ($path == -1) {
00469       $path = $newpath;
00470       next;
00471     }
00472     if ($path == $newpath) {
00473       $path = $newpath;
00474     } else {
00475       return 0
00476     }
00477   }
00478   return 1
00479 }
00480 
00481 sub path_length {
00482   my ($node) = @_;
00483   my $d = 0;
00484   for (;;){
00485     $d += $node->distance_to_parent();
00486     if ($node->has_parent()) {
00487       $node = $node->parent();
00488     } else {
00489       last;
00490     }
00491   }
00492   return $d;
00493 }
00494 
00495 1;