Archive Ensembl HomeArchive Ensembl Home
NCTreeBestMMerge.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCTreeBestMMerge
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $treebest_mmerge = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCTreeBestMMerge->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $treebest_mmerge->fetch_input(); #reads from DB
00024 $treebest_mmerge->run();
00025 $treebest_mmerge->output();
00026 $treebest_mmerge->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take the sequences from a cluster, the cm from
00034 nc_profile and run a profiled alignment, storing the results as
00035 cigar_lines for each sequence.
00036 
00037 =cut
00038 
00039 
00040 =head1 CONTACT
00041 
00042   Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
00043   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00044 
00045 =cut
00046 
00047 
00048 =head1 APPENDIX
00049 
00050 The rest of the documentation details each of the object methods. 
00051 Internal methods are usually preceded with a _
00052 
00053 =cut
00054 
00055 
00056 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCTreeBestMMerge;
00057 
00058 use strict;
00059 use Bio::AlignIO;
00060 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00061 
00062 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00063 
00064 
00065 sub param_defaults {
00066     return {
00067         'clusterset_id'  => 1,
00068     };
00069 }
00070 
00071 
00072 =head2 fetch_input
00073 
00074     Title   :   fetch_input
00075     Usage   :   $self->fetch_input
00076     Function:   Fetches input data from the database
00077     Returns :   none
00078     Args    :   none
00079 
00080 =cut
00081 
00082 
00083 sub fetch_input {
00084   my( $self) = @_;
00085 
00086       # Fetch sequences:
00087   $self->param('nc_tree', $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($self->param('nc_tree_id')) );
00088 
00089   $self->load_input_trees;
00090 
00091   my $treebest_exe = $self->param('treebest_exe')
00092           or die "'treebest_exe' is an obligatory parameter";
00093                   
00094   die "Cannot execute '$treebest_exe'" unless(-x $treebest_exe);
00095 }
00096 
00097 
00098 =head2 run
00099 
00100     Title   :   run
00101     Usage   :   $self->run
00102     Function:   runs something
00103     Returns :   none
00104     Args    :   none
00105 
00106 =cut
00107 
00108 sub run {
00109   my $self = shift;
00110 
00111   if (defined($self->param('inputtrees_unrooted'))) {
00112     $self->reroot_inputtrees;
00113     $self->run_treebest_mmerge;
00114     $self->calculate_branch_lengths;
00115   }
00116 }
00117 
00118 
00119 =head2 write_output
00120 
00121     Title   :   write_output
00122     Usage   :   $self->write_output
00123     Function:   stores something
00124     Returns :   none
00125     Args    :   none
00126 
00127 =cut
00128 
00129 
00130 sub write_output {
00131   my $self = shift;
00132 
00133   $self->store_nctree if (defined($self->param('inputtrees_unrooted')));
00134 }
00135 
00136 sub DESTROY {
00137   my $self = shift;
00138 
00139   if($self->param('nc_tree')) {
00140     printf("NctreeBestMMerge::DESTROY  releasing tree\n") if($self->debug);
00141     $self->param('nc_tree')->release_tree;
00142     $self->param('nc_tree', undef);
00143   }
00144 
00145   $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
00146 }
00147 
00148 
00149 ##########################################
00150 #
00151 # internal methods
00152 #
00153 ##########################################
00154 
00155 sub get_species_tree_file {
00156     my $self = shift @_;
00157 
00158     unless( $self->param('species_tree_file') ) {
00159 
00160         unless( $self->param('species_tree_string') ) {
00161 
00162             my $tag_table_name = 'gene_tree_root_tag';
00163 
00164             my $sth = $self->dbc->prepare( "select value from $tag_table_name where tag='species_tree_string'" );
00165             $sth->execute;
00166             my ($species_tree_string) = $sth->fetchrow_array;
00167             $sth->finish;
00168 
00169             $self->param('species_tree_string', $species_tree_string)
00170                 or die "Could not fetch 'species_tree_string' from $tag_table_name";
00171         }
00172 
00173         my $species_tree_string = $self->param('species_tree_string');
00174         eval {
00175             my $eval_species_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00176             my @leaves = @{$eval_species_tree->get_all_leaves};
00177         };
00178         if($@) {
00179             die "Error parsing species tree from the string '$species_tree_string'";
00180         }
00181 
00182             # store the string in a local file:
00183         my $species_tree_file = $self->worker_temp_directory . "spec_tax.nh";
00184         open SPECIESTREE, ">$species_tree_file" or die "Could not open '$species_tree_file' for writing : $!";
00185         print SPECIESTREE $species_tree_string;
00186         close SPECIESTREE;
00187         $self->param('species_tree_file', $species_tree_file);
00188     }
00189     return $self->param('species_tree_file');
00190 }
00191 
00192 sub run_treebest_mmerge {
00193   my $self = shift;
00194 
00195   my $root_id = $self->param('nc_tree')->node_id;
00196   my $species_tree_file = $self->get_species_tree_file();
00197   my $treebest_exe = $self->param('treebest_exe');
00198   my $temp_directory = $self->worker_temp_directory;
00199 
00200   my $mmergefilename = $temp_directory . $root_id . ".mmerge";
00201   my $mmerge_output_filename = $mmergefilename . ".output";
00202   open FILE,">$mmergefilename" or die $!;
00203   foreach my $method (keys %{$self->param('inputtrees_rooted')}) {
00204     my $inputtree = $self->param('inputtrees_rooted')->{$method};
00205     print FILE "$inputtree\n";
00206   }
00207   close FILE;
00208 
00209   my $cmd = "$treebest_exe mmerge -s $species_tree_file $mmergefilename > $mmerge_output_filename";
00210   print("$cmd\n") if($self->debug);
00211   $DB::single=1;1;#??
00212   unless(system("$cmd") == 0) {
00213     print("$cmd\n");
00214     $self->throw("error running treebest mmerge, $!\n");
00215   }
00216 
00217   $self->param('mmerge_output', $mmerge_output_filename);
00218 
00219   return 1;
00220 }
00221 
00222 sub calculate_branch_lengths {
00223   my $self = shift;
00224 
00225   $self->param('input_aln', $self->dumpTreeMultipleAlignmentToWorkdir($self->param('nc_tree')) );
00226 
00227   my $leafcount = scalar(@{$self->param('nc_tree')->get_all_leaves});
00228   if($leafcount<3) {
00229     printf(STDERR "tree cluster %d has <3 genes - can not build a tree\n", 
00230            $self->param('nc_tree')->node_id);
00231     $self->param('mmerge_blengths_output', $self->param('mmerge_output'));
00232     $self->parse_newick_into_nctree;
00233     return;
00234   }
00235 
00236   my $treebest_exe = $self->param('treebest_exe');
00237   my $constrained_tree = $self->param('mmerge_output');
00238   my $tree_with_blengths = $self->param('mmerge_output') . ".blengths.nh";
00239   my $input_aln = $self->param('input_aln');
00240   my $species_tree_file = $self->get_species_tree_file();
00241   my $cmd = $treebest_exe;
00242   $cmd .= " nj";
00243   if ($treebest_exe =~ /tracking/) {
00244       $cmd .= " -I";
00245   }
00246   $cmd .= " -c $constrained_tree";
00247   $cmd .= " -s $species_tree_file";
00248   $cmd .= " $input_aln";
00249   $cmd .= " > $tree_with_blengths";
00250 #  my $cmd = "$treebest_exe nj -c $constrained_tree -s $species_tree_file $input_aln > $tree_with_blengths";
00251   print STDERR +("$cmd\n") if($self->debug);
00252 
00253   unless(system("$cmd") == 0) {
00254     print("$cmd\n");
00255     $self->throw("error running treebest nj, $!\n");
00256   }
00257 
00258   $self->param('mmerge_blengths_output', $tree_with_blengths);
00259 
00260   #parse the tree into the datastucture
00261   $self->parse_newick_into_nctree;
00262   return 1;
00263 }
00264 
00265 sub reroot_inputtrees {
00266   my $self = shift;
00267 
00268   my $root_id = $self->param('nc_tree')->node_id;
00269   my $species_tree_file = $self->get_species_tree_file;
00270   my $treebest_exe = $self->param('treebest_exe');
00271 
00272   my $temp_directory = $self->worker_temp_directory;
00273   my $template_cmd = "$treebest_exe sdi -rs $species_tree_file";
00274 
00275   foreach my $method (keys %{$self->param('inputtrees_unrooted')}) {
00276     my $cmd = $template_cmd;
00277     my $unrootedfilename = $temp_directory . $root_id . "." . $method . ".unrooted";
00278     my $rootedfilename = $temp_directory . $root_id . "." . $method . ".rooted";
00279     my $inputtree = $self->param('inputtrees_unrooted')->{$method};
00280     open FILE,">$unrootedfilename" or die $!;
00281     print FILE $inputtree;
00282     close FILE;
00283 
00284     $cmd .= " $unrootedfilename";
00285     $cmd .= " > $rootedfilename";
00286 
00287     print("$cmd\n") if($self->debug);
00288     $DB::single=1;1;
00289     unless(system("$cmd") == 0) {
00290       print("$cmd\n");
00291       $self->throw("error running treebest sdi, $!\n");
00292     }
00293 
00294     # Parse the rooted tree string
00295     my $rootedstring;
00296     open (FH, $rootedfilename) or $self->throw("Couldnt open rooted file [$rootedfilename]");
00297     while(<FH>) {
00298       chomp $_;
00299       $rootedstring .= $_;
00300     }
00301     close(FH);
00302 
00303       # manual vivification needed:
00304     unless($self->param('inputtrees_rooted')) {
00305         $self->param('inputtrees_rooted', {});
00306     }
00307     $self->param('inputtrees_rooted')->{$method} = $rootedstring;
00308   }
00309 
00310   return 1;
00311 }
00312 
00313 sub load_input_trees {
00314   my $self = shift;
00315   my $tree = $self->param('nc_tree')->tree;
00316 
00317   foreach my $tag ($tree->get_all_tags) {
00318     next unless $tag =~ m/_it_/;
00319     my $inputtree_string = $tree->get_value_for_tag($tag);
00320 
00321     my $eval_inputtree;
00322     eval {
00323       $eval_inputtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($inputtree_string);
00324       my @leaves = @{$eval_inputtree->get_all_leaves};
00325     };
00326     unless ($@) {
00327         # manual vivification needed:
00328       unless($self->param('inputtrees_unrooted')) {
00329           $self->param('inputtrees_unrooted', {});
00330       }
00331 
00332       $self->param('inputtrees_unrooted')->{$tag} = $inputtree_string;
00333     }
00334   }
00335 
00336   return 1;
00337 }
00338 
00339 
00340 ########################################################
00341 #
00342 # GeneTree input/output section
00343 #
00344 ########################################################
00345 
00346 sub dumpTreeMultipleAlignmentToWorkdir {
00347   my $self = shift;
00348   my $nc_tree = shift;
00349 
00350   $self->param('file_root', $self->worker_temp_directory. "nctree_". $nc_tree->node_id);
00351 
00352   my $aln_file = $self->param('file_root') . ".aln";
00353   return $aln_file if(-e $aln_file);
00354   my $leafcount = scalar(@{$nc_tree->get_all_leaves});
00355   if($self->debug) {
00356     printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $leafcount);
00357     print("aln_file = '$aln_file'\n");
00358   }
00359 
00360   open(OUTSEQ, ">$aln_file")
00361     or $self->throw("Error opening $aln_file for write");
00362 
00363   # Using append_taxon_id will give nice seqnames_taxonids needed for
00364   # njtree species_tree matching
00365   my %sa_params = $self->param('use_genomedb_id') ? ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00366 
00367   my $sa = $nc_tree->get_SimpleAlign
00368     (
00369      -id_type => 'MEMBER',
00370      %sa_params,
00371     );
00372   $sa->set_displayname_flat(1);
00373 
00374   my $alignIO = Bio::AlignIO->newFh
00375     (
00376      -fh => \*OUTSEQ,
00377      -format => "fasta"
00378     );
00379   print $alignIO $sa;
00380 
00381   close OUTSEQ;
00382 
00383   $self->param('input_aln', $aln_file);
00384 
00385   return $aln_file;
00386 }
00387 
00388 sub store_nctree
00389 {
00390     my $self = shift;
00391 
00392     my $tree = $self->param('nc_tree') or return;
00393     my $tree_adaptor = $self->compara_dba->get_NCTreeAdaptor;
00394 
00395     printf("NCTreeBestMMerge::store_nctree\n") if($self->debug);
00396 
00397     $tree->build_leftright_indexing(1);
00398     $tree_adaptor->store($tree);
00399     $tree_adaptor->delete_nodes_not_in_tree($tree);
00400 
00401     if($self->debug >1) {
00402         print("done storing - now print\n");
00403         $tree->print_tree;
00404     }
00405 
00406     $self->store_tags($tree);
00407 
00408     $self->_store_tree_tags;
00409 
00410 }
00411 
00412 sub store_tags
00413 {
00414     my $self = shift;
00415     my $node = shift;
00416 
00417     if (not $node->is_leaf) {
00418         my $node_type;
00419         if ($node->has_tag('node_type')) {
00420             $node_type = $node->get_tagvalue('node_type');
00421         } elsif ($node->get_tagvalue("DD", 0)) {
00422             $node_type = 'dubious';
00423         } elsif ($node->get_tagvalue('Duplication', '') eq '1') {
00424             $node_type = 'duplication';
00425         } else {
00426             $node_type = 'speciation';
00427         }
00428         $node->store_tag('node_type', $node_type);
00429         if ($self->debug) {
00430             print "store node_type: $node_type"; $node->print_node;
00431         }
00432     }
00433 
00434     if ($node->has_tag("E")) {
00435         my $n_lost = $node->get_tagvalue("E");
00436         $n_lost =~ s/.{2}//;        # get rid of the initial $-
00437         my @lost_taxa = split('-', $n_lost);
00438         foreach my $taxon (@lost_taxa) {
00439             if ($self->debug) {
00440                 printf("store lost_taxon_id : $taxon "); $node->print_node;
00441             }
00442             $node->store_tag('lost_taxon_id', $taxon, 1);
00443         }
00444     }
00445 
00446     my %mapped_tags = ('B' => 'bootstrap', 'SIS' => 'species_intersection_score', 'T' => 'tree_support');
00447     foreach my $tag (keys %mapped_tags) {
00448         if ($node->has_tag($tag)) {
00449             my $value = $node->get_tagvalue($tag);
00450             my $db_tag = $mapped_tags{$tag};
00451             # Because the duplication_confidence_score won't be computed for dubious nodes
00452             $db_tag = 'duplication_confidence_score' if ($node->get_tagvalue('node_type') eq 'dubious' and $tag eq 'SIS');
00453             $node->store_tag($db_tag, $value);
00454             if ($self->debug) {
00455                 printf("store $tag as $db_tag: $value"); $node->print_node;
00456             }
00457         }
00458     }
00459 
00460     foreach my $child (@{$node->children}) {
00461         $self->store_tags($child);
00462     }
00463 }
00464 
00465 sub parse_newick_into_nctree
00466 {
00467   my $self = shift;
00468   my $newick_file = $self->param('mmerge_blengths_output');
00469 
00470   my $tree = $self->param('nc_tree');
00471   
00472   #cleanup old tree structure- 
00473   #  flatten and reduce to only GeneTreeMember leaves
00474   $tree->flatten_tree;
00475   $tree->print_tree(20) if($self->debug);
00476   foreach my $node (@{$tree->get_all_leaves}) {
00477     next if($node->isa('Bio::EnsEMBL::Compara::GeneTreeMember'));
00478     $node->disavow_parent;
00479   }
00480 
00481   #parse newick into a new tree object structure
00482   my $newick = '';
00483   print("load from file $newick_file\n") if($self->debug);
00484   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00485   while(<FH>) { $newick .= $_;  }
00486   close(FH);
00487 
00488   my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick, "Bio::EnsEMBL::Compara::GeneTreeNode");
00489   $newtree->print_tree(20) if($self->debug > 1);
00490 
00491   # get rid of the taxon_id needed by njtree -- name tag
00492   foreach my $leaf (@{$newtree->get_all_leaves}) {
00493     my $njtree_phyml_name = $leaf->get_tagvalue('name');
00494     $njtree_phyml_name =~ /(\d+)\_\d+/;
00495     my $member_id = $1;
00496     $leaf->add_tag('name', $member_id);
00497   }
00498   $newtree->print_tree(20) if($self->debug > 1);
00499 
00500   # Leaves of newick tree are named with member_id of members from
00501   # input tree move members (leaves) of input tree into newick tree to
00502   # mirror the 'member_id' nodes
00503   foreach my $member (@{$tree->get_all_leaves}) {
00504     my $tmpnode = $newtree->find_node_by_name($member->member_id);
00505     if($tmpnode) {
00506       $member->Bio::EnsEMBL::Compara::AlignedMember::copy($tmpnode);
00507       bless $tmpnode, 'Bio::EnsEMBL::Compara::GeneTreeMember';
00508       $tmpnode->node_id($member->node_id);
00509       $tmpnode->adaptor($member->adaptor);
00510     } else {
00511       print("unable to find node in newick for member");
00512       $member->print_member;
00513     }
00514   }
00515 
00516   $newtree->node_id($tree->node_id);
00517   $newtree->adaptor($tree->adaptor);
00518   $newtree->tree($tree->tree);
00519   $self->param('nc_tree', $newtree);
00520   # to keep the link to the super-tree
00521   if ($tree->has_parent) {
00522       $tree->parent->add_child($newtree);
00523    }
00524 
00525   # Newick tree is now empty so release it
00526   $tree->release_tree;
00527 
00528   $newtree->print_tree if($self->debug);
00529   # check here on the leaf to test if they all are GeneTreeMembers as
00530   # minimize_tree/minimize_node might not work properly
00531   foreach my $leaf (@{$newtree->get_all_leaves}) {
00532     unless($leaf->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) {
00533       $self->throw("TreeBestMMerge tree does not have all leaves as GeneTreeMembers\n");
00534     }
00535   }
00536 }
00537 
00538 sub _store_tree_tags {
00539     my $self = shift;
00540     my $tree = $self->param('nc_tree');
00541     my $pta = $self->compara_dba->get_NCTreeAdaptor;
00542 
00543     print "Storing Tree tags...\n";
00544 
00545     my @leaves = @{$tree->get_all_leaves};
00546     my @nodes = @{$tree->get_all_nodes};
00547 
00548     # Tree number of leaves.
00549     my $tree_num_leaves = scalar(@leaves);
00550     $tree->tree->store_tag("tree_num_leaves",$tree_num_leaves);
00551 
00552     # Tree number of human peptides contained.
00553     my $num_hum_peps = 0;
00554     foreach my $leaf (@leaves) {
00555     $num_hum_peps++ if ($leaf->taxon_id == 9606);
00556     }
00557     $tree->tree->store_tag("tree_num_human_genes",$num_hum_peps);
00558 
00559     # Tree max root-to-tip distance.
00560     my $tree_max_length = $tree->max_distance;
00561     $tree->tree->store_tag("tree_max_length",$tree_max_length);
00562 
00563     # Tree max single branch length.
00564     my $tree_max_branch = 0;
00565     foreach my $node (@nodes) {
00566         my $dist = $node->distance_to_parent;
00567         $tree_max_branch = $dist if ($dist > $tree_max_branch);
00568     }
00569     $tree->tree->store_tag("tree_max_branch",$tree_max_branch);
00570 
00571     # Tree number of duplications and speciations.
00572     my $num_dups = 0;
00573     my $num_specs = 0;
00574     foreach my $node (@nodes) {
00575         my $node_type = $node->get_tagvalue("node_type");
00576         if ((defined $node_type) and ($node_type ne 'speciation')) {
00577             $num_dups++;
00578         } else {
00579             $num_specs++;
00580         }
00581     }
00582     $tree->tree->store_tag("tree_num_dup_nodes",$num_dups);
00583     $tree->tree->store_tag("tree_num_spec_nodes",$num_specs);
00584 
00585     print "Done storing stuff!\n" if ($self->debug);
00586 }
00587 
00588 1;