Archive Ensembl HomeArchive Ensembl Home
QuickTreeBreak.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009    http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::QuickTreeBreak
00022 
00023 =head1 DESCRIPTION
00024 
00025 This Analysis/RunnableDB is designed to take ProteinTree as input.
00026 
00027 This must already have a multiple alignment run on it. It uses that
00028 alignment as input into the QuickTree program which then generates a
00029 simple phylogenetic tree to be broken down into 2 pieces.
00030 
00031 Google QuickTree to get the latest tar.gz from the Sanger.
00032 Google sreformat to get the sequence reformatter that switches from fasta to stockholm.
00033 
00034 input_id/parameters format eg: "{'protein_tree_id'=>1234,'clusterset_id'=>1}"
00035     protein_tree_id : use 'id' to fetch a cluster from the ProteinTree
00036 
00037 =head1 SYNOPSIS
00038 
00039 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00040 my $quicktreebreak = Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::QuickTreeBreak->new
00041   (
00042    -db         => $db,
00043    -input_id   => $input_id,
00044    -analysis   => $analysis
00045   );
00046 $quicktreebreak->fetch_input(); #reads from DB
00047 $quicktreebreak->run();
00048 $quicktreebreak->output();
00049 $quicktreebreak->write_output(); #writes to DB
00050 
00051 =head1 AUTHORSHIP
00052 
00053 Ensembl Team. Individual contributions can be found in the CVS log.
00054 
00055 =head1 MAINTAINER
00056 
00057 $Author: mm14 $
00058 
00059 =head VERSION
00060 
00061 $Revision: 1.17 $
00062 
00063 =head1 APPENDIX
00064 
00065 The rest of the documentation details each of the object methods.
00066 Internal methods are usually preceded with an underscore (_)
00067 
00068 =cut
00069 
00070 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::QuickTreeBreak;
00071 
00072 use strict;
00073 
00074 use IO::File;
00075 use File::Basename;
00076 use Time::HiRes qw(time gettimeofday tv_interval);
00077 
00078 use Bio::AlignIO;
00079 use Bio::SimpleAlign;
00080 
00081 use Bio::EnsEMBL::Compara::Member;
00082 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00083 use Bio::EnsEMBL::Compara::GeneTree;
00084 use Bio::EnsEMBL::Compara::GeneTreeNode;
00085 
00086 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00087 
00088 
00089 =head2 fetch_input
00090 
00091     Title   :   fetch_input
00092     Usage   :   $self->fetch_input
00093     Function:   Fetches input data for repeatmasker from the database
00094     Returns :   none
00095     Args    :   none
00096 
00097 =cut
00098 
00099 
00100 sub fetch_input {
00101     my $self = shift @_;
00102 
00103     $self->check_if_exit_cleanly;
00104 
00105     my $protein_tree_id     = $self->param('protein_tree_id') or die "'protein_tree_id' is an obligatory parameter";
00106     my $protein_tree        = $self->compara_dba->get_ProteinTreeAdaptor->fetch_node_by_node_id( $protein_tree_id )
00107                                         or die "Could not fetch protein_tree with protein_tree_id='$protein_tree_id'";
00108     $self->param('protein_tree', $protein_tree);
00109 
00110     $self->param('mlss_id') or die "'mlss_id' is an obligatory parameter";
00111 }
00112 
00113 
00114 =head2 run
00115 
00116     Title   :   run
00117     Usage   :   $self->run
00118     Function:   runs NJTREE PHYML
00119     Returns :   none
00120     Args    :   none
00121 
00122 =cut
00123 
00124 
00125 sub run {
00126     my $self = shift @_;
00127 
00128     $self->check_if_exit_cleanly;
00129     $self->run_quicktreebreak;
00130 }
00131 
00132 
00133 =head2 write_output
00134 
00135     Title   :   write_output
00136     Usage   :   $self->write_output
00137     Function:   stores proteintree
00138     Returns :   none
00139     Args    :   none
00140 
00141 =cut
00142 
00143 
00144 sub write_output {
00145     my $self = shift @_;
00146 
00147     $self->check_if_exit_cleanly;
00148     $self->store_proteintrees;
00149 }
00150 
00151 sub release_tree {
00152     my $self       = shift @_;
00153     my $tree_param = shift @_;
00154 
00155     if(my $root = $self->param($tree_param)) {
00156         $root->release_tree;
00157         $self->param($tree_param, undef);
00158     }
00159 }
00160 
00161 sub DESTROY {
00162     my $self = shift;
00163 
00164     printf("QuickTreeBreak::DESTROY releasing trees\n") if($self->debug);
00165 
00166     $self->release_tree('protein_tree');
00167     $self->release_tree('max_subtree');
00168 
00169     $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
00170 }
00171 
00172 
00173 ##########################################
00174 #
00175 # internal methods
00176 #
00177 ##########################################
00178 
00179 
00180 sub run_quicktreebreak {
00181   my $self = shift;
00182 
00183   my $starttime = time()*1000;
00184 
00185   my $input_aln = $self->dumpTreeMultipleAlignmentToWorkdir ( $self->param('protein_tree') ) or return;
00186 
00187   my $quicktree_exe = $self->param('quicktree_exe')
00188         or die "'quicktree_exe' is an obligatory parameter";
00189 
00190   die "Cannot execute '$quicktree_exe'" unless(-x $quicktree_exe);
00191 
00192   my $cmd = $quicktree_exe;
00193   $cmd .= " -out t -in a";
00194   $cmd .= " ". $input_aln;
00195 
00196   $self->compara_dba->dbc->disconnect_when_inactive(1);
00197   print("$cmd\n") if($self->debug);
00198   open(RUN, "$cmd |") or die "Could not open pipe [$cmd |] for reading : $!";
00199   my @output = <RUN>;
00200   close(RUN) or die "Could not close pipe [$cmd |] : $!";
00201   $self->compara_dba->dbc->disconnect_when_inactive(0);
00202 
00203   my $quicktree_newick_string = '';
00204   foreach my $line (@output) {
00205     $line =~ s/\n//;
00206     $quicktree_newick_string .= $line;
00207   }
00208 
00209   #parse the tree into the datastucture
00210   $self->generate_subtrees( $quicktree_newick_string );
00211 
00212   my $runtime = time()*1000-$starttime;
00213   $self->param('protein_tree')->tree->store_tag('QuickTreeBreak_runtime_msec', $runtime);
00214 }
00215 
00216 
00217 ########################################################
00218 #
00219 # ProteinTree input/output section
00220 #
00221 ########################################################
00222 
00223 sub dumpTreeMultipleAlignmentToWorkdir {
00224   my $self = shift;
00225   my $protein_tree = shift;
00226 
00227   $self->param('original_leafcount', scalar(@{$protein_tree->get_all_leaves}) );
00228   if($self->param('original_leafcount')<3) {
00229     printf(STDERR "tree cluster %d has <3 proteins - can not build a tree\n", $protein_tree->node_id);
00230     return undef;
00231   }
00232 
00233   my $file_root = $self->worker_temp_directory. "proteintree_". $protein_tree->node_id;
00234   $file_root =~ s/\/\//\//g;  # converts any // in path to /
00235 
00236   my $aln_file = $file_root . '.aln';
00237   return $aln_file if(-e $aln_file);
00238   if($self->debug) {
00239     printf("dumpTreeMultipleAlignmentToWorkdir : %d members\n", $self->param('original_leafcount') );
00240     print("aln_file = '$aln_file'\n");
00241   }
00242 
00243   open(OUTSEQ, ">$aln_file") or die "Could not open '$aln_file' for writing : $!";
00244 
00245   # Using append_taxon_id will give nice seqnames_taxonids needed for
00246   # njtree species_tree matching
00247   my %sa_params = ($self->param('use_genomedb_id')) ? ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00248 
00249   my $sa = $protein_tree->get_SimpleAlign ( -id_type => 'MEMBER', %sa_params );
00250   $sa->set_displayname_flat(1);
00251   my $alignIO = Bio::AlignIO->newFh ( -fh => \*OUTSEQ, -format => "fasta" );
00252   print $alignIO $sa;
00253 
00254   close OUTSEQ;
00255 
00256   print STDERR "Using sreformat to change to stockholm format\n" if ($self->debug);
00257   my $stk_file = $file_root . '.stk';
00258   
00259   my $sreformat_exe = $self->param('sreformat_exe')
00260         or die "'sreformat_exe' is an obligatory parameter";
00261 
00262   die "Cannot execute '$sreformat_exe'" unless(-x $sreformat_exe);
00263   
00264   my $cmd = "$sreformat_exe stockholm $aln_file > $stk_file";
00265 
00266   if(system($cmd)) {
00267     die "Error running command [$cmd] : $!";
00268   }
00269 
00270   return $stk_file;
00271 }
00272 
00273 
00274 sub store_proteintrees {
00275   my $self = shift;
00276 
00277   $self->store_clusters;
00278 
00279   if($self->debug >1) {
00280     print("done storing\n");
00281   }
00282 
00283   return undef;
00284 }
00285 
00286 sub store_clusters {
00287   my $self = shift;
00288 
00289   my $protein_tree_adaptor = $self->compara_dba->get_ProteinTreeAdaptor;
00290   my $starttime = time();
00291 
00292   $protein_tree_adaptor->store($self->param('original_cluster'));
00293   $self->param('original_cluster')->root->store_tag('tree_support', 'quicktree');
00294 
00295   foreach my $cluster (@{$self->param('subclusters')}) {
00296     my $node_id = $cluster->root_id;
00297 
00298     #calc residue count total
00299     my $leafcount = scalar(@{$cluster->root->get_all_leaves});
00300     $cluster->store_tag('gene_count', $leafcount);
00301     print STDERR "Stored $node_id with $leafcount leaves\n" if ($self->debug);
00302 
00303     # Dataflow clusters
00304     # This will create a new MSA alignment job for each of the newly generated clusters
00305     my $output_id = sprintf("{'protein_tree_id'=>%d}", $node_id);
00306 
00307     $self->dataflow_output_id($output_id, 2);
00308     print STDERR "Created new cluster $node_id\n";
00309   }
00310 }
00311 
00312 
00313 sub generate_subtrees {
00314     my $self                    = shift @_;
00315     my $quicktree_newick_string = shift @_;
00316 
00317     my $mlss_id = $self->param('mlss_id');
00318     my $protein_tree = $self->param('protein_tree');
00319 
00320   #cleanup old tree structure- 
00321   #  flatten and reduce to only GeneTreeMember leaves
00322   $protein_tree->flatten_tree;
00323   $protein_tree->print_tree(20) if($self->debug);
00324 
00325   #parse newick into a new tree object structure
00326   my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($quicktree_newick_string);
00327   $newtree->print_tree(20) if($self->debug > 1);
00328   # get rid of the taxon_id needed by njtree -- name tag
00329   foreach my $leaf (@{$newtree->get_all_leaves}) {
00330     my $quicktreebreak_name = $leaf->get_tagvalue('name');
00331     $quicktreebreak_name =~ /(\d+)\_\d+/;
00332     my $member_name = $1;
00333     $leaf->add_tag('name', $member_name);
00334     bless $leaf, "Bio::EnsEMBL::Compara::GeneTreeMember";
00335     $leaf->member_id($member_name);
00336   }
00337 
00338   # Break the tree by immediate children recursively
00339   my @children;
00340   my $keep_breaking = 1;
00341   $self->param('max_subtree', $newtree);
00342   while ($keep_breaking) {
00343     @children = @{$self->param('max_subtree')->children};
00344     my $max_num_leaves = 0;
00345     foreach my $child (@children) {
00346       my $num_leaves = scalar(@{$child->get_all_leaves});
00347       if ($num_leaves > $max_num_leaves) {
00348         $max_num_leaves = $num_leaves;
00349         $self->param('max_subtree', $child);
00350       }
00351     }
00352     # Broke down to half, happy with it
00353     my $proportion = ($max_num_leaves*100/$self->param('original_leafcount') );
00354     print STDERR "QuickTreeBreak iterate -- $max_num_leaves ($proportion)\n" if ($self->debug);
00355     if ($proportion <= 50) {
00356       $keep_breaking = 0;
00357     }
00358   }
00359 
00360   my $final_original_num = scalar @{$self->param('protein_tree')->get_all_leaves};
00361   # Creating the supertree structure
00362   my $supertree = $self->param('protein_tree');
00363   $supertree->tree->tree_type('proteinsupertree');
00364   my $supertree_leaf1 = new Bio::EnsEMBL::Compara::GeneTreeNode;
00365   my $supertree_leaf2 = new Bio::EnsEMBL::Compara::GeneTreeNode;
00366   $supertree_leaf1->tree($supertree->tree);
00367   $supertree_leaf2->tree($supertree->tree);
00368 
00369   my $cluster1_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00370   my $cluster1 = new Bio::EnsEMBL::Compara::GeneTree;
00371   $cluster1->tree_type('proteintree');
00372   $cluster1->method_link_species_set_id($supertree->tree->method_link_species_set_id);
00373   $cluster1->clusterset_id($supertree->tree->clusterset_id);
00374   $cluster1->root($cluster1_root);
00375   $cluster1_root->tree($cluster1);
00376   
00377   my $cluster2_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00378   my $cluster2 = new Bio::EnsEMBL::Compara::GeneTree;
00379   $cluster2->tree_type('proteintree');
00380   $cluster2->method_link_species_set_id($supertree->tree->method_link_species_set_id);
00381   $cluster2->clusterset_id($supertree->tree->clusterset_id);
00382   $cluster2->root($cluster2_root);
00383   $cluster2_root->tree($cluster2);
00384 
00385 
00386   my $subtree_leaves;
00387   foreach my $leaf (@{$self->param('max_subtree')->get_all_leaves}) {
00388     $subtree_leaves->{$leaf->member_id} = 1;
00389   }
00390   foreach my $leaf (@{$supertree->get_all_leaves}) {
00391     if (defined $subtree_leaves->{$leaf->member_id}) {
00392       $cluster1_root->add_child($leaf);
00393       $leaf->tree($cluster1);
00394     } else {
00395       $cluster2_root->add_child($leaf);
00396       $leaf->tree($cluster2);
00397     }
00398   }
00399   $supertree->add_child($supertree_leaf1, $self->param('max_subtree')->distance_to_parent/2);
00400   $supertree->add_child($supertree_leaf2, $self->param('max_subtree')->distance_to_parent/2);
00401   $supertree->build_leftright_indexing(1);
00402   
00403   if ($self->debug) {
00404     print "SUPERTREE " ;
00405     $supertree->print_tree(20);
00406     print "CLUSTER1 ";
00407     $cluster1_root->print_tree(20);
00408     print "CLUSTER2 ";
00409     $cluster2_root->print_tree(20);
00410   }
00411   $supertree_leaf1->add_child($cluster1_root);
00412   $supertree_leaf2->add_child($cluster2_root);
00413 
00414   if ($self->debug) {
00415     print "FINAL STRUCTURE ";
00416     $supertree->print_tree(20);
00417   }
00418 
00419   $self->param('subclusters', [$cluster1, $cluster2]);
00420 
00421   # Some checks
00422   my       $final_max_num = scalar @{$cluster1->root->get_all_leaves};
00423   my $final_remaining_num = scalar @{$cluster2->root->get_all_leaves};
00424 
00425   if(($final_max_num + $final_remaining_num) != $final_original_num) {
00426     die "Incorrect sum of leaves [$final_max_num + $final_remaining_num != $final_original_num]";
00427   }
00428 
00429   $self->param('original_cluster', $supertree->tree);
00430 }
00431 
00432 1;