Archive Ensembl HomeArchive Ensembl Home
Ktreedist.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Ktreedist
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $ktreedist = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Ktreedist->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $ktreedist->fetch_input(); #reads from DB
00024 $ktreedist->run();
00025 $ktreedist->output();
00026 $ktreedist->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take the sequences from a cluster, the cm from
00034 nc_profile and run a profiled alignment, storing the results as
00035 cigar_lines for each sequence.
00036 
00037 =cut
00038 
00039 
00040 =head1 CONTACT
00041 
00042   Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
00043   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00044 
00045 =cut
00046 
00047 
00048 =head1 APPENDIX
00049 
00050 The rest of the documentation details each of the object methods. 
00051 Internal methods are usually preceded with a _
00052 
00053 =cut
00054 
00055 
00056 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Ktreedist;
00057 
00058 use strict;
00059 use Data::Dumper;
00060 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00061 
00062 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00063 
00064 
00065 =head2 fetch_input
00066 
00067     Title   :   fetch_input
00068     Usage   :   $self->fetch_input
00069     Function:   Fetches input data from the database
00070     Returns :   none
00071     Args    :   none
00072 
00073 =cut
00074 
00075 
00076 sub fetch_input {
00077   my( $self) = @_;
00078 
00079     # Fetch sequences:
00080   $self->param('nc_tree', $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($self->param('nc_tree_id')) );
00081 
00082   $self->load_input_trees;
00083 
00084   my $ktreedist_exe = $self->param('ktreedist_exe')
00085       or die "'ktreedist_exe' is an obligatory parameter";
00086 
00087   die "Cannot execute '$ktreedist_exe'" unless(-x $ktreedist_exe);
00088 }
00089 
00090 
00091 =head2 run
00092 
00093     Title   :   run
00094     Usage   :   $self->run
00095     Function:   runs something
00096     Returns :   none
00097     Args    :   none
00098 
00099 =cut
00100 
00101 sub run {
00102   my $self = shift;
00103 
00104   $self->reroot_inputtrees;
00105   $self->run_ktreedist;
00106 }
00107 
00108 
00109 =head2 write_output
00110 
00111     Title   :   write_output
00112     Usage   :   $self->write_output
00113     Function:   stores something
00114     Returns :   none
00115     Args    :   none
00116 
00117 =cut
00118 
00119 
00120 sub write_output {
00121   my $self = shift;
00122   $self->store_ktreedist_score;
00123 }
00124 
00125 
00126 ##########################################
00127 #
00128 # internal methods
00129 #
00130 ##########################################
00131 
00132 sub get_species_tree_file {
00133     my $self = shift @_;
00134 
00135     unless( $self->param('species_tree_file') ) {
00136 
00137         unless( $self->param('species_tree_string') ) {
00138 
00139             my $tag_table_name = 'gene_tree_root_tag';
00140 
00141             my $sth = $self->dbc->prepare( "select value from $tag_table_name where tag='species_tree_string'" );
00142             $sth->execute;
00143             my ($species_tree_string) = $sth->fetchrow_array;
00144             $sth->finish;
00145 
00146             $self->param('species_tree_string', $species_tree_string)
00147                 or die "Could not fetch 'species_tree_string' from $tag_table_name";
00148         }
00149 
00150         my $species_tree_string = $self->param('species_tree_string');
00151         eval {
00152             my $eval_species_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00153             my @leaves = @{$eval_species_tree->get_all_leaves};
00154         };
00155         if($@) {
00156             die "Error parsing species tree from the string '$species_tree_string'";
00157         }
00158 
00159             # store the string in a local file:
00160         my $species_tree_file = $self->worker_temp_directory . "spec_tax.nh";
00161         open SPECIESTREE, ">$species_tree_file" or die "Could not open '$species_tree_file' for writing : $!";
00162         print SPECIESTREE $species_tree_string;
00163         close SPECIESTREE;
00164         $self->param('species_tree_file', $species_tree_file);
00165     }
00166     return $self->param('species_tree_file');
00167 }
00168 
00169 sub run_ktreedist {
00170   my $self = shift;
00171 
00172   my $root_id = $self->param('nc_tree')->node_id;
00173 #  my $species_tree_file = $self->param('species_tree_file');
00174   my $ktreedist_exe = $self->param('ktreedist_exe');
00175   my $temp_directory = $self->worker_temp_directory;
00176 
00177   my $comparisonfilename = $temp_directory . $root_id . ".ct";
00178   my $referencefilename = $temp_directory . $root_id . ".rt";
00179   open CTFILE,">$comparisonfilename" or die $!;
00180   print CTFILE "#NEXUS\n\n";
00181   print CTFILE "Begin TREES;\n\n";
00182   foreach my $method (keys %{$self->param('inputtrees_rooted')}) {
00183     my $inputtree = $self->param('inputtrees_rooted')->{$method};
00184     die ($method." is not defined in inputtrees_rooted")  unless (defined $inputtree);
00185     my $comparison_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($inputtree);
00186     my $newick_string = $comparison_tree->newick_simple_format;
00187     $self->throw("error with newick tree") unless (defined($newick_string));
00188     print CTFILE "TREE    $method = $newick_string\n";
00189   }
00190   print CTFILE "End;\n\n";
00191   close CTFILE;
00192 
00193   open RTFILE,">$referencefilename" or die $!;
00194   print RTFILE "#NEXUS\n\n";
00195   print RTFILE "Begin TREES;\n\n";
00196   my $reference_string = $self->param('nc_tree')->newick_format('member_id_taxon_id');
00197   $self->throw("error with newick tree") unless (defined($reference_string));
00198   print RTFILE "TREE    treebest = $reference_string\n";
00199   print CTFILE "End;\n\n";
00200   close RTFILE;
00201 
00202   my $cmd = "$ktreedist_exe -a -rt $referencefilename -ct $comparisonfilename";
00203   print("$cmd\n") if($self->debug);
00204   my $run; my $exit_status;
00205   open($run, "$cmd |") or $self->throw("Cannot run ktreedist with: $cmd");
00206   my @output = <$run>;
00207   $exit_status = close($run);
00208   $self->throw("Error exit status running Ktreedist") if (!$exit_status);
00209   my $ktreedist_score = $self->param('ktreedist_score', {});
00210   foreach my $line (@output) {
00211     if ($line =~ /\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)/) {
00212       my ($tag,$k_score,$scale_factor,$symm_difference,$n_partitions) = ($1,$2,$3,$4,$5);
00213       print "Parsing: $root_id,$tag,$k_score,$scale_factor,$symm_difference,$n_partitions\n" if ($self->debug);
00214       $ktreedist_score->{$root_id}{$k_score}{_tag}{$tag}{k_score} = $k_score;
00215       $ktreedist_score->{$root_id}{$k_score}{_tag}{$tag}{scale_factor} = $scale_factor;
00216       $ktreedist_score->{$root_id}{$k_score}{_tag}{$tag}{symm_difference} = $symm_difference;
00217       $ktreedist_score->{$root_id}{$k_score}{_tag}{$tag}{n_partitions} = $n_partitions;
00218     }
00219   }
00220 
00221   return 1;
00222 }
00223 
00224 sub load_input_trees {
00225 
00226   my $self = shift;
00227   my $tree = $self->param('nc_tree')->tree;
00228 
00229   foreach my $tag ($tree->get_all_tags) {
00230     next unless $tag =~ m/_it_/;
00231     my $inputtree_string = $tree->get_value_for_tag($tag);
00232 
00233     my $eval_inputtree;
00234     eval {
00235       $eval_inputtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($inputtree_string);
00236       my @leaves = @{$eval_inputtree->get_all_leaves};
00237     };
00238     unless ($@) {
00239         # manual vivification needed:
00240       unless($self->param('inputtrees_unrooted')) {
00241           $self->param('inputtrees_unrooted', {});
00242       }
00243 
00244       $self->param('inputtrees_unrooted')->{$tag} = $inputtree_string;
00245     }
00246   }
00247 
00248   return 1;
00249 }
00250 
00251 sub reroot_inputtrees {
00252   my $self = shift;
00253 
00254   my $root_id = $self->param('nc_tree')->node_id;
00255   my $species_tree_file = $self->get_species_tree_file();
00256 
00257   my $treebest_exe = $self->param('treebest_exe')
00258     or die "'treebest_exe' is an obligatory parameter";
00259 
00260   die "Cannot execute '$treebest_exe'" unless(-x $treebest_exe);
00261 
00262   my $temp_directory = $self->worker_temp_directory;
00263   my $template_cmd = "$treebest_exe sdi -rs $species_tree_file";
00264 
00265   foreach my $method (keys %{$self->param('inputtrees_unrooted')}) {
00266     my $cmd = $template_cmd;
00267     my $unrootedfilename = $temp_directory . $root_id . "." . $method . ".unrooted";
00268     my $rootedfilename = $temp_directory . $root_id . "." . $method . ".rooted";
00269     my $inputtree = $self->param('inputtrees_unrooted')->{$method};
00270     open FILE,">$unrootedfilename" or die $!;
00271     print FILE $inputtree;
00272     close FILE;
00273 
00274     $cmd .= " $unrootedfilename";
00275     $cmd .= " > $rootedfilename";
00276 
00277     print("$cmd\n") if($self->debug);
00278 
00279     unless(system("$cmd") == 0) {
00280       print("$cmd\n");
00281       $self->throw("error running treebest sdi, $!\n");
00282     }
00283 
00284     # Parse the rooted tree string
00285     my $rootedstring;
00286     open (FH, $rootedfilename) or $self->throw("Couldnt open rooted file [$rootedfilename]");
00287     while(<FH>) {
00288       chomp $_;
00289       $rootedstring .= $_;
00290     }
00291     close(FH);
00292 
00293       # manual vivification needed:
00294     unless($self->param('inputtrees_rooted')) {
00295         $self->param('inputtrees_rooted', {});
00296     }
00297     $self->param('inputtrees_rooted')->{$method} = $rootedstring;
00298   }
00299 
00300   return 1;
00301 }
00302 
00303 sub parse_newick_into_nctree
00304 {
00305   my $self = shift;
00306   my $newick_file =  $self->param('mmerge_blengths_output');
00307   my $nc_tree = $self->param('nc_tree');
00308   
00309   #cleanup old tree structure- 
00310   #  flatten and reduce to only GeneTreeMember leaves
00311   $nc_tree->flatten_tree;
00312   $nc_tree->print_tree(20) if($self->debug);
00313   foreach my $node (@{$nc_tree->get_all_leaves}) {
00314     next if($node->isa('Bio::EnsEMBL::Compara::GeneTreeMember'));
00315     $node->disavow_parent;
00316   }
00317 
00318   #parse newick into a new tree object structure
00319   my $newick = '';
00320   print("load from file $newick_file\n") if($self->debug);
00321   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00322   while(<FH>) { $newick .= $_;  }
00323   close(FH);
00324 
00325   my $newtree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($newick);
00326   $newtree->print_tree(20) if($self->debug > 1);
00327   # get rid of the taxon_id needed by njtree -- name tag
00328   foreach my $leaf (@{$newtree->get_all_leaves}) {
00329     my $njtree_phyml_name = $leaf->get_tagvalue('name');
00330     $njtree_phyml_name =~ /(\d+)\_\d+/;
00331     my $member_name = $1;
00332     $leaf->add_tag('name', $member_name);
00333   }
00334 
00335   # Leaves of newick tree are named with member_id of members from
00336   # input tree move members (leaves) of input tree into newick tree to
00337   # mirror the 'member_id' nodes
00338   foreach my $member (@{$nc_tree->get_all_leaves}) {
00339     my $tmpnode = $newtree->find_node_by_name($member->member_id);
00340     if($tmpnode) {
00341       $tmpnode->add_child($member, 0.0);
00342       $tmpnode->minimize_node; #tmpnode is now redundant so it is removed
00343     } else {
00344       print("unable to find node in newick for member"); 
00345       $member->print_member;
00346     }
00347   }
00348 
00349   # Merge the trees so that the children of the newick tree are now
00350   # attached to the input tree's root node
00351   $nc_tree->merge_children($newtree);
00352 
00353   # Newick tree is now empty so release it
00354   $newtree->release_tree;
00355 
00356   $nc_tree->print_tree if($self->debug);
00357   # check here on the leaf to test if they all are GeneTreeMember as
00358   # minimize_tree/minimize_node might not work properly
00359   foreach my $leaf (@{$self->param('nc_tree')->get_all_leaves}) {
00360     unless($leaf->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) {
00361       $self->throw("TreeBestMMerge tree does not have all leaves as GeneTreeMembers\n");
00362     }
00363   }
00364 
00365   return undef;
00366 }
00367 
00368 sub store_ktreedist_score {
00369   my $self = shift;
00370   my $root_id = $self->param('nc_tree')->node_id;
00371 
00372   my $sth = $self->compara_dba->dbc->prepare
00373     ("INSERT IGNORE INTO ktreedist_score 
00374                            (node_id,
00375                             tag,
00376                             k_score,
00377                             scale_factor,
00378                             symm_difference,
00379                             n_partitions,
00380                             k_score_rank) VALUES (?,?,?,?,?,?,?)");
00381   my $count = 1;
00382   my $ktreedist_score_root_id = $self->param('ktreedist_score')->{$root_id};
00383   foreach my $k_score_as_rank (sort {$a <=> $b} keys %$ktreedist_score_root_id) {
00384     foreach my $tag (keys %{$ktreedist_score_root_id->{$k_score_as_rank}{_tag}}) {
00385       my $k_score         = $ktreedist_score_root_id->{$k_score_as_rank}{_tag}{$tag}{k_score};
00386       my $scale_factor    = $ktreedist_score_root_id->{$k_score_as_rank}{_tag}{$tag}{scale_factor};
00387       my $symm_difference = $ktreedist_score_root_id->{$k_score_as_rank}{_tag}{$tag}{symm_difference};
00388       my $n_partitions    = $ktreedist_score_root_id->{$k_score_as_rank}{_tag}{$tag}{n_partitions};
00389       my $k_score_rank = $count;
00390       $DB::single=1;1;
00391       $sth->execute($root_id,
00392                     $tag,
00393                     $k_score,
00394                     $scale_factor,
00395                     $symm_difference,
00396                     $n_partitions,
00397                     $k_score_rank);
00398       $count++;
00399     }
00400   }
00401   $sth->finish;
00402 
00403 
00404 }
00405 
00406 1;