Archive Ensembl HomeArchive Ensembl Home
NCGenomicTree.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCGenomicTree;
00002 
00003 use strict;
00004 use warnings;
00005 use Data::Dumper;
00006 use Time::HiRes qw/time/;
00007 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00008 
00009 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00010 
00011 sub fetch_input {
00012     my ($self) = @_;
00013     my $nc_tree_id = $self->param('nc_tree_id');
00014     my $nc_tree = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id);
00015     $self->param('nc_tree', $nc_tree);
00016     my $alignment_id = $self->param('alignment_id');
00017     print STDERR "ALN INPUT ID: " . $alignment_id . "\n" if ($self->debug);
00018     my $aln_file = $self->_load_and_dump_alignment();
00019     if (! defined $aln_file) {
00020         $self->throw("I can not dump the alignment in $alignment_id");
00021     }
00022     $self->param('aln_input',$aln_file);
00023     $self->throw("need a method") unless (defined $self->param('method'));
00024     $self->throw("need an alignment output file to build the tree") unless (defined $self->param('aln_input'));
00025     $self->throw("tree with id $nc_tree_id is undefined") unless (defined $nc_tree);
00026 
00027 }
00028 
00029 sub run {
00030     my ($self) = @_;
00031     $self->run_ncgenomic_tree($self->param('method'));
00032 }
00033 
00034 sub write_output {
00035     my ($self) = @_;
00036 }
00037 
00038 sub get_species_tree_file {
00039     my $self = shift @_;
00040 
00041     unless( $self->param('species_tree_file') ) {
00042 
00043         unless( $self->param('species_tree_string') ) {
00044 
00045             my $tag_table_name = 'gene_tree_root_tag';
00046 
00047             my $sth = $self->dbc->prepare( "select value from $tag_table_name where tag='species_tree_string'" );
00048             $sth->execute;
00049             my ($species_tree_string) = $sth->fetchrow_array;
00050             $sth->finish;
00051 
00052             $self->param('species_tree_string', $species_tree_string)
00053                 or die "Could not fetch 'species_tree_string' from $tag_table_name";
00054         }
00055 
00056         my $species_tree_string = $self->param('species_tree_string');
00057         eval {
00058             my $eval_species_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($species_tree_string);
00059             my @leaves = @{$eval_species_tree->get_all_leaves};
00060         };
00061         if($@) {
00062             die "Error parsing species tree from the string '$species_tree_string'";
00063         }
00064 
00065             # store the string in a local file:
00066         my $species_tree_file = $self->worker_temp_directory . "spec_tax.nh";
00067         open SPECIESTREE, ">$species_tree_file" or die "Could not open '$species_tree_file' for writing : $!";
00068         print SPECIESTREE $species_tree_string;
00069         close SPECIESTREE;
00070         $self->param('species_tree_file', $species_tree_file);
00071     }
00072     return $self->param('species_tree_file');
00073 }
00074 
00075 sub run_ncgenomic_tree {
00076     my ($self, $method) = @_;
00077     my $cluster = $self->param('nc_tree');
00078     my $nc_tree_id = $self->param('nc_tree_id');
00079     my $input_aln = $self->param('aln_input');
00080     print STDERR "INPUT ALN: $input_aln\n";
00081     die "$input_aln doesn't exist" unless (-e $input_aln);
00082     if ($method eq "phyml" && (scalar $cluster->get_all_leaves < 4)) {
00083         $self->input_job->incomplete(0);
00084         die ("tree cluster $nc_tree_id has ".(scalar $cluster->get_all_leaves)." proteins - can not build a phyml tree\n");
00085     }
00086 
00087     my $treebest_exe = $self->param('treebest_exe')
00088           or die "'treebest_exe' is an obligatory parameter";
00089 
00090     die "Cannot execute '$treebest_exe'" unless(-x $treebest_exe);
00091 
00092     my $newick_file = $input_aln . ".treebest.$method.nh";
00093     $self->param('newick_file', $newick_file);
00094     my $treebest_err_file = $self->worker_temp_directory . "treebest.err";
00095 
00096     my $cmd = $treebest_exe;
00097     $cmd .= " $method";
00098     $cmd .= " -Snf " if ($method eq 'phyml');
00099     $cmd .= " -s " if ($method eq 'nj');
00100     $cmd .= $self->get_species_tree_file();
00101     $cmd .= " " . $input_aln;
00102     $cmd .= " 2> ". $treebest_err_file;
00103     $cmd .= " > " . $newick_file;
00104 
00105     print STDERR "$cmd\n" if ($self->debug);
00106     my $worker_temp_directory = $self->worker_temp_directory;
00107     $DB::single=1; $DB::single && 1; # To avoid warnings about $DB::single used only once
00108     $self->compara_dba->dbc->disconnect_when_inactive(0);
00109 
00110     ## FIXME! -- I am not sure that $cmd will return errors. It will never dataflows!
00111     unless (system("cd $worker_temp_directory; $cmd") == 0) {
00112         print STDERR "We have a problem running treebest -- Inspecting $treebest_err_file\n";
00113         open my $treebest_err_fh, "<", $treebest_err_file or die $!;
00114         while (<$treebest_err_fh>) {
00115             chomp;
00116             if (/low memory/) {
00117                 $self->dataflow_output_id (
00118                                            {
00119                                             'nc_tree_id' => $self->param('nc_tree_id'),
00120                                             'method' => $self->param('method'),
00121                                             'alignement_id' => $self->param('alignment_id'),
00122                                            }, -1
00123                                           );
00124                 $self->input_job->incomplete(0);
00125                 die "error running treebest $method: $!\n -- Signaling MEMLIMIT";
00126             }
00127         }
00128         print "$cmd\n";
00129         $self->throw("error running treebest $method: $!\n");
00130     }
00131     $self->compara_dba->dbc->disconnect_when_inactive(1);
00132     $self->store_newick_into_protein_tree_tag_string($method)
00133 }
00134 
00135 sub store_newick_into_protein_tree_tag_string {
00136     my ($self, $method) = @_;
00137 
00138   my $newick_file =  $self->param('newick_file');
00139   my $newick = '';
00140   print STDERR "load from file $newick_file\n" if($self->debug);
00141   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00142   while(<FH>) {
00143     chomp $_;
00144     $newick .= $_;
00145   }
00146   close(FH);
00147   $newick =~ s/(\d+\.\d{4})\d+/$1/g; # We round up to only 4 digits
00148   return if ($newick eq '_null_;');
00149   my $tag = "pg_IT_" . $method;
00150   $self->param('nc_tree')->tree->store_tag($tag, $newick);
00151 }
00152 
00153 sub _load_and_dump_alignment {
00154     my ($self) = @_;
00155 
00156     my $root_id = $self->param('nc_tree_id');
00157     my $alignment_id = $self->param('alignment_id');
00158     my $file_root = $self->worker_temp_directory. "nctree_" . $root_id;
00159     my $aln_file = $file_root . ".aln";
00160     open my $outaln, ">", "$aln_file" or $self->throw("Error opening $aln_file for writing");
00161 
00162     my $sql_load_alignment = "SELECT member_id, aligned_sequence FROM aligned_sequence WHERE alignment_id = ?";
00163     my $sth_load_alignment = $self->dbc->prepare($sql_load_alignment);
00164     print STDERR "SELECT member_id, aligned_sequence FROM aligned_sequence WHERE alignment_id = $alignment_id\n" if ($self->debug);
00165     $sth_load_alignment->execute($alignment_id);
00166     my $all_aln_seq_hashref = $sth_load_alignment->fetchall_arrayref({});
00167 
00168     for my $row_hashref (@$all_aln_seq_hashref) {
00169         my $mem_id = $row_hashref->{member_id};
00170         my $member = $self->compara_dba->get_NCTreeAdaptor->fetch_AlignedMember_by_member_id_root_id($mem_id);
00171         my $taxid = $member->taxon_id();
00172         my $aln_seq = $row_hashref->{aligned_sequence};
00173         $aln_seq =~ s/^N/A/;  # To avoid RAxML failure
00174         print $outaln ">" . $mem_id. "_" . $taxid . "\n" . $aln_seq . "\n";
00175     }
00176     close($outaln);
00177 
00178     return $aln_file;
00179 }
00180 
00181 1;