Archive Ensembl HomeArchive Ensembl Home
NCSecStructTree.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCSecStructTree
00024 
00025 =head1 SYNOPSIS
00026 
00027 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00028 my $ncsecstructtree = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCSecStructTree->new
00029   (
00030    -db         => $db,
00031    -input_id   => $input_id,
00032    -analysis   => $analysis
00033   );
00034 $ncsecstructtree->fetch_input(); #reads from DB
00035 $ncsecstructtree->run();
00036 $ncsecstructtree->output();
00037 $ncsecstructtree->write_output(); #writes to DB
00038 
00039 =head1 DESCRIPTION
00040 
00041 This RunnableDB build phylogenetic trees using RAxML. RAxML can use several secondary
00042 structure substitution models. This Runnable can run several of them in a row, but it
00043 is recommended to run them in parallel.
00044 
00045 =head1 INHERITANCE TREE
00046 
00047   Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00048   +- Bio::EnsEMBL::Hive::Process
00049 
00050 =head1 APPENDIX
00051 
00052 The rest of the documentation details each of the object methods. 
00053 Internal methods are usually preceded with a _
00054 
00055 =cut
00056 
00057 
00058 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCSecStructTree;
00059 
00060 use strict;
00061 use Time::HiRes qw(time gettimeofday tv_interval);
00062 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00063 
00064 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00065 
00066 
00067 sub param_defaults {
00068     return {
00069         'models'      => 'S16B S16A S7B S7C S6A S6B S6C S6D S6E S7A S7D S7E S7F S16',
00070     };
00071 }
00072 
00073 =head2 fetch_input
00074 
00075     Title   :   fetch_input
00076     Usage   :   $self->fetch_input
00077     Function:   Fetches input data from the database
00078     Returns :   none
00079     Args    :   none
00080 
00081 =cut
00082 
00083 
00084 sub fetch_input {
00085     my $self = shift @_;
00086 
00087     $self->input_job->transient_error(0);
00088     my $nc_tree_id = $self->param('nc_tree_id') || die "'nc_tree_id' is an obligatory numeric parameter\n";
00089     $self->input_job->transient_error(1);
00090 
00091     my $nc_tree    = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or die "Could not fetch nc_tree with id=$nc_tree_id\n";
00092     $self->param('nc_tree', $nc_tree);
00093 
00094     if(my $input_aln = $self->dumpMultipleAlignmentStructToWorkdir($nc_tree) ) {
00095         $self->param('input_aln', $input_aln);
00096     } else {
00097         return;
00098     }
00099 }
00100 
00101 
00102 =head2 run
00103 
00104     Title   :   run
00105     Usage   :   $self->run
00106     Function:   runs something
00107     Returns :   none
00108     Args    :   none
00109 
00110 =cut
00111 
00112 sub run {
00113     my $self = shift @_;
00114 
00115     # Run RAxML without ay structure info first
00116     $self->run_bootstrap_raxml;
00117     # Run RAxML with all selected secondary structure substitution models
00118     $self->run_ncsecstructtree;
00119 }
00120 
00121 
00122 =head2 write_output
00123 
00124     Title   :   write_output
00125     Usage   :   $self->write_output
00126     Function:   stores something
00127     Returns :   none
00128     Args    :   none
00129 
00130 =cut
00131 
00132 
00133 sub write_output {
00134     my $self = shift @_;
00135 
00136 }
00137 
00138 
00139 ##########################################
00140 #
00141 # internal methods
00142 #
00143 ##########################################
00144 
00145 
00146 sub run_bootstrap_raxml {
00147   my $self = shift;
00148 
00149   my $aln_file = $self->param('input_aln');
00150   return unless (defined($aln_file));
00151 
00152   my $raxml_tag = $self->param('nc_tree')->node_id . "." . $self->worker->process_id . ".raxml";
00153 
00154   my $raxml_exe = $self->param('raxml_exe')
00155     or die "'raxml_exe' is an obligatory parameter";
00156 
00157   die "Cannot execute '$raxml_exe'" unless(-x $raxml_exe);
00158 
00159   my $bootstrap_num = 10;
00160   my $tag = 'ml_IT_' . $bootstrap_num;
00161 
00162   # Checks if the bootstrap tree is already in the DB (is this a rerun?)
00163   if ($self->param('nc_tree')->tree->has_tag($tag)) {
00164     my $eval_tree;
00165     # Checks the tree string can be parsed succsesfully
00166     eval {
00167       $eval_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->param('nc_tree')->tree->get_value_for_tag($tag));
00168     };
00169     if (defined($eval_tree) and !$@ and !$self->debug) {
00170       # The bootstrap RAxML tree has been obtained already and the tree can be parsed successfully.
00171       return;
00172     }
00173   }
00174 
00175   # /software/ensembl/compara/raxml/RAxML-7.2.2/raxmlHPC-PTHREADS-SSE3
00176   # -m GTRGAMMA -s nctree_20327.aln -N 10 -n nctree_20327.raxml.10
00177   my $cmd = $raxml_exe;
00178   $cmd .= " -T 2"; # ATTN, you need the PTHREADS version of raxml for this
00179   $cmd .= " -m GTRGAMMA";
00180   $cmd .= " -s $aln_file";
00181   $cmd .= " -N $bootstrap_num";
00182   $cmd .= " -n $raxml_tag.$bootstrap_num";
00183 
00184   my $worker_temp_directory = $self->worker_temp_directory;
00185   $self->compara_dba->dbc->disconnect_when_inactive(1);
00186   print("$cmd\n") if($self->debug);
00187   my $bootstrap_starttime = time()*1000;
00188 #    $DB::single=1;1;
00189   unless(system("cd $worker_temp_directory; $cmd") == 0) {
00190     $self->throw("error running raxml\ncd $worker_temp_directory; $cmd\n $!\n");
00191   }
00192   $self->compara_dba->dbc->disconnect_when_inactive(0);
00193   my $bootstrap_msec = int(time()*1000-$bootstrap_starttime);
00194 
00195   my $ideal_msec = 30000; # 5 minutes
00196   my $time_per_sample = $bootstrap_msec / $bootstrap_num;
00197   my $ideal_bootstrap_num = $ideal_msec / $time_per_sample;
00198   if ($ideal_bootstrap_num < 10) {
00199     if   ($ideal_bootstrap_num < 5) { $self->param('bootstrap_num',  1); }
00200     else                            { $self->param('bootstrap_num', 10); }
00201   } elsif ($ideal_bootstrap_num > 100) {
00202     $self->param('bootstrap_num', 100);
00203   } else {
00204     $self->param('bootstrap_num', int($ideal_bootstrap_num) );
00205   }
00206 
00207   my $raxml_output = $self->worker_temp_directory . "RAxML_bestTree." . "$raxml_tag.$bootstrap_num";
00208 
00209   $self->store_newick_into_protein_tree_tag_string($tag,$raxml_output);
00210 
00211   # Unlink run files
00212   my $temp_dir = $self->worker_temp_directory;
00213   my $temp_regexp = $temp_dir."*$raxml_tag.$bootstrap_num.RUN.*";
00214   system("rm -f $temp_regexp");
00215   return 1;
00216 }
00217 
00218 sub run_ncsecstructtree {
00219   my $self = shift;
00220 
00221   my $aln_file    = $self->param('input_aln');
00222   return unless (defined($aln_file));
00223   my $struct_file = $self->param('struct_aln');
00224 
00225   my $raxml_tag = $self->param('nc_tree')->node_id . "." . $self->worker->process_id . ".raxml";
00226 
00227   my $raxml_exe = $self->param('raxml_exe')
00228     or die "'raxml_exe' is an obligatory parameter";
00229 
00230   die "Cannot execute '$raxml_exe'" unless(-x $raxml_exe);
00231 
00232   my $tree = $self->param('nc_tree')->tree;
00233   my $models = $self->param('models');
00234   $models = [split(/\W+/, $models)];
00235   foreach my $model (@$models) {
00236     my $tag = 'ss_IT_' . $model;
00237     if ($tree->has_tag($tag)) {
00238       my $eval_tree;
00239       # Checks the tree string can be parsed succsesfully
00240       eval {
00241         $eval_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($tree->get_value_for_tag($tag));
00242       };
00243       if (defined($eval_tree) and !$@ and !$self->debug) {
00244         # The secondary structure RAxML tree for this model has been obtained already and the tree can be parsed successfully.
00245         next; # Go to next model
00246       }
00247     }
00248 
00249     # /software/ensembl/compara/raxml/RAxML-7.2.2/raxmlHPC-SSE3
00250     # -m GTRGAMMA -s nctree_20327.aln -S nctree_20327.struct -A S7D -n nctree_20327.raxml
00251     my $cmd = $raxml_exe;
00252     $cmd .= " -T 2"; # ATTN, you need the PTHREADS version of raxml for this
00253     $cmd .= " -m GTRGAMMA";
00254     $cmd .= " -s $aln_file";
00255     $cmd .= " -S $struct_file" if (defined($struct_file));
00256     $cmd .= " -A $model";
00257     $cmd .= " -n $raxml_tag.$model";
00258     $cmd .= " -N " . $self->param('bootstrap_num') if (defined($self->param('bootstrap_num')));
00259 
00260     my $worker_temp_directory = $self->worker_temp_directory;
00261     $self->compara_dba->dbc->disconnect_when_inactive(1);
00262     print("$cmd\n") if($self->debug);
00263 
00264     my $error_file = $worker_temp_directory."/RAxML_bestTree..$raxml_tag.$model.err";
00265     $cmd .= ">& $error_file";
00266 
00267     my $starttime = time()*1000;
00268     unless(system("cd $worker_temp_directory; $cmd") == 0) {
00269       # Try to catch some known errors
00270       if (-e $error_file and qx"grep 'freqa > 0.0 && freqc > 0.0 && freqg > 0.0 && freqt > 0.0' $error_file") {
00271         # This can happen when there is not one of the nucleotides in one of the DNA data partition (RAxML-7.2.2)
00272         # RAxML will refuse to run this, we can safely skip all other models as well.
00273         last;
00274       } elsif (-e $error_file and qx"grep 'Empirical base frequency for state number [0-9] is equal to zero in DNA data partition' $error_file") {
00275         # Same as before, but for RAxML-7.2.8
00276         last;
00277       }
00278       $self->throw("error running raxml\ncd $worker_temp_directory; $cmd\n $!\n");
00279     }
00280     $self->compara_dba->dbc->disconnect_when_inactive(0);
00281     my $runtime_msec = int(time()*1000-$starttime);
00282 
00283     my $raxml_output = $self->worker_temp_directory . "RAxML_bestTree." . "$raxml_tag.$model";
00284     $self->param('model', $model);
00285 
00286     $self->store_newick_into_protein_tree_tag_string($tag,$raxml_output);
00287     my $model_runtime = $self->param('model') . "_runtime_msec";
00288     $self->param('nc_tree')->tree->store_tag($model_runtime, $runtime_msec);
00289 
00290     # Unlink run files
00291     my $temp_dir = $self->worker_temp_directory;
00292     my $temp_regexp = $temp_dir."*$raxml_tag.$model.RUN.*";
00293     $DB::single=1;1;#??
00294     system("rm -f $temp_regexp");
00295   }
00296 
00297   return 1;
00298 }
00299 
00300 sub dumpMultipleAlignmentStructToWorkdir {
00301   my $self = shift;
00302   my $tree = shift;
00303 
00304   my $leafcount = scalar(@{$tree->get_all_leaves});
00305   if($leafcount<4) {
00306     printf(STDERR "tree cluster %d has <4 proteins - can not build a raxml tree\n", 
00307            $tree->node_id);
00308     return undef;
00309   }
00310 
00311   my $file_root = $self->worker_temp_directory. "nctree_". $tree->node_id;
00312   $file_root    =~ s/\/\//\//g;  # converts any // in path to /
00313 
00314   my $aln_file = $file_root . ".aln";
00315   if($self->debug) {
00316     printf("dumpMultipleAlignmentStructToWorkdir : %d members\n", $leafcount);
00317     print("aln_file = '$aln_file'\n");
00318   }
00319 
00320   open(OUTSEQ, ">$aln_file")
00321     or $self->throw("Error opening $aln_file for write");
00322 
00323   # Using append_taxon_id will give nice seqnames_taxonids needed for
00324   # njtree species_tree matching
00325   my %sa_params = ($self->param('use_genomedb_id')) ?   ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00326 
00327   my $sa = $tree->get_SimpleAlign
00328     (
00329      -id_type => 'MEMBER',
00330      %sa_params,
00331     );
00332   $sa->set_displayname_flat(1);
00333 
00334   # Phylip header
00335   print OUTSEQ $sa->no_sequences, " ", $sa->length, "\n";
00336   # Phylip body
00337   my $count = 0;
00338   foreach my $aln_seq ($sa->each_seq) {
00339     print OUTSEQ $aln_seq->display_id, "\n";
00340     my $seq = $aln_seq->seq;
00341 
00342     # Here we do a trick for all Ns sequences by changing the first
00343     # nucleotide to an A so that raxml can at least do the tree for
00344     # the rest of the sequences, instead of giving an error
00345     if ($seq =~ /N+/) { $seq =~ s/^N/A/; }
00346 
00347     print OUTSEQ "$seq\n";
00348     $count++;
00349     print STDERR "sequences $count\n" if ($count % 50 == 0);
00350   }
00351   close OUTSEQ;
00352 
00353   my $struct_string = $self->param('nc_tree')->tree->get_tagvalue('ss_cons');
00354   # Allowed Characters are "( ) < > [ ] { } " and "."
00355   $struct_string =~ s/[^\(^\)^\<^\>^\[^\]^\{^\}^\.]/\./g;
00356   my $struct_file = $file_root . ".struct";
00357   if ($struct_string =~ /^\.+$/) {
00358     $struct_file = undef;
00359     # No struct file
00360   } else {
00361     open(STRUCT, ">$struct_file")
00362       or $self->throw("Error opening $struct_file for write");
00363     print STRUCT "$struct_string\n";
00364     close STRUCT;
00365   }
00366   $self->param('input_aln', $aln_file);
00367   $self->param('struct_aln', $struct_file);
00368   return $aln_file;
00369 }
00370 
00371 
00372 sub store_newick_into_protein_tree_tag_string {
00373   my $self = shift;
00374   my $tag = shift;
00375   my $newick_file = shift;
00376 
00377   my $newick = '';
00378   print("load from file $newick_file\n") if($self->debug);
00379   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00380   while(<FH>) {
00381     chomp $_;
00382     $newick .= $_;
00383   }
00384   close(FH);
00385   $newick =~ s/(\d+\.\d{4})\d+/$1/g; # We round up to only 4 digits
00386 
00387   $self->param('nc_tree')->tree->store_tag($tag, $newick);
00388   if (defined($self->param('model'))) {
00389     my $bootstrap_tag = $self->param('model') . "_bootstrap_num";
00390     $self->param('nc_tree')->tree->store_tag($bootstrap_tag, $self->param('bootstrap_num'));
00391   }
00392 }
00393 
00394 
00395 1;