Archive Ensembl HomeArchive Ensembl Home
PrepareSecStructModels.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::PrepareSecStructModels
00024 
00025 =head1 SYNOPSIS
00026 
00027 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00028 my $ncsecstructtree = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::PrepareSecModels->new
00029   (
00030    -db         => $db,
00031    -input_id   => $input_id,
00032    -analysis   => $analysis
00033   );
00034 $ncsecstructtree->fetch_input(); #reads from DB
00035 $ncsecstructtree->run();
00036 $ncsecstructtree->output();
00037 $ncsecstructtree->write_output(); #writes to DB
00038 
00039 =head1 DESCRIPTION
00040 
00041 This RunnableDB builds phylogenetic trees using RAxML. RAxML can use several secondary
00042 structure substitution models. This Runnable can run several of them in a row, but it
00043 is recommended to run them in parallel.
00044 
00045 =head1 INHERITANCE TREE
00046 
00047   Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00048   +- Bio::EnsEMBL::Hive::Process
00049 
00050 =head1 APPENDIX
00051 
00052 The rest of the documentation details each of the object methods. 
00053 Internal methods are usually preceded with a _
00054 
00055 =cut
00056 
00057 
00058 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::PrepareSecStructModels;
00059 
00060 use strict;
00061 use Time::HiRes qw(time);
00062 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00063 
00064 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00065 
00066 sub param_defaults {
00067     return {
00068         'models'      => [qw/S16B S16A S7B S7C S6A S6B S6C S6D S6E S7A S7D S7E S7F S16/],
00069     };
00070 }
00071 
00072 =head2 fetch_input
00073 
00074     Title   :   fetch_input
00075     Usage   :   $self->fetch_input
00076     Function:   Fetches input data from the database
00077     Returns :   none
00078     Args    :   none
00079 
00080 =cut
00081 
00082 sub fetch_input {
00083     my $self = shift @_;
00084 
00085     $self->input_job->transient_error(0);
00086     my $nc_tree_id = $self->param('nc_tree_id') || die "'nc_tree_id' is an obligatory numeric parameter\n";
00087     $self->input_job->transient_error(1);
00088 
00089     my $nc_tree    = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or $self->throw("Could not fetch nc_tree with id=$nc_tree_id");
00090     $self->param('nc_tree', $nc_tree);
00091 
00092 ### !! Struct files are not used in this first tree!!
00093     if(my $input_aln = $self->_dumpMultipleAlignmentStructToWorkdir($nc_tree) ) {
00094         $self->param('input_aln', $input_aln);
00095     } else {
00096         die "I can't write input alignment";
00097     }
00098 }
00099 
00100 =head2 run
00101 
00102     Title   :   run
00103     Usage   :   $self->run
00104     Function:   runs something
00105     Returns :   none
00106     Args    :   none
00107 
00108 =cut
00109 
00110 sub run {
00111     my $self = shift @_;
00112     my $nc_tree_id = $self->param('nc_tree_id');
00113     # First check the size of the alignents to compute:
00114     if ($self->param('tag_residue_count') > 150000) {
00115         $self->dataflow_output_id (
00116                                    {
00117                                     'nc_tree_id' => $nc_tree_id,
00118                                    }, -1
00119                                   );
00120         $self->dataflow_output_id (
00121                                    {
00122                                     'clusterset_id' => 1,
00123                                     'nc_tree_id' => $nc_tree_id,
00124                                    }, 1
00125                                   );
00126         # Should we die here? Nothing more to do in the Runnable
00127         $self->input_job->incomplete(0);
00128         die "$nc_tree_id family is too big. Only fast trees will be computed\n";
00129     } else {
00130     # Run RAxML without any structure info first
00131         $self->_run_bootstrap_raxml;
00132     }
00133 }
00134 
00135 =head2 write_output
00136 
00137     Title   :   write_output
00138     Usage   :   $self->write_output
00139     Function:   stores something
00140     Returns :   none
00141     Args    :   none
00142 
00143 =cut
00144 
00145 
00146 sub write_output {
00147     my $self = shift @_;
00148 
00149     # Run RAxML with all selected secondary structure substitution models
00150     # $self->_run_ncsecstructtree;
00151 
00152     my $nc_tree_id = $self->param('nc_tree_id');
00153     my $models = $self->param('models');
00154     my $bootstrap_num = $self->param('bootstrap_num');
00155     print STDERR "Bootstrap_num: $bootstrap_num\n" if ($self->debug());
00156 
00157     for my $model (@$models) {
00158         $self->dataflow_output_id ( {
00159                                      'model' => $model,
00160                                      'nc_tree_id' => $nc_tree_id,
00161                                      'bootstrap_num' => $bootstrap_num
00162                                     }, 2); # fan
00163     }
00164 
00165 }
00166 
00167 sub _run_bootstrap_raxml {
00168     my $self = shift;
00169 
00170 
00171     ## Regarding RAxML 7.2.8 (http://www.phylo.org/tools/raxmlhpc2.html)
00172 #In RAxML 7.0.4, a run specified with the model GTRGAMMA (command line = -m GTRGAMMA -x -f a) performed rapid bootstrapping using the GTRCAT model, followed by an ML search using the GTRGAMMA model. That is, GTRGAMMA was used only for the ML search, while GTRCAT was used during the bootstrapping for improved efficiency. Similarly, RAxML 7.0.4 offered the option GTRMIX conducted inference under GRTCAT and calculated best tree under GTRGAMMA. The GTRMIX option (which conducted inference under GRTCAT and calculated best tree under GTRGAMMA) is no longer offered for RAxML 7.1.0 and above.
00173 
00174 #For RAxML 7.2.8, selecting the GTRGAMMA model has a very different effect (command line = -m GTRGAMMA -x -f a). This option causes GTRGAMMA to be used both during the rapid bootstrapping AND inference of the best tree. The result is that it takes much longer to produce results using GTRGAMMA in RAxML 7.0.4, and the analysis is different from the one run using RAxML 7.0.4, where GTRCAT was used to conduct the bootstrapping phase. If you wish to run the same analysis you ran using RAxML 7.0.4, you must instead choose the model GTRCAT (-m GTRCAT -x -f a)
00175 
00176   my $aln_file = $self->param('input_aln');
00177   return unless (defined($aln_file));
00178 
00179   my $raxml_tag = $self->param('nc_tree')->node_id . "." . $self->worker->process_id . ".raxml";
00180 
00181   my $raxml_exe = $self->param('raxml_exe')
00182     or die "'raxml_exe' is an obligatory parameter";
00183 
00184   die "Cannot execute '$raxml_exe'" unless(-x $raxml_exe);
00185 
00186   my $bootstrap_num = 10;
00187   my $tag = 'ml_IT_' . $bootstrap_num;
00188 
00189   # Checks if the bootstrap tree is already in the DB (is this a rerun?)
00190   if ($self->param('nc_tree')->tree->has_tag($tag)) {
00191     my $eval_tree;
00192     # Checks the tree string can be parsed succsesfully
00193     eval {
00194       $eval_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->param('nc_tree')->tree->get_value_for_tag($tag));
00195     };
00196     if (defined($eval_tree) and !$@ and !$self->debug) {
00197       # The bootstrap RAxML tree has been obtained already and the tree can be parsed successfully.
00198       return;
00199     }
00200   }
00201 
00202   # /software/ensembl/compara/raxml/RAxML-7.2.8-ALPHA/raxmlHPC-SSE3
00203   # -m GTRGAMMA -s nctree_20327.aln -N 10 -n nctree_20327.raxml.10
00204   my $cmd = $raxml_exe;
00205   $cmd .= " -T 2"; # ATTN, you need the PTHREADS version of raxml for this
00206   $cmd .= " -m GTRGAMMA";
00207   $cmd .= " -s $aln_file";
00208   $cmd .= " -N $bootstrap_num";
00209   $cmd .= " -n $raxml_tag.$bootstrap_num";
00210 
00211   my $worker_temp_directory = $self->worker_temp_directory;
00212   $self->compara_dba->dbc->disconnect_when_inactive(1);
00213   print "$cmd\n" if($self->debug);
00214   my $bootstrap_starttime = time()*1000;
00215 
00216   unless(system("cd $worker_temp_directory; $cmd") == 0) {
00217     $self->throw("error running raxml\ncd $worker_temp_directory; $cmd\n $!\n");
00218   }
00219   $self->compara_dba->dbc->disconnect_when_inactive(0);
00220   my $bootstrap_msec = int(time()*1000-$bootstrap_starttime);
00221 
00222   my $ideal_msec = 30000; # 5 minutes
00223   my $time_per_sample = $bootstrap_msec / $bootstrap_num;
00224   my $ideal_bootstrap_num = $ideal_msec / $time_per_sample;
00225   if ($ideal_bootstrap_num < 10) {
00226     if   ($ideal_bootstrap_num < 5) { $self->param('bootstrap_num',  1); }
00227     else                            { $self->param('bootstrap_num', 10); }
00228   } elsif ($ideal_bootstrap_num > 100) {
00229     $self->param('bootstrap_num', 100);
00230   } else {
00231     $self->param('bootstrap_num', int($ideal_bootstrap_num) );
00232   }
00233 
00234   my $raxml_output = $self->worker_temp_directory . "RAxML_bestTree." . "$raxml_tag.$bootstrap_num";
00235 
00236   $self->_store_newick_into_nc_tree_tag_string($tag,$raxml_output);
00237 
00238   # Unlink run files
00239   my $temp_dir = $self->worker_temp_directory;
00240   my $temp_regexp = $temp_dir."*$raxml_tag.$bootstrap_num.RUN.*";
00241   system("rm -f $temp_regexp");
00242   return 1;
00243 }
00244 
00245 sub _dumpMultipleAlignmentStructToWorkdir {
00246   my $self = shift;
00247   my $tree = shift;
00248 
00249   my $leafcount = scalar(@{$tree->get_all_leaves});
00250   if($leafcount<4) {
00251       $self->input_job->incomplete(0);
00252       #printf(STDERR "tree cluster %d has <4 proteins - can not build a raxml tree\n", $tree->node_id);
00253       my $tree_id = $tree->node_id;
00254       die "tree cluster $tree_id has <4 proteins -- can not build a raxml tree\n";
00255   }
00256 
00257   my $file_root = $self->worker_temp_directory. "nctree_". $tree->node_id;
00258   $file_root    =~ s/\/\//\//g;  # converts any // in path to /
00259 
00260   my $aln_file = $file_root . ".aln";
00261 #   if($self->debug) {
00262 #     printf("dumpMultipleAlignmentStructToWorkdir : %d members\n", $leafcount);
00263 #     print("aln_file = '$aln_file'\n");
00264 #   }
00265 
00266   open(OUTSEQ, ">$aln_file")
00267     or $self->throw("Error opening $aln_file for write");
00268 
00269   # Using append_taxon_id will give nice seqnames_taxonids needed for
00270   # njtree species_tree matching
00271   my %sa_params = ($self->param('use_genomedb_id')) ?   ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00272 
00273   my $sa = $tree->get_SimpleAlign
00274     (
00275      -id_type => 'MEMBER',
00276      %sa_params,
00277     );
00278   $sa->set_displayname_flat(1);
00279 
00280   # Phylip header
00281   print OUTSEQ $sa->no_sequences, " ", $sa->length, "\n";
00282   $self->param('tag_residue_count', $sa->no_sequences * $sa->length);
00283   # Phylip body
00284   my $count = 0;
00285   foreach my $aln_seq ($sa->each_seq) {
00286     print OUTSEQ $aln_seq->display_id, " ";
00287     my $seq = $aln_seq->seq;
00288 
00289     # Here we do a trick for all Ns sequences by changing the first
00290     # nucleotide to an A so that raxml can at least do the tree for
00291     # the rest of the sequences, instead of giving an error
00292     if ($seq =~ /N+/) { $seq =~ s/^N/A/; }
00293 
00294     print OUTSEQ "$seq\n";
00295     $count++;
00296     print STDERR "sequences $count\n" if ($count % 50 == 0);
00297   }
00298   close OUTSEQ;
00299 
00300   my $struct_string = $self->param('nc_tree')->tree->get_tagvalue('ss_cons');
00301   # Allowed Characters are "( ) < > [ ] { } " and "."
00302   $struct_string =~ s/[^\(^\)^\<^\>^\[^\]^\{^\}^\.]/\./g;
00303   my $struct_file = $file_root . ".struct";
00304   if ($struct_string =~ /^\.+$/) {
00305     $struct_file = undef;
00306     # No struct file
00307   } else {
00308     open(STRUCT, ">$struct_file")
00309       or $self->throw("Error opening $struct_file for write");
00310     print STRUCT "$struct_string\n";
00311     close STRUCT;
00312   }
00313   $self->param('input_aln', $aln_file);
00314   $self->param('struct_aln', $struct_file);
00315   return $aln_file;
00316 }
00317 
00318 sub _store_newick_into_nc_tree_tag_string {
00319   my $self = shift;
00320   my $tag = shift;
00321   my $newick_file = shift;
00322 
00323   my $newick = '';
00324   print("load from file $newick_file\n") if($self->debug);
00325   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00326   while(<FH>) {
00327     chomp $_;
00328     $newick .= $_;
00329   }
00330   close(FH);
00331   $newick =~ s/(\d+\.\d{4})\d+/$1/g; # We round up to only 4 digits
00332 
00333   $self->param('nc_tree')->tree->store_tag($tag, $newick);
00334   if (defined($self->param('model'))) {
00335     my $bootstrap_tag = $self->param('model') . "_bootstrap_num";
00336     $self->param('nc_tree')->tree->store_tag($bootstrap_tag, $self->param('bootstrap_num'));
00337   }
00338 }
00339 
00340 
00341 1;