Archive Ensembl HomeArchive Ensembl Home
SecStructModelTree.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::SecStructModelTree
00024 
00025 =head1 SYNOPSIS
00026 
00027 =head1 DESCRIPTION
00028 
00029 =head1 INHERITANCE TREE
00030 
00031   Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00032   +- Bio::EnsEMBL::Hive::Process
00033 
00034 =head1 APPENDIX
00035 
00036 The rest of the documentation details each of the object methods. 
00037 Internal methods are usually preceded with a _
00038 
00039 =cut
00040 
00041 
00042 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::SecStructModelTree;
00043 
00044 use strict;
00045 use Time::HiRes qw(time);                          # Needed *
00046 use Bio::EnsEMBL::Compara::Graph::NewickParser;    # Needed *
00047 use IPC::Open3;
00048 use File::Spec;
00049 use Symbol qw/gensym/;
00050 
00051 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00052 
00053 
00054 =head2 fetch_input
00055 
00056    Title   :   fetch_input
00057    Usage   :   $self->fetch_input
00058    Function:   Fetches input data from the database
00059    Returns :   none
00060    Args    :   none
00061 
00062 =cut
00063 
00064 sub fetch_input {
00065     my $self = shift @_;
00066 
00067     $self->input_job->transient_error(0);
00068     my $nc_tree_id = $self->param('nc_tree_id') or $self->throw("An 'nc_tree_id' is mandatory");  # Better to have a nc_tree instead?
00069     $self->input_job->transient_error(1);
00070 
00071     my $nc_tree = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or die "Could not fetch nc_tree with id=$nc_tree_id\n";
00072     $self->param('nc_tree',$nc_tree);
00073 
00074     if(my $input_aln = $self->_dumpMultipleAlignmentStructToWorkdir($nc_tree) ) {
00075         $self->param('input_aln', $input_aln);
00076     } else {
00077         die "An input_aln is mandatory";
00078     }
00079 }
00080 
00081 sub run {
00082 
00083     my ($self) = @_;
00084 
00085     my $model = $self->param('model') or die "A model is mandatory";
00086     my $nc_tree = $self->param('nc_tree');
00087     my $aln_file = $self->param('input_aln');
00088     my $struct_file = $self->param('struct_aln') or die "An struct_aln is mandatory";
00089     my $bootstrap_num = $self->param('bootstrap_num') or die "A boostrap_num is mandatory";
00090     my $root_id = $nc_tree->node_id;
00091 
00092     my $raxml_tag = $root_id . "." . $self->worker->process_id . ".raxml";
00093 
00094     my $raxml_exe = $self->param('raxml_exe')
00095         or die "'raxml_exe' is an obligatory parameter";
00096 
00097     die "Cannot execute '$raxml_exe'" unless(-x $raxml_exe);
00098 
00099     my $tag = 'ss_IT_' . $model;
00100     if ($self->param('nc_tree')->tree->has_tag($tag)) {
00101         my $eval_tree;
00102         # Checks the tree string can be parsed successfully
00103         eval {
00104             $eval_tree = Bio::EnsEMBL::Compara::Graph::NewickParser::parse_newick_into_tree($self->param('nc_tree')->tree->get_value_for_tag($tag));
00105         };
00106         if (defined($eval_tree) and !$@) {
00107             # The secondary structure RAxML tree for this model has been obtained already and the tree can be parsed successfully.
00108             return;  # We have ended with this model
00109         }
00110     }
00111 
00112     # /software/ensembl/compara/raxml/RAxML-7.2.2/raxmlHPC-SSE3
00113     # -m GTRGAMMA -s nctree_20327.aln -S nctree_20327.struct -A S7D -n nctree_20327.raxml
00114     my $worker_temp_directory = $self->worker_temp_directory;
00115     my $cmd = $raxml_exe;
00116     $cmd .= " -T 2";
00117     $cmd .= " -m GTRGAMMA";
00118     $cmd .= " -s $aln_file";
00119     $cmd .= " -S $struct_file";
00120     $cmd .= " -A $model";
00121     $cmd .= " -n $raxml_tag.$model";
00122     $cmd .= " -N ".$bootstrap_num if (defined $bootstrap_num);
00123 #    $cmd .= " 2> $raxml_err_file";
00124 #     my $error_file = $worker_temp_directory."/RAxML_bestTree.$raxml_tag.$model.err";
00125 #     $cmd .= ">& $error_file";
00126 
00127 
00128     $self->compara_dba->dbc->disconnect_when_inactive(1);
00129     my $starttime = time()*1000;
00130     print STDERR "$cmd\n" if ($self->debug);
00131 #    unless(system("cd $worker_temp_directory; $cmd") == 0) {
00132     # Assuming that if RAxML runs without problems, no stderr output will be generated.
00133     # We are reading STDERR to get if RAxML fails and the error reported.
00134     # If the error is an assertion error. We report, but no error is raised to msg table.
00135     open (NULL, ">", File::Spec->devnull);
00136     my $pid = open3(gensym, ">&NULL", \*PH, "cd $worker_temp_directory; $cmd");
00137     my $err_msg = "";
00138     while (<PH>) {
00139         $err_msg .= $_;
00140     }
00141     if ($err_msg ne "") {
00142         print STDERR "We have a problem running RAxML -- Inspecting error file\n";
00143         if ($err_msg =~ /Assertion(.+)failed/) {
00144             my $assertion_failed = $1;
00145             $self->input_job->incomplete(0);
00146             die "Assertion failed for RAxML: $assertion_failed\n";
00147         } else {
00148             $self->throw("error running raxml\ncd $worker_temp_directory; $cmd\n$err_msg\n");
00149         }
00150     }
00151     my $runtime_msec = int(time()*1000-$starttime);
00152     $self->compara_dba->dbc->disconnect_when_inactive(0);
00153 
00154     my $raxml_output = $self->worker_temp_directory . "RAxML_bestTree.$raxml_tag.$model";
00155     $self->_store_newick_into_protein_tree_tag_string($tag,$raxml_output);
00156     my $model_runtime = "${model}_runtime_msec";
00157     $nc_tree->tree->store_tag($model_runtime,$runtime_msec);
00158 
00159     #Unlink run files
00160     my $temp_regexp = $worker_temp_directory."*$raxml_tag.$model.RUN.*";
00161     system ("rm -f $temp_regexp");
00162     return 1;
00163 }
00164 
00165 sub write_output {
00166     my $self= shift @_;
00167 }
00168 
00169 
00170 ##########################################
00171 #
00172 # internal methods
00173 #
00174 ##########################################
00175 
00176 sub _store_newick_into_protein_tree_tag_string {
00177 
00178   my $self = shift;
00179   my $tag = shift;
00180   my $newick_file = shift;
00181 
00182   my $newick = '';
00183 #  print("load from file $newick_file\n") if($self->debug);
00184   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00185   while(<FH>) {
00186     chomp $_;
00187     $newick .= $_;
00188   }
00189   close(FH);
00190   $newick =~ s/(\d+\.\d{4})\d+/$1/g; # We round up to only 4 digits
00191 
00192   $self->param('nc_tree')->tree->store_tag($tag, $newick);
00193   if (defined($self->param('model'))) {
00194     my $bootstrap_tag = $self->param('model') . "_bootstrap_num";
00195     $self->param('nc_tree')->tree->store_tag($bootstrap_tag, $self->param('bootstrap_num'));
00196   }
00197 }
00198 
00199 sub _dumpMultipleAlignmentStructToWorkdir {
00200     my $self = shift;
00201     my $tree = shift;
00202 
00203     my $leafcount = scalar(@{$tree->get_all_leaves});
00204     if($leafcount<4) {
00205         my $node_id = $tree->node_id;
00206         $self->input_job->incomplete(0);
00207         die ("tree cluster $node_id has <4 proteins - can not build a raxml tree\n");
00208     }
00209 
00210     my $file_root = $self->worker_temp_directory. "nctree_". $tree->node_id;
00211     $file_root    =~ s/\/\//\//g;  # converts any // in path to /
00212 
00213     my $aln_file = $file_root . ".aln";
00214 #   if($self->debug) {
00215 #     printf("dumpMultipleAlignmentStructToWorkdir : %d members\n", $leafcount);
00216 #     print("aln_file = '$aln_file'\n");
00217 #   }
00218 
00219     open(OUTSEQ, ">$aln_file")
00220         or $self->throw("Error opening $aln_file for write");
00221 
00222   # Using append_taxon_id will give nice seqnames_taxonids needed for
00223   # njtree species_tree matching
00224     my %sa_params = ($self->param('use_genomedb_id')) ? ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00225 
00226     my $sa = $tree->get_SimpleAlign
00227         (
00228          -id_type => 'MEMBER',
00229          %sa_params,
00230         );
00231     $sa->set_displayname_flat(1);
00232 
00233   # Phylip header
00234     print OUTSEQ $sa->no_sequences, " ", $sa->length, "\n";
00235     # Phylip body
00236     my $count = 0;
00237     foreach my $aln_seq ($sa->each_seq) {
00238         print OUTSEQ $aln_seq->display_id, "\n";
00239         my $seq = $aln_seq->seq;
00240 
00241     # Here we do a trick for all Ns sequences by changing the first
00242     # nucleotide to an A so that raxml can at least do the tree for
00243     # the rest of the sequences, instead of giving an error
00244         if ($seq =~ /N+/) { $seq =~ s/^N/A/; }
00245 
00246         print OUTSEQ "$seq\n";
00247         $count++;
00248         print STDERR "sequences $count\n" if ($count % 50 == 0);
00249     }
00250     close OUTSEQ;
00251 
00252     my $struct_string = $self->param('nc_tree')->tree->get_tagvalue('ss_cons');
00253     # Allowed Characters are "( ) < > [ ] { } " and "."
00254     $struct_string =~ s/[^\(^\)^\<^\>^\[^\]^\{^\}^\.]/\./g;
00255     my $struct_file = $file_root . ".struct";
00256     if ($struct_string =~ /^\.+$/) {
00257 #        $self->input_job->transient_error(0);
00258         $self->input_job->incomplete(0);
00259         die "struct string is $struct_string\n";
00260     } else {
00261         open(STRUCT, ">$struct_file")
00262             or $self->throw("Error opening $struct_file for write");
00263         print STRUCT "$struct_string\n";
00264         close STRUCT;
00265     }
00266     $self->param('input_aln', $aln_file);
00267     $self->param('struct_aln', $struct_file);
00268     return $aln_file;
00269 }
00270 
00271 1;