Archive Ensembl HomeArchive Ensembl Home
NCFastTrees.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCFastTrees
00024 
00025 =head1 SYNOPSIS
00026 
00027 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00028 my $ncfasttree = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCFastTrees->new
00029   (
00030    -db         => $db,
00031    -input_id   => $input_id,
00032    -analysis   => $analysis
00033   );
00034 $ncfasttree->fetch_input(); #reads from DB
00035 $ncfasttree->run();
00036 $ncfasttree->output();
00037 $ncfasttree->write_output(); #writes to DB
00038 
00039 =head1 DESCRIPTION
00040 
00041 This RunnableDB builds fast phylogenetic trees using RAxML-Light and FastTree2. It is useful in cases where the alignments are too big to build the usual RAxML trees in PrepareSecStructModels and SecStructModelTree.
00042 
00043 =head1 INHERITANCE TREE
00044 
00045   Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable
00046   +- Bio::EnsEMBL::Hive::Process
00047 
00048 =head1 APPENDIX
00049 
00050 The rest of the documentation details each of the object methods.
00051 Internal methods are usually preceded with a _
00052 
00053 =cut
00054 
00055 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCFastTrees;
00056 
00057 use strict;
00058 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00059 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00060 
00061 =head2 fetch_input
00062 
00063     Title    : fetch_input
00064     Usage    : $self->fetch_input
00065     Function : Fetches input data from the database+
00066     Returns  : none
00067     Args     : none
00068 
00069 =cut
00070 
00071 sub fetch_input {
00072     my ($self) = @_;
00073 
00074     $self->input_job->transient_error(0);
00075     my $nc_tree_id = $self->param('nc_tree_id') || die "'nc_tree_id' is an obligatory parameter\n";
00076     $self->input_job->transient_error(1);
00077 
00078     my $nc_tree = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or $self->throw("Couldn't fetch nc_tree with id $nc_tree_id\n");
00079     $self->param('nc_tree', $nc_tree);
00080 
00081     if (my $alignment_id = $self->param('alignment_id')) {
00082         $self->_load_and_dump_alignment();
00083         # $self->param('aln_fasta') and/or $self->param('aln_file') are now set
00084 #        $self->param('input_aln', $self->_load_and_dump_alignment());
00085         return;
00086     }
00087     if (my $input_aln = $self->_dumpMultipleAlignmentStructToWorkdir($nc_tree) ) {
00088         $self->param('input_aln', $input_aln);
00089     } else {
00090         die "I can't write input alignment to disc";
00091     }
00092 }
00093 
00094 =head2 run
00095 
00096     Title     : run
00097     Usage     : $self->run
00098     Function  : runs something
00099     Returns   : none
00100     Args      : none
00101 
00102 =cut
00103 
00104 sub run {
00105     my ($self) = @_;
00106 
00107     $self->_run_fasttree;
00108     $self->_run_parsimonator;
00109     $self->_run_raxml_light;
00110 }
00111 
00112 =head2 write_output
00113 
00114     Title     : write_output
00115     Usage     : $self->write_output
00116     Function  : stores something
00117     Returns   : none
00118     Args      : none
00119 
00120 =cut
00121 
00122 sub write_output {
00123     my ($self) = @_;
00124 
00125 }
00126 
00127 
00128 ##########################################
00129 #
00130 # internal methods
00131 #
00132 ##########################################
00133 
00134 sub _run_fasttree {
00135     my $self = shift;
00136     my $aln_file;
00137     if (defined ($self->param('aln_fasta'))) {
00138         $aln_file = $self->param('aln_fasta');
00139     } else {
00140         $aln_file = $self->param('input_aln');
00141     }
00142 #    my $aln_file = $self->param('input_aln');
00143     return unless (defined($aln_file));
00144 
00145     my $root_id = $self->param('nc_tree')->node_id;
00146     my $fasttree_tag = $root_id . ".". $self->worker->process_id . ".fasttree";
00147 
00148     my $fasttree_exe = $self->param('fasttree_exe')
00149         or die "'fasttree_exe' is an obligatory parameter";
00150 
00151     die "Cannot execute '$fasttree_exe'" unless(-x $fasttree_exe);
00152 
00153     my $fasttree_output = $self->worker_temp_directory . "FastTree.$fasttree_tag";
00154     my $tag = defined $self->param('fastTreeTag') ? $self->param('fastTreeTag') : 'ft_IT_nj';
00155 #    my $tag = 'ft_IT_nj';
00156     my $cmd = $fasttree_exe;
00157     $cmd .= " -nt -quiet -nopr";
00158     $cmd .= " $aln_file";
00159     $cmd .= " > $fasttree_output";
00160 
00161     print STDERR "$cmd\n" if ($self->debug);
00162     $self->compara_dba->dbc->disconnect_when_inactive(1);
00163     unless(system("$cmd") == 0) {
00164         $self->throw("error running FastTree\n$cmd\n");
00165     }
00166     $self->compara_dba->dbc->disconnect_when_inactive(0);
00167 
00168     $self->_store_newick_into_nc_tree_tag_string($tag, $fasttree_output);
00169 
00170     return 1;
00171 }
00172 
00173 sub _run_parsimonator {
00174     my ($self) = @_;
00175     my $aln_file = $self->param('input_aln');
00176     my $worker_temp_directory = $self->worker_temp_directory;
00177     die "$aln_file is not defined" unless (defined($aln_file));
00178 #    return unless(defined($aln_file));
00179 
00180     my $root_id = $self->param('nc_tree')->node_id;
00181     my $parsimonator_tag = $root_id . "." . $self->worker->process_id . ".parsimonator";
00182 
00183     my $parsimonator_exe = $self->param('parsimonator_exe')
00184         or die "'parsimonator_exe' is an obligatory parameter";
00185 
00186     die "Cannot execute '$parsimonator_exe'" unless(-x $parsimonator_exe);
00187 
00188     my $cmd = $parsimonator_exe;
00189     $cmd .= " -s $aln_file";
00190     $cmd .= " -n $parsimonator_tag";
00191     $cmd .= " -p 12345";
00192 
00193     print STDERR "$cmd\n" if ($self->debug);
00194     $self->compara_dba->dbc->disconnect_when_inactive(1);
00195     unless(system("cd $worker_temp_directory; $cmd") == 0) {
00196         $self->throw("error running parsimonator\ncd $worker_temp_directory; $cmd\n");
00197     }
00198     $self->compara_dba->dbc->disconnect_when_inactive(0);
00199 
00200     my $parsimonator_output = $worker_temp_directory . "/RAxML_parsimonyTree.${parsimonator_tag}.0";
00201     $self->param('parsimony_tree_file', $parsimonator_output);
00202 
00203     return;
00204 }
00205 
00206 sub _run_raxml_light {
00207     my ($self) = @_;
00208     my $aln_file = $self->param('input_aln');
00209     my $parsimony_tree = $self->param('parsimony_tree_file');
00210     my $worker_temp_directory = $self->worker_temp_directory;
00211     my $root_id = $self->param('nc_tree')->node_id;
00212 
00213     my $raxmlight_tag = $root_id . "." . $self->worker->process_id . ".raxmlight";
00214 
00215     my $raxmlLight_exe = $self->param('raxmlLight_exe')
00216         or die "'raxmlLight_exe' is an obligatory parameter";
00217 
00218     die "Cannot execute '$raxmlLight_exe'" unless(-x $raxmlLight_exe);
00219 
00220     my $tag = defined $self->param('raxmlLightTag') ? $self->param('raxmlLightTag') : 'ft_IT_ml';
00221 #    my $tag = 'ft_IT_ml';
00222     my $cmd = $raxmlLight_exe;
00223     $cmd .= " -m GTRGAMMA";
00224     $cmd .= " -s $aln_file";
00225     $cmd .= " -t $parsimony_tree";
00226     $cmd .= " -n $raxmlight_tag";
00227 
00228     $self->compara_dba->dbc->disconnect_when_inactive(1);
00229     unless(system("cd $worker_temp_directory; $cmd") == 0) {
00230         $self->throw("error running raxmlLight\ncd $worker_temp_directory; $cmd\n");
00231     }
00232     $self->compara_dba->dbc->disconnect_when_inactive(0);
00233 
00234     my $raxmlight_output = $worker_temp_directory . "/RAxML_result.${raxmlight_tag}";
00235     $self->_store_newick_into_nc_tree_tag_string($tag, $raxmlight_output);
00236 
00237     # Unlink run files
00238     my $temp_regexp = $self->worker_temp_directory;
00239     unlink <*$raxmlight_tag*>;
00240 
00241     return
00242 }
00243 
00244 sub _dumpMultipleAlignmentStructToWorkdir {
00245     my ($self, $tree) = @_;
00246 
00247   my $root_id = $tree->node_id;
00248   my $leafcount = scalar(@{$tree->get_all_leaves});
00249   if($leafcount<4) {
00250       $self->input_job->incomplete(0);
00251       $self->throw("tree cluster $root_id has <4 proteins - can not build a raxml tree\n");
00252   }
00253 
00254   my $file_root = $self->worker_temp_directory. "nctree_". $root_id;
00255   $file_root    =~ s/\/\//\//g;  # converts any // in path to /
00256 
00257   my $aln_file = $file_root . ".aln";
00258 #   if($self->debug) {
00259 #     printf("dumpMultipleAlignmentStructToWorkdir : %d members\n", $leafcount);
00260 #     print("aln_file = '$aln_file'\n");
00261 #   }
00262 
00263   open(OUTSEQ, ">$aln_file")
00264     or $self->throw("Error opening $aln_file for write");
00265 
00266   # Using append_taxon_id will give nice seqnames_taxonids needed for
00267   # njtree species_tree matching
00268   my %sa_params = ($self->param('use_genomedb_id')) ?   ('-APPEND_GENOMEDB_ID', 1) : ('-APPEND_TAXON_ID', 1);
00269 
00270   my $sa = $tree->get_SimpleAlign
00271     (
00272      -id_type => 'MEMBER',
00273      %sa_params,
00274     );
00275   $sa->set_displayname_flat(1);
00276 
00277     # Aln in fasta format (if needed)
00278     if ($sa->length() >= 5000) {
00279         # For FastTree it is better to give the alignment in fasta format
00280         my $aln_fasta = $file_root . ".fa";
00281         open my $aln_fasta_fh, ">" , $aln_fasta or $self->throw("Error opening $aln_fasta for writing");
00282         for my $aln_seq ($sa->each_seq) {
00283             my $header = $aln_seq->display_id;
00284             my $seq = $aln_seq->seq;
00285             print $aln_fasta_fh ">$header\n$seq\n";
00286         }
00287         close($aln_fasta_fh);
00288         $self->param('aln_fasta',$aln_fasta);
00289     }
00290 
00291 
00292   # Phylip header
00293   print OUTSEQ $sa->no_sequences, " ", $sa->length, "\n";
00294 
00295   $self->param('tag_residue_count', $sa->no_sequences * $sa->length);
00296   # Phylip body
00297   my $count = 0;
00298   foreach my $aln_seq ($sa->each_seq) {
00299     print OUTSEQ $aln_seq->display_id, " ";
00300     my $seq = $aln_seq->seq;
00301 
00302     # Here we do a trick for all Ns sequences by changing the first
00303     # nucleotide to an A so that raxml can at least do the tree for
00304     # the rest of the sequences, instead of giving an error
00305     if ($seq =~ /N+/) { $seq =~ s/^N/A/; }
00306 
00307     print OUTSEQ "$seq\n";
00308     $count++;
00309     print STDERR "sequences $count\n" if ($count % 50 == 0);
00310   }
00311   close OUTSEQ;
00312 
00313   return $aln_file;
00314 }
00315 
00316 sub _store_newick_into_nc_tree_tag_string {
00317   my $self = shift;
00318   my $tag = shift;
00319   my $newick_file = shift;
00320 
00321   my $newick = '';
00322   print("load from file $newick_file\n") if($self->debug);
00323   open (FH, $newick_file) or $self->throw("Couldnt open newick file [$newick_file]");
00324   while(<FH>) {
00325     chomp $_;
00326     $newick .= $_;
00327   }
00328   close(FH);
00329   $newick =~ s/(\d+\.\d{4})\d+/$1/g; # We round up to only 4 digits
00330 
00331   $self->param('nc_tree')->tree->store_tag($tag, $newick);
00332   if (defined($self->param('model'))) {
00333     my $bootstrap_tag = $self->param('model') . "_bootstrap_num";
00334     $self->param('nc_tree')->tree->store_tag($bootstrap_tag, $self->param('bootstrap_num'));
00335   }
00336 }
00337 
00338 sub _load_and_dump_alignment {
00339     my ($self) = @_;
00340 
00341     my $root_id = $self->param('nc_tree_id');
00342     my $alignment_id = $self->param('alignment_id');
00343     my $file_root = $self->worker_temp_directory. "nctree_" . $root_id;
00344     my $aln_file = $file_root . ".aln";
00345     open my $outaln, ">", "$aln_file" or $self->throw("Error opening $aln_file for writing");
00346 
00347 #    my $sql_load_alignment = "SELECT member_id, aligned_sequence FROM aligned_sequence WHERE alignment_id = $alignment_id";
00348     my $sql_load_alignment = "SELECT member_id, aligned_sequence FROM aligned_sequence WHERE alignment_id = ?";
00349     my $sth_load_alignment = $self->dbc->prepare($sql_load_alignment);
00350     print STDERR "SELECT member_id, aligned_sequence FROM aligned_sequence WHERE alignment_id = $alignment_id\n" if ($self->debug);
00351     $sth_load_alignment->execute($alignment_id);
00352     my $all_aln_seq_hashref = $sth_load_alignment->fetchall_arrayref({});
00353 
00354     my $seqLen = length($all_aln_seq_hashref->[0]->{aligned_sequence});
00355     if ($seqLen >= 5000) {
00356         # It is better to feed FastTree with aln in fasta format
00357         my $aln_fasta = $file_root . ".fa";
00358         open my $aln_fasta_fh, ">", $aln_fasta or $self->throw("Error opening $aln_fasta for writing");
00359         for my $row_hashref (@$all_aln_seq_hashref) {
00360             my $mem_id = $row_hashref->{member_id};
00361             my $member = $self->compara_dba->get_NCTreeAdaptor->fetch_AlignedMember_by_member_id_root_id($mem_id);
00362             my $taxid = $member->taxon_id();
00363             my $aln_seq = $row_hashref->{aligned_sequence};
00364             print $aln_fasta_fh ">" . $mem_id . "_" . $taxid . "\n";
00365             print $aln_fasta_fh $aln_seq . "\n";
00366         }
00367         close ($aln_fasta_fh);
00368         $self->param('aln_fasta', $aln_fasta);
00369     }
00370 
00371     print $outaln scalar(@$all_aln_seq_hashref), " ", $seqLen, "\n";
00372     for my $row_hashref (@$all_aln_seq_hashref) {
00373         my $mem_id = $row_hashref->{member_id};
00374         my $member = $self->compara_dba->get_NCTreeAdaptor->fetch_AlignedMember_by_member_id_root_id($mem_id);
00375         my $taxid = $member->taxon_id();
00376         my $aln_seq = $row_hashref->{aligned_sequence};
00377         print STDERR "$mem_id\t" if ($self->debug);
00378         print STDERR substr($aln_seq, 0, 60), "...\n" if ($self->debug);
00379         $aln_seq =~ s/^N/A/;  # To avoid RAxML failure
00380         print $outaln $mem_id, "_", $taxid, " ", $aln_seq, "\n";
00381     }
00382     close($outaln);
00383 
00384     $self->param('input_aln', $aln_file);
00385     return;
00386 }
00387 
00388 
00389 1;
00390