Archive Ensembl HomeArchive Ensembl Home
NCGenomicAlignment.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::NCGenomicAlignment;
00002 
00003 use strict;
00004 use warnings;
00005 use Data::Dumper;
00006 use Time::HiRes qw /time/;
00007 use Bio::EnsEMBL::Compara::Graph::NewickParser;
00008 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
00009 
00010 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00011 
00012 # We should receive:
00013 # nc_tree_id
00014 sub fetch_input {
00015     my ($self) = @_;
00016     my $nc_tree_id = $self->param('nc_tree_id');
00017     my $nc_tree = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id);
00018     $self->throw("tree with id $nc_tree_id is undefined") unless (defined $nc_tree);
00019     $self->param('nc_tree', $nc_tree);
00020     $self->param('input_fasta', $self->dump_sequences_to_workdir($nc_tree));
00021 
00022     # Autovivification
00023     $self->param("method_treefile", {});
00024 }
00025 
00026 sub run {
00027     my ($self) = @_;
00028     my $nc_tree_id = $self->param('nc_tree_id');
00029     if ($self->param('single_peptide_tree')) {
00030         $self->input_job->incomplete(0);
00031         die "single peptide tree\n";
00032     }
00033 
00034     if ($self->param('tag_gene_count') > 1000) { ## Too much
00035         $self->input_job->incomplete(0);
00036         my $tag_gene_count = $self->param('tag_gene_count');
00037         die "family $nc_tree_id has too many member ($tag_gene_count). No genomic alignments will be computed\n";
00038     }
00039 
00040     if ($self->param('tag_residue_count') > 150000) {  ## Likely to take too long
00041         $self->run_mafft;
00042         # We put the alignment into the db
00043         $self->store_fasta_alignment('mafft_output');
00044 
00045         $self->dataflow_output_id (
00046                                    {
00047                                     'nc_tree_id' => $self->param('nc_tree_id'),
00048                                     'fastTreeTag' => "ftga_IT_nj",
00049                                     'raxmlLightTag' => "ftga_IT_ml",
00050                                     'alignment_id' => $self->param('alignment_id'),
00051                                    },3
00052                                   );
00053 
00054         $self->input_job->incomplete(0);
00055         my $tag_residue_count = $self->param('tag_residue_count');
00056         die "Family too big for normal branch ($tag_residue_count bps) -- Only FastTrees will be generated\n";
00057     }
00058     if (($self->param('tag_residue_count') > 40000) && $self->param('inhugemem') != 1) { ## Big family -- queue in hugemem
00059         $self->dataflow_output_id (
00060                                    {
00061                                     'nc_tree_id' => $self->param('nc_tree_id'),
00062                                     'alignment_id' => $self->param('alignment_id'),
00063                                     'inhugemem' => 1,
00064                                    }, -1
00065                                   );
00066         # Should we die here? Nothing more to do in the Runnable?
00067         my $tag_residue_count = $self->param('tag_residue_count');
00068         $self->input_job->incomplete(0);
00069         die "Re-scheduled in hugemem queue ($tag_residue_count bps)\n";
00070     } else {
00071         $self->run_mafft;
00072         $self->fasta2phylip;
00073         ## FIXME -- RAxML will fail for families with < 4 members.
00074         $self->run_RAxML;
00075         $self->run_prank;
00076 #    $self->run_ncgenomic_tree('phyml');
00077 #    $self->run_ncgenomic_tree('nj'); # Useful for 3-membered trees
00078     }
00079 }
00080 
00081 sub write_output {
00082     my ($self) = @_;
00083 #     if ($self->param("MEMLIMIT")) { ## We had a problem in RAxML -- re-schedule in hugemem
00084 #         $self->dataflow_output_id (
00085 #                                    {
00086 #                                     'nc_tree_id' => $self->param('nc_tree_id')
00087 #                                    }, -1
00088 #                                   );
00089 #     } else {
00090         for my $method (qw/phyml nj/) {
00091             $self->dataflow_output_id (
00092                                        {
00093                                         'nc_tree_id'   => $self->param('nc_tree_id'),
00094                                         'method'       => $method,
00095                                         'alignment_id' => $self->param('alignment_id'),
00096                                        }, 2
00097                                       );
00098         }
00099 #    }
00100 }
00101 
00102 sub dump_sequences_to_workdir {
00103     my ($self,$cluster) = @_;
00104     my $fastafile = $self->worker_temp_directory . "cluster_" . $cluster->node_id . ".fasta";
00105 
00106     my $member_list = $cluster->get_all_leaves;
00107     $self->param('tag_gene_count', scalar (@{$member_list}) );
00108 
00109     open (OUTSEQ, ">$fastafile") or $self->throw("Error opening $fastafile for writing: $!");
00110     if (scalar @{$member_list} < 2) {
00111         $self->param('single_peptide_tree', 1);
00112         return 1;
00113     }
00114 
00115     my $residues = 0;
00116     my $count = 0;
00117     foreach my $member (@{$member_list}) {
00118         my $gene_member = $member->gene_member;
00119         $self->throw("Error fetching gene member") unless (defined $gene_member) ;
00120         my $gene = $gene_member -> get_Gene;
00121         $self->throw("Error fetching gene") unless (defined $gene);
00122         my $slice = $gene->slice->adaptor->fetch_by_Feature($gene, '500%');
00123         $self->throw("Error fetching slice") unless (defined $slice);
00124         my $seq = $slice->seq;
00125         # fetch_by_Feature returns always the + strand
00126         if ($gene->strand() < 0) {
00127             reverse_comp(\$seq);
00128         }
00129         $residues += length($seq);
00130         $seq =~ s/(.{72})/$1\n/g;
00131         chomp $seq;
00132         $count++;
00133         print STDERR $member->stable_id. "\n" if ($self->debug);
00134         print OUTSEQ ">" . $member->member_id . "\n$seq\n";
00135         print STDERR "sequences $count\n" if ($count % 50 == 0);
00136     }
00137     close OUTSEQ;
00138 
00139     if(scalar (@{$member_list}) <= 1) {
00140         $self->update_single_peptide_tree($cluster);
00141         $self->param('single_peptide_tree', 1);
00142     }
00143 
00144     $self->param('tag_residue_count', $residues);
00145 
00146     return $fastafile;
00147 }
00148 
00149 sub update_single_peptide_tree {
00150     my ($self, $tree) = @_;
00151 
00152     foreach my $member (@{$tree->get_all_leaves}) {
00153         next unless($member->isa('Bio::EnsEMBL::Compara::GeneTreeMember'));
00154         next unless($member->sequence);
00155         $member->cigar_line(length($member->sequence)."M");
00156         $self->compara_dba->get_NCTreeAdaptor->update_node($member);
00157         printf("single_pepide_tree %s : %s\n", $member->stable_id, $member->cigar_line) if($self->debug);
00158     }
00159 }
00160 
00161 sub run_mafft {
00162     my ($self) = @_;
00163 
00164 #    return if ($self->param('too_few_sequences') == 1); # return? die? $self->throw?
00165     my $nc_tree_id = $self->param('nc_tree_id');
00166     my $input_fasta = $self->param('input_fasta');
00167     my $mafft_output = $self->worker_temp_directory . "/mafft_".$nc_tree_id . ".msa";
00168     $self->param('mafft_output',$mafft_output);
00169 
00170     my $mafft_exe      = $self->param('mafft_exe')
00171         or die "'mafft_exe' is an obligatory parameter";
00172 
00173     die "Cannot execute '$mafft_exe'" unless(-x $mafft_exe);
00174 
00175     my $mafft_binaries = $self->param('mafft_binaries')
00176         or die "'mafft_binaries' is an obligatory parameter";
00177 
00178     $ENV{MAFFT_BINARIES} = $mafft_binaries;
00179 
00180     my $cmd = "$mafft_exe --auto $input_fasta > $mafft_output";
00181     print STDERR "Running mafft\n$cmd\n" if ($self->debug);
00182     print STDERR "mafft_output has been set to " . $self->param('mafft_output') . "\n" if ($self->debug);
00183     $self->compara_dba->dbc->disconnect_when_inactive(0);
00184     unless ((my $err = system($cmd)) == 0) {
00185         $self->throw("problem running command $cmd: $err\n");
00186     }
00187     $self->compara_dba->dbc->disconnect_when_inactive(1);
00188 }
00189 
00190 sub run_RAxML {
00191     my ($self) = @_;
00192 
00193 #    return if ($self->param('too_few_sequences') == 1);  # return? die? $self->throw? This has been checked before
00194     my $nc_tree_id = $self->param('nc_tree_id');
00195     my $aln_file = $self->param('phylip_output');
00196     return unless (defined $aln_file);
00197 
00198     my $raxml_outdir = $self->worker_temp_directory;
00199     my $raxml_outfile = "raxml_${nc_tree_id}.raxml";
00200     my $raxml_output = $raxml_outdir."/$raxml_outfile";
00201 
00202     $self->param('raxml_output',"$raxml_outdir/RAxML_bestTree.$raxml_outfile");
00203 
00204     my $raxml_exe = $self->param('raxml_exe')
00205         or die "'raxml_exe' is an obligatory parameter";
00206 
00207     die "Cannot execute '$raxml_exe'" unless(-x $raxml_exe);
00208 
00209     my $bootstrap_num = 10;  ## Should be soft-coded?
00210     my $raxml_err_file = $self->worker_temp_directory . "raxml.err";
00211     my $cmd = $raxml_exe;
00212     $cmd .= " -T 2";
00213     $cmd .= " -m GTRGAMMA";
00214     $cmd .= " -s $aln_file";
00215     $cmd .= " -N $bootstrap_num";
00216     $cmd .= " -n $raxml_outfile";
00217     $cmd .= " 2> $raxml_err_file";
00218     $self->compara_dba->dbc->disconnect_when_inactive(1);
00219 #    my $bootstrap_starttime = time() * 1000;
00220 
00221     print STDERR "$cmd\n" if ($self->debug);
00222     unless (system("cd $raxml_outdir; $cmd") == 0) {
00223         # memory problem?
00224         print STDERR "We have a problem running RAxML -- Inspecting $raxml_err_file\n";
00225         open my $raxml_err_fh, "<", $raxml_err_file or die $!;
00226         while (<$raxml_err_fh>) {
00227             chomp;
00228             if (/malloc_aligned/) {
00229                 $self->dataflow_output_id (
00230                                            {
00231                                             'nc_tree_id' => $self->param('nc_tree_id'),
00232                                            }, -1
00233                                           );
00234                 $self->input_job->incomplete(0);
00235                 die "RAXML ERROR: $_";
00236             }
00237         }
00238         close($raxml_err_fh);
00239     }
00240 
00241     $self->compara_dba->dbc->disconnect_when_inactive(0);
00242 #    my $boostrap_msec = int(time() * 1000-$bootstrap_starttime);
00243 #    $self->_get_bootstraps($bootstrap_msec,$bootstrap_num);  # Don't needed -- we don't run the second RAxML for now
00244     return
00245 }
00246 
00247 sub _get_bootstraps {
00248     my ($self,$bootstrap_msec, $bootstrap_num) = @_;
00249 
00250     my $ideal_msec = 30000; # 5 minutes
00251     my $time_per_sample = $bootstrap_msec / $bootstrap_num;
00252     my $ideal_bootstrap_num = $ideal_msec / $time_per_sample;
00253     if ($ideal_bootstrap_num < 5) {
00254         $self->param('bootstrap_num',1);
00255     } elsif ($ideal_bootstrap_num < 10) {
00256         $self->param('bootstrap_num',10);
00257     } elsif ($ideal_bootstrap_num > 100) {
00258         $self->param('bootstrap_num',100)
00259     } else {
00260         $self->param('bootstrap_num',int ($ideal_bootstrap_num));
00261     }
00262     return
00263 }
00264 
00265 sub run_prank {
00266     my ($self) = @_;
00267 
00268 #    return if ($self->param('too_few_sequences') == 1);  # return? die? $self->throw? This has been checked before
00269     my $nc_tree_id = $self->param('nc_tree_id');
00270     my $input_fasta = $self->param('input_fasta');
00271     my $tree_file = $self->param('raxml_output');
00272 #    return unless (defined $tree_file);
00273     $self->throw("$tree_file does not exist\n") unless (-e $tree_file);
00274 
00275     ## FIXME -- The alignment has to be passed to NCGenomicTree. We have several options:
00276     # 1.- Store the alignments in the database
00277     # 2.- Store the alignments in a shared filesystem (i.e. lustre)
00278     # 3.- Pass it in memory as a string.
00279     # For now, we will be using #1
00280     my $prank_output = $self->worker_temp_directory . "/prank_${nc_tree_id}.prank";
00281 #    my $prank_output = "/lustre/scratch103/ensembl/mp12/ncRNA_pipeline/prank_${nc_tree_id}.prank";
00282 
00283     my $prank_exe = $self->param('prank_exe')
00284         or die "'prank_exe' is an obligatory parameter";
00285 
00286     die "Cannot execute '$prank_exe'" unless(-x $prank_exe);
00287 
00288     my $cmd = $prank_exe;
00289     # /software/ensembl/compara/prank/090707/src/prank -noxml -notree -f=Fasta -o=/tmp/worker.904/cluster_17438.mfa -d=/tmp/worker.904/cluster_17438.fast -t=/tmp/worker.904/cluster17438/RAxML.tree
00290     $cmd .= " -noxml -notree -once -f=Fasta";
00291     $cmd .= " -t=$tree_file";
00292     $cmd .= " -o=$prank_output";
00293     $cmd .= " -d=$input_fasta";
00294     $self->compara_dba->dbc->disconnect_when_inactive(1);
00295     print("$cmd\n") if($self->debug);
00296     unless ((my $err = system ($cmd)) == 0) {
00297         $self->throw("problem running prank $cmd: $err\n");
00298     }
00299 
00300     # prank renames the output by adding ".2.fas" => .1.fas" because it doesn't need to make the tree
00301     print STDERR "Prank output : ${prank_output}.1.fas\n" if ($self->debug);
00302     $self->param('prank_output',"${prank_output}.1.fas");
00303     $self->store_fasta_alignment("prank_output");
00304 }
00305 
00306 sub fasta2phylip {
00307     my ($self) = @_;
00308 #    return 1 if ($self->param('too_few_sequences') == 1); # This has been checked before
00309     my $fasta_in = $self->param('mafft_output');
00310     my $nc_tree_id = $self->param('nc_tree_id');
00311     my $phylip_out = $self->worker_temp_directory . "/mafft_${nc_tree_id}.phylip";
00312     my %seqs;
00313     open my $msa, "<", $fasta_in or $self->throw("I can not open the prank msa file $fasta_in : $!\n");
00314     my ($header,$seq);
00315     while (<$msa>) {
00316         chomp;
00317         if (/^>/) {
00318             $seqs{$header} = $seq if (defined $header);
00319             $header = substr($_,1);
00320             $seq = "";
00321             next;
00322         }
00323         $seq .= $_;
00324     }
00325     $seqs{$header} = $seq;
00326 
00327     close($msa);
00328 
00329     my $nseqs = scalar(keys %seqs);
00330     my $length = length($seqs{$header});
00331 
00332     if ($nseqs < 4) {
00333 #        $self->param('too_few_sequences',1);
00334         $self->input_job->incomplete(0);
00335         die "Too few sequences (< 4), we can not compute RAxML tree";
00336     }
00337 
00338     open my $phy, ">", $phylip_out or $self->throw("I can not open the phylip output file $phylip_out : $!\n");
00339     print $phy "$nseqs $length\n";
00340     for my $h (keys %seqs) {
00341         printf $phy ("%-9.9s ",$h);
00342         $seqs{$h} =~ s/^-/A/;
00343         print $phy $seqs{$h}, "\n";
00344     }
00345     close($phy);
00346     $self->param('phylip_output',$phylip_out);
00347 }
00348 
00349 sub store_fasta_alignment {
00350     my ($self, $param) = @_;
00351 
00352     my $nc_tree_id = $self->param('nc_tree_id');
00353     my $uniq_alignment_id = "$param" . "_" . $self->input_job->dbID ;
00354     my $aln_file = $self->param($param);
00355 
00356     # Insert a new alignment in the DB
00357     my $sql_new_alignment = "INSERT IGNORE INTO alignment (alignment_id, compara_table, compara_key) VALUES (?, 'ncrna', ?)";
00358     print STDERR "$sql_new_alignment\n" if ($self->debug);
00359     my $sth_new_alignment = $self->dbc->prepare($sql_new_alignment);
00360     $sth_new_alignment->execute($uniq_alignment_id, $nc_tree_id);
00361     $sth_new_alignment->finish();
00362 
00363     # read the alignment back from file
00364     print "Reading alignment fasta file $aln_file\n" if ($self->debug());
00365     open my $aln_fh, "<", $aln_file or die "I can't open $aln_file for reading\n";
00366     my $aln_header;
00367     my $aln_seq;
00368     my $sql_new_alnseq = "INSERT INTO aligned_sequence (alignment_id, aligned_length, member_id, aligned_sequence) VALUES (?,?,?,?)";
00369     my $sth_new_alnseq = $self->dbc->prepare($sql_new_alnseq);
00370     while (<$aln_fh>) {
00371         chomp;
00372         if (/^>/) {
00373             if (! defined ($aln_header)) {
00374                 ($aln_header) = $_ =~ />(.+)/;
00375                 next;
00376             }
00377             my $l = length($aln_seq);
00378             print STDERR "INSERT INTO aligned_sequence (alignment_id, aligned_length, member_id, aligned_sequence) VALUES ($uniq_alignment_id, $l, $aln_header, $aln_seq)\n";
00379             $sth_new_alnseq->execute($uniq_alignment_id, $l, $aln_header, $aln_seq);
00380             ($aln_header) = $_ =~ />(.+)/;
00381             $aln_seq = "";
00382         } else {
00383             $aln_seq .= $_;
00384         }
00385     }
00386     my $l = length($aln_seq);
00387     $sth_new_alnseq->execute($uniq_alignment_id, $l, $aln_header, $aln_seq);
00388     $sth_new_alnseq->finish();
00389 
00390     $self->param('alignment_id', $uniq_alignment_id);
00391     return;
00392 }
00393 
00394 1;