Archive Ensembl HomeArchive Ensembl Home
Infernal.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Infernal
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db           = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $infernal = Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Infernal->new
00018   (
00019    -db         => $db,
00020    -input_id   => $input_id,
00021    -analysis   => $analysis
00022   );
00023 $infernal->fetch_input(); #reads from DB
00024 $infernal->run();
00025 $infernal->output();
00026 $infernal->write_output(); #writes to DB
00027 
00028 =cut
00029 
00030 
00031 =head1 DESCRIPTION
00032 
00033 This Analysis will take the sequences from a cluster, the cm from
00034 nc_profile and run a profiled alignment, storing the results as
00035 cigar_lines for each sequence.
00036 
00037 =cut
00038 
00039 
00040 =head1 CONTACT
00041 
00042   Contact Albert Vilella on module implementation/design detail: avilella@ebi.ac.uk
00043   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00044 
00045 =cut
00046 
00047 
00048 =head1 APPENDIX
00049 
00050 The rest of the documentation details each of the object methods. 
00051 Internal methods are usually preceded with a _
00052 
00053 =cut
00054 
00055 
00056 package Bio::EnsEMBL::Compara::RunnableDB::ncRNAtrees::Infernal;
00057 
00058 use strict;
00059 use Time::HiRes qw(time gettimeofday tv_interval);
00060 use Data::Dumper;
00061 
00062 use Bio::AlignIO;
00063 use Bio::EnsEMBL::BaseAlignFeature;
00064 use Bio::EnsEMBL::Compara::Member;
00065 
00066 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00067 
00068 
00069 sub param_defaults {
00070     return {
00071             'method'      => 'Infernal',
00072            };
00073 }
00074 
00075 
00076 =head2 fetch_input
00077 
00078     Title   :   fetch_input
00079     Usage   :   $self->fetch_input
00080     Function:   Fetches input data for repeatmasker from the database
00081     Returns :   none
00082     Args    :   none
00083 
00084 =cut
00085 
00086 
00087 sub fetch_input {
00088     my $self = shift @_;
00089 
00090     $self->input_job->transient_error(0);
00091     my $nc_tree_id = $self->param('nc_tree_id') || die "'nc_tree_id' is an obligatory numeric parameter\n";
00092     $self->input_job->transient_error(1);
00093 
00094     my $nc_tree    = $self->compara_dba->get_NCTreeAdaptor->fetch_node_by_node_id($nc_tree_id) or die "Could not fetch nc_tree with id=$nc_tree_id\n";
00095 
00096 #     my $n_nodes = $nc_tree->get_tagvalue('gene_count');
00097 #     if ($n_nodes == 1) {
00098 #         die "Only one member in tree $nc_tree_id";
00099 #     }
00100 
00101     $self->param('nc_tree', $nc_tree);
00102 
00103     $self->param('model_id_hash', {});
00104 
00105     $self->param('input_fasta', $self->dump_sequences_to_workdir($nc_tree));
00106 
00107     print STDERR Dumper $self->param('model_id_hash') if ($self->debug);
00108 
00109     $self->param('infernal_starttime', time()*1000);
00110 }
00111 
00112 
00113 =head2 run
00114 
00115     Title   :   run
00116     Usage   :   $self->run
00117     Function:   runs hmmbuild
00118     Returns :   none
00119     Args    :   none
00120 
00121 =cut
00122 
00123 sub run {
00124     my $self = shift @_;
00125 
00126     # It would be better to die!? We can't do nothing with single peptide trees.
00127     # Hopefully single peptide trees doesn't reach this runnable
00128     return if ($self->param('single_peptide_tree'));
00129     $self->run_infernal;
00130 }
00131 
00132 
00133 =head2 write_output
00134 
00135     Title   :   write_output
00136     Usage   :   $self->write_output
00137     Function:   stores nctree
00138     Returns :   none
00139     Args    :   none
00140 
00141 =cut
00142 
00143 
00144 sub write_output {
00145     my $self = shift @_;
00146 
00147     $self->parse_and_store_alignment_into_tree;
00148     $self->_store_aln_tags;
00149 }
00150 
00151 
00152 ##########################################
00153 #
00154 # internal methods
00155 #
00156 ##########################################
00157 
00158 1;
00159 
00160 sub dump_sequences_to_workdir {
00161   my $self = shift;
00162   my $cluster = shift;
00163 
00164   print STDERR Dumper $cluster if ($self->debug);
00165 
00166   my $fastafile = $self->worker_temp_directory . "cluster_" . $cluster->node_id . ".fasta";
00167   print("fastafile = '$fastafile'\n") if($self->debug);
00168 
00169   my $seq_id_hash;
00170   my $residues = 0;
00171   print "fetching sequences...\n" if ($self->debug);
00172 
00173   my $node_id = $cluster->node_id;
00174   my $member_list = $cluster->get_all_leaves;
00175   if (2 > scalar @$member_list) {
00176 #      $self->input_job->transient_error(0);
00177       $self->input_job->incomplete(0);
00178       die ("Only one member for cluster [$node_id]");
00179 #      return undef
00180   }
00181   print STDERR "Counting number of members\n" if ($self->debug);
00182   my $tag_gene_count = scalar(@{$member_list});
00183 
00184   open(OUTSEQ, ">$fastafile")
00185     or $self->throw("Error opening $fastafile for write!");
00186   my $count = 0;
00187 
00188   my @no_acc_members = ();
00189   foreach my $member (@{$member_list}) {
00190     my $sequence_id;
00191     eval {$sequence_id = $member->sequence_id;};
00192     if ($@) {
00193       $DB::single=1;1;
00194     }
00195     next if($seq_id_hash->{$sequence_id});
00196     my $description;
00197     eval { $description = $member->description; };
00198     unless (defined($description) && $description =~ /Acc\:(\w+)/) {
00199       warn ("No accession for [$description]");
00200       push @no_acc_members, $member->dbID;
00201     }
00202     $seq_id_hash->{$sequence_id} = 1;
00203     $count++;
00204     my $member_model_id = $1;
00205     $self->param('model_id_hash')->{$member_model_id} = 1;
00206 
00207     my $seq = $member->sequence;
00208     $residues += $member->seq_length;
00209     $seq =~ s/(.{72})/$1\n/g;
00210     chomp $seq;
00211     print STDERR $member->sequence_id. "\n" if ($self->debug);
00212     print OUTSEQ ">". $member->sequence_id. "\n$seq\n";
00213     print STDERR "sequences $count\n" if ($count % 50 == 0);
00214   }
00215   close(OUTSEQ);
00216   unless (keys %{$self->param('model_id_hash')}) {
00217       die "No Accs found for nc_tree_id $node_id : ", join ",",@no_acc_members;
00218   }
00219 
00220 
00221   if(scalar keys (%{$seq_id_hash}) <= 1) {
00222     $self->update_single_peptide_tree($cluster);
00223     $self->param('single_peptide_tree', 1);
00224   }
00225 
00226   my $this_hash_count = scalar keys %$seq_id_hash;
00227   my $perc_unique = ($this_hash_count / $tag_gene_count) * 100;
00228   print "tag_gene_count $tag_gene_count\n";
00229   print "Percent unique sequences: $perc_unique ($this_hash_count / $tag_gene_count)\n" if ($self->debug);
00230 
00231   return $fastafile;
00232 }
00233 
00234 sub update_single_peptide_tree {
00235   my $self   = shift;
00236   my $tree   = shift;
00237 
00238   foreach my $member (@{$tree->get_all_leaves}) {
00239     next unless($member->isa('Bio::EnsEMBL::Compara::GeneTreeMember'));
00240     next unless($member->sequence);
00241     $DB::single=1;1;
00242     $member->cigar_line(length($member->sequence)."M");
00243     $self->compara_dba->get_NCTreeAdaptor->update_node($member);
00244     printf("single_pepide_tree %s : %s\n", $member->stable_id, $member->cigar_line) if($self->debug);
00245   }
00246 }
00247 
00248 
00249 sub run_infernal {
00250   my $self = shift;
00251 
00252   my $stk_output = $self->worker_temp_directory . "output.stk";
00253   my $nc_tree_id = $self->param('nc_tree_id');
00254 
00255   my $cmalign_exe = $self->param('cmalign_exe')
00256     or die "'cmalign_exe' is an obligatory parameter";
00257 
00258   die "Cannot execute '$cmalign_exe'" unless(-x $cmalign_exe);
00259 
00260 
00261   my $model_id;
00262 
00263 #   if (1 < scalar keys %{$self->param('model_id_hash')}) {
00264 #     # We revert to the clustering_id tag, which maps to the RFAM
00265 #     # 'name' field in nc_profile (e.g. 'mir-135' instead of 'RF00246')
00266 #     print STDERR "WARNING: More than one model: ", join(",",keys %{$self->param('model_id_hash')}), "\n";
00267 #     $model_id = $self->param('nc_tree')->get_tagvalue('clustering_id') or $self->throw("'clustering_id' tag for this tree is not defined");
00268 #     # $self->throw("This cluster has more than one associated model");
00269 #   } else {
00270 #     my @models = keys %{$self->param('model_id_hash')};
00271 #     $model_id = $models[0] or die ("model_id_hash is empty?");
00272 #   }
00273 
00274   if (scalar keys %{$self->param('model_id_hash')} > 1) {
00275       print STDERR "WARNING: More than one model: ", join(",",keys %{$self->param('model_id_hash')}), "\n";
00276   }
00277   $model_id = $self->param('nc_tree')->tree->get_tagvalue('clustering_id') or $self->throw("'clustering_id' tag for this tree is not defined");
00278 
00279   $self->param('model_id', $model_id );
00280 
00281   print STDERR "Model_id : $model_id\n" if ($self->debug);
00282   my $ret1 = $self->dump_model('model_id', $model_id );
00283   my $ret2 = $self->dump_model('name',     $model_id ) if (1 == $ret1);
00284   if (1 == $ret2) {
00285     $self->param('nc_tree')->release_tree;
00286     $self->param('nc_tree', undef);
00287     $self->input_job->transient_error(0);
00288     die ("Failed to find '$model_id' both in 'model_id' and 'name' fields of 'nc_profile' table");
00289   }
00290 
00291 
00292   my $cmd = $cmalign_exe;
00293   # infernal -o cluster_6357.stk RF00599_profile.cm cluster_6357.fasta
00294 
00295   $cmd .= " --mxsize 4000 " if($self->input_job->retry_count >= 1); # large alignments FIXME separate Infernal_huge
00296   $cmd .= " -o " . $stk_output;
00297   $cmd .= " " . $self->param('profile_file');
00298   $cmd .= " " . $self->param('input_fasta');
00299 
00300   $self->compara_dba->dbc->disconnect_when_inactive(1);
00301   print("$cmd\n") if($self->debug);
00302   $DB::single=1;1;
00303   unless(system($cmd) == 0) {
00304     $self->throw("error running infernal, $!\n");
00305   }
00306   $self->compara_dba->dbc->disconnect_when_inactive(0);
00307 
00308   # cmbuild --refine the alignment
00309   ######################
00310   # Attempt to refine the alignment before building the CM using
00311   # expectation-maximization (EM). A CM is first built from the
00312   # initial alignment as usual. Then, the sequences in the alignment
00313   # are realigned optimally (with the HMM banded CYK algorithm,
00314   # optimal means optimal given the bands) to the CM, and a new CM is
00315   # built from the resulting alignment. The sequences are then
00316   # realigned to the new CM, and a new CM is built from that
00317   # alignment. This is continued until convergence, specifically when
00318   # the alignments for two successive iterations are not significantly
00319   # different (the summed bit scores of all the sequences in the
00320   # alignment changes less than 1% be- tween two successive
00321   # iterations). The final alignment (the alignment used to build the
00322   # CM that gets written to cmfile) is written to <f>.
00323 
00324   # cmbuild --refine output.stk.new -F mir-32_profile.cm.new output.stk
00325   my $refined_stk_output = $stk_output . ".refined";
00326   my $refined_profile = $self->param('profile_file') . ".refined";
00327 
00328   my $cmbuild_exe = $self->param('cmbuild_exe')
00329     or die "'cmbuild_exe' is an obligatory parameter";
00330 
00331   die "Cannot execute '$cmbuild_exe'" unless(-x $cmbuild_exe);
00332 
00333   $cmd = $cmbuild_exe;
00334   $cmd .= " --refine $refined_stk_output";
00335   $cmd .= " -F $refined_profile";
00336   $cmd .= " $stk_output";
00337   $self->compara_dba->dbc->disconnect_when_inactive(1);
00338 
00339   unless(system($cmd) == 0) {
00340     $self->throw("error running cmbuild refine, $!\n");
00341   }
00342   $self->compara_dba->dbc->disconnect_when_inactive(0);
00343 
00344   $self->param('stk_output', $refined_stk_output);
00345   # Reformat with sreformat
00346   my $fasta_output = $self->worker_temp_directory . "output.fasta";
00347   my $cmd = "/usr/local/ensembl/bin/sreformat a2m $refined_stk_output > $fasta_output";
00348   unless( system("$cmd") == 0) {
00349     print("$cmd\n");
00350     $self->throw("error running sreformat, $!\n");
00351   }
00352 
00353   $self->param('infernal_output', $fasta_output);
00354 
00355   return 0;
00356 }
00357 
00358 sub dump_model {
00359   my $self = shift;
00360   my $field = shift;
00361   my $model_id = shift;
00362 
00363   my $sql = 
00364     "SELECT hc_profile FROM nc_profile ".
00365       "WHERE $field=\"$model_id\"";
00366   my $sth = $self->compara_dba->dbc->prepare($sql);
00367   $sth->execute();
00368   my $nc_profile  = $sth->fetchrow;
00369   unless (defined($nc_profile)) {
00370     return 1;
00371   }
00372   my $profile_file = $self->worker_temp_directory . $model_id . "_profile.cm";
00373   open FILE, ">$profile_file" or die "$!";
00374   print FILE $nc_profile;
00375   close FILE;
00376 
00377   $self->param('profile_file', $profile_file);
00378   return 0;
00379 }
00380 
00381 sub parse_and_store_alignment_into_tree {
00382   my $self = shift;
00383   my $infernal_output =  $self->param('infernal_output');
00384   my $tree = $self->param('nc_tree');
00385 
00386   return unless($infernal_output);
00387 
00388   #
00389   # parse SS_cons lines and store into nc_tree_tag
00390   #
00391 
00392   my $stk_output = $self->param('stk_output');
00393   open (STKFILE, $stk_output) or $self->throw("Couldnt open STK file [$stk_output]");
00394   my $ss_cons_string = '';
00395   while(<STKFILE>) {
00396     next unless ($_ =~ /SS_cons/);
00397     my $line = $_;
00398     $line =~ /\#=GC\s+SS_cons\s+(\S+)\n/;
00399     $self->throw("Malformed SS_cons line") unless (defined($1));
00400     $ss_cons_string .= $1;
00401   }
00402   close(STKFILE);
00403   $self->param('nc_tree')->tree->store_tag('ss_cons', $ss_cons_string);
00404 
00405   #
00406   # parse alignment file into hash: combine alignment lines
00407   #
00408   my %align_hash;
00409 
00410   # fasta format
00411   my $aln_io = Bio::AlignIO->new
00412     (-file => "$infernal_output",
00413      -format => 'fasta');
00414   my $aln = $aln_io->next_aln;
00415   foreach my $seq ($aln->each_seq) {
00416     $align_hash{$seq->display_id} = $seq->seq;
00417   }
00418   $aln_io->close;
00419 
00420   #
00421   # convert alignment string into a cigar_line
00422   #
00423 
00424   my $alignment_length;
00425   foreach my $id (keys %align_hash) {
00426     my $alignment_string = $align_hash{$id};
00427     unless (defined $alignment_length) {
00428       $alignment_length = length($alignment_string);
00429     } else {
00430       if ($alignment_length != length($alignment_string)) {
00431         $self->throw("While parsing the alignment, some id did not return the expected alignment length\n");
00432       }
00433     }
00434 
00435     # From the Infernal UserGuide:
00436     # ###########################
00437     # In the aligned sequences, a '.' character indicates an inserted column
00438     # relative to consensus; the '.' character is an alignment pad. A '-'
00439     # character is a deletion relative to consensus.  The symbols in the
00440     # consensus secondary structure annotation line have the same meaning
00441     # that they did in a pairwise alignment from cmsearch. The #=GC RF line
00442     # is reference annotation. Non-gap characters in this line mark
00443     # consensus columns; cmalign uses the residues of the consensus sequence
00444     # here, with UPPER CASE denoting STRONGLY CONSERVED RESIDUES, and LOWER
00445     # CASE denoting WEAKLY CONSERVED RESIDUES. Gap characters (specifically,
00446     # the '.' pads) mark insertions relative to consensus. As described below,
00447     # cmbuild is capable of reading these RF lines, so you can specify which
00448     # columns are consensus and which are inserts (otherwise, cmbuild makes
00449     # an automated guess, based on the frequency of gaps in each column)
00450     $alignment_string =~ s/\./\-/g;            # Infernal returns dots even though they are gaps
00451     $alignment_string = uc($alignment_string); # Infernal can lower-case regions
00452     $alignment_string =~ s/\-([A-Z])/\- $1/g;
00453     $alignment_string =~ s/([A-Z])\-/$1 \-/g;
00454 
00455     my @cigar_segments = split " ",$alignment_string;
00456 
00457     my $cigar_line = "";
00458     foreach my $segment (@cigar_segments) {
00459       my $seglength = length($segment);
00460       $seglength = "" if ($seglength == 1);
00461       if ($segment =~ /^\-+$/) {
00462         $cigar_line .= $seglength . "D";
00463       } else {
00464         $cigar_line .= $seglength . "M";
00465       }
00466     }
00467     $align_hash{$id} = $cigar_line;
00468   }
00469 
00470   #
00471   # align cigar_line to member and store
00472   #
00473   foreach my $member (@{$tree->get_all_leaves}) {
00474     if ($align_hash{$member->sequence_id} eq "") {
00475       $self->throw("infernal produced an empty cigar_line for ".$member->stable_id."\n");
00476     }
00477 
00478     $member->cigar_line($align_hash{$member->sequence_id});
00479     ## Check that the cigar length (Ms) matches the sequence length
00480     my @cigar_match_lengths = map { if ($_ eq '') {$_ = 1} else {$_ = $_;} } map { $_ =~ /^(\d*)/ } ( $member->cigar_line =~ /(\d*[M])/g );
00481     my $seq_cigar_length; map { $seq_cigar_length += $_ } @cigar_match_lengths;
00482     my $member_sequence = $member->sequence; $member_sequence =~ s/\*//g;
00483     if ($seq_cigar_length != length($member_sequence)) {
00484       $self->throw("While storing the cigar line, the returned cigar length did not match the sequence length\n");
00485     }
00486     #
00487 #    printf("update nc_tree_member %s : %s\n",$member->stable_id, $member->cigar_line) if($self->debug);
00488     $self->compara_dba->get_NCTreeAdaptor->update_node($member);
00489   }
00490 
00491   return undef;
00492 }
00493 
00494 sub _store_aln_tags {
00495     my $self = shift;
00496     my $tree = $self->param('nc_tree');
00497     return unless($tree);
00498 
00499     my $pta = $self->compara_dba->get_NCTreeAdaptor;
00500 
00501     print "Storing Alignment tags...\n";
00502     my $sa = $tree->get_SimpleAlign;
00503     $DB::single=1;1;
00504     # Model id
00505     $tree->tree->store_tag("model_id",$self->param('model_id') );
00506 
00507     # Alignment percent identity.
00508     my $aln_pi = $sa->average_percentage_identity;
00509     $tree->tree->store_tag("aln_percent_identity",$aln_pi);
00510 
00511     # Alignment length.
00512     my $aln_length = $sa->length;
00513     $tree->tree->store_tag("aln_length",$aln_length);
00514 
00515     # Alignment runtime.
00516     my $aln_runtime = int(time()*1000-$self->param('infernal_starttime'));
00517     $tree->tree->store_tag("aln_runtime",$aln_runtime);
00518 
00519     # Alignment method.
00520     my $aln_method = $self->param('method');
00521     $tree->tree->store_tag("aln_method",$aln_method);
00522 
00523     # Alignment residue count.
00524     my $aln_num_residues = $sa->no_residues;
00525     $tree->tree->store_tag("aln_num_residues",$aln_num_residues);
00526 
00527 }
00528 
00529 
00530 1;