Archive Ensembl HomeArchive Ensembl Home
PairAligner.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PairAligner
00022 
00023 =cut
00024 
00025 =head1 SYNOPSIS
00026 
00027 =cut
00028 
00029 =head1 DESCRIPTION
00030 
00031 This object is an abstract superclass which must be inherited from.
00032 It uses a runnable which takes sequence as input and returns
00033 FeaturePair objects as output (like Bio::EnsEMBL::Analysis::Runnable::Blastz)
00034 
00035 It adds functionality to read and write to a compara databases.
00036 It takes as input (via input_id or analysis->parameters) DnaFragChunk or DnaFragChunkSet
00037 objects (via dbID reference) and stores GenomicAlignBlock entries.
00038 
00039 The appropriate Bio::EnsEMBL::Analysis object must be passed for
00040 extraction of appropriate parameters. 
00041 
00042 =cut
00043 
00044 =head1 APPENDIX
00045 
00046 The rest of the documentation details each of the object methods. 
00047 Internal methods are usually preceded with a _
00048 
00049 =cut
00050 
00051 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::PairAligner;
00052 
00053 use strict;
00054 
00055 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00056 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00057 use Bio::EnsEMBL::Compara::GenomicAlign;
00058 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00059 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
00060 use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00061 use Bio::EnsEMBL::Utils::Exception;
00062 use Time::HiRes qw(time gettimeofday tv_interval);
00063 use File::Basename;
00064 
00065 use Bio::EnsEMBL::Analysis::RunnableDB;
00066 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00067 
00068 use Bio::EnsEMBL::Utils::SqlHelper;
00069 
00070 
00071 ##########################################
00072 #
00073 # subclass override methods
00074 # 
00075 ##########################################
00076 
00077 sub configure_defaults {
00078   my $self = shift;
00079   return 0;
00080 }
00081 
00082 sub configure_runnable {
00083   my $self = shift;
00084   throw("subclass must implement configure_runnable method\n");
00085 }
00086 
00087 ##########################################
00088 #
00089 # internal RunnableDB methods that should
00090 # not be subclassed
00091 # 
00092 ##########################################
00093 
00094 =head2 fetch_input
00095 
00096     Title   :   fetch_input
00097     Usage   :   $self->fetch_input
00098     Function:   Fetches input data for repeatmasker from the database
00099     Returns :   none
00100     Args    :   none
00101 
00102 =cut
00103 
00104 sub fetch_input {
00105   my( $self) = @_;
00106 
00107   #
00108   # run subclass configure_defaults method
00109   #
00110   $self->configure_defaults();
00111   
00112   
00113   my $query_DnaFragChunkSet = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00114 
00115   if(defined($self->param('qyChunkSetID'))) {
00116       my $chunkset = $self->compara_dba->get_DnaFragChunkSetAdaptor->fetch_by_dbID($self->param('qyChunkSetID'));
00117       $query_DnaFragChunkSet = $chunkset;
00118   }
00119   if(defined($self->param('qyChunk'))) {
00120       my $qy_chunk = $self->compara_dba->get_DnaFragChunkAdaptor->fetch_by_dbID($self->param('qyChunk'));
00121       $query_DnaFragChunkSet->add_DnaFragChunk($qy_chunk);
00122   }
00123   $self->param('query_DnaFragChunkSet',$query_DnaFragChunkSet);
00124 
00125   my $db_DnaFragChunkSet = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00126 
00127   if(defined($self->param('dbChunkSetID'))) {
00128       my $chunkset = $self->compara_dba->get_DnaFragChunkSetAdaptor->fetch_by_dbID($self->param('dbChunkSetID'));
00129       $db_DnaFragChunkSet = $chunkset;
00130   }
00131  if(defined($self->param('dbChunk'))) {
00132       my $db_chunk = $self->compara_dba->get_DnaFragChunkAdaptor->fetch_by_dbID($self->param('dbChunk'));
00133       $db_DnaFragChunkSet->add_DnaFragChunk($db_chunk);
00134   }
00135   $self->param('db_DnaFragChunkSet',$db_DnaFragChunkSet);
00136 
00137   #create a Compara::DBAdaptor which shares the same DBI handle
00138   #with $self->db (Hive DBAdaptor)
00139   $self->compara_dba->dbc->disconnect_when_inactive(0);
00140 
00141   throw("Missing qyChunk(s)") unless($query_DnaFragChunkSet->count > 0);
00142   throw("Missing dbChunk")    unless($db_DnaFragChunkSet->count > 0);
00143   throw("Missing method_link_type") unless($self->param('method_link_type'));
00144   
00145   my ($first_qy_chunk) = @{$query_DnaFragChunkSet->get_all_DnaFragChunks};
00146   my ($first_db_chunk) = @{$db_DnaFragChunkSet->get_all_DnaFragChunks};
00147   
00148   #
00149   # create method_link_species_set
00150   #
00151   my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00152   $mlss->method_link_type($self->param('method_link_type'));
00153   if ($first_qy_chunk->dnafrag->genome_db->dbID == $first_db_chunk->dnafrag->genome_db->dbID) {
00154     $mlss->species_set([$first_qy_chunk->dnafrag->genome_db]);
00155   } else {
00156     $mlss->species_set([$first_qy_chunk->dnafrag->genome_db,
00157                         $first_db_chunk->dnafrag->genome_db]);
00158   } 
00159 
00160   $self->compara_dba->get_MethodLinkSpeciesSetAdaptor->store($mlss);
00161   $self->param('method_link_species_set', $mlss);
00162 
00163   if (defined $self->param('max_alignments')) {
00164       my $sth = $self->compara_dba->dbc->prepare("SELECT count(*) FROM genomic_align_block".
00165                          " WHERE method_link_species_set_id = ".$mlss->dbID);
00166       $sth->execute();
00167       my ($num_alignments) = $sth->fetchrow_array();
00168       $sth->finish();
00169       if ($num_alignments >= $self->max_alignments) {
00170       throw("Too many alignments ($num_alignments) have been stored already for MLSS ".$mlss->dbID."\n".
00171         "  Try changing the parameters or increase the max_alignments option if you think\n".
00172         "  your system can cope with so many alignments.");
00173       }
00174   }
00175 
00176   #
00177   # execute subclass configure_runnable method
00178   #
00179   $self->configure_runnable();
00180   
00181   return 1;
00182 }
00183 
00184 
00185 sub run
00186 {
00187   my $self = shift;
00188 
00189   $self->compara_dba->dbc->disconnect_when_inactive(1);  
00190 
00191   my $starttime = time();
00192   foreach my $runnable (@{$self->runnable}) {
00193     throw("Runnable module not set") unless($runnable);
00194     $runnable->run();
00195   }
00196   if($self->debug){printf("%1.3f secs to run %s pairwise\n", (time()-$starttime), $self->param('method_link_type'));}
00197 
00198   $self->compara_dba->dbc->disconnect_when_inactive(0);
00199   return 1;
00200 }
00201 
00202 sub delete_fasta_dumps_but_these {
00203   my $self = shift;
00204   my $fasta_files_not_to_delete = shift;
00205 
00206   my $work_dir = $self->worker_temp_directory;
00207 
00208   open F, "ls $work_dir|";
00209   while (my $file = <F>) {
00210     chomp $file;
00211     next unless ($file =~ /\.fasta$/);
00212     my $delete = 1;
00213     foreach my $fasta_file (@{$fasta_files_not_to_delete}) {
00214       if ($file eq basename($fasta_file)) {
00215         $delete = 0;
00216         last;
00217       }
00218     }
00219     unlink "$work_dir/$file" if ($delete);
00220   }
00221   close F;
00222 }
00223 
00224 sub write_output {
00225   my( $self) = @_;
00226 
00227   my $starttime = time();
00228 
00229   #since the Blast runnable takes in analysis parameters rather than an
00230   #analysis object, it creates new Analysis objects internally
00231   #(a new one for EACH FeaturePair generated)
00232   #which are a shadow of the real analysis object ($self->analysis)
00233   #The returned FeaturePair objects thus need to be reset to the real analysis object
00234 
00235   #
00236   #Start transaction
00237   #
00238   if ($self->param('do_transactions'))  {
00239       my $compara_conn = $self->compara_dba->dbc;
00240       my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00241       $compara_helper->transaction(-CALLBACK => sub {
00242       $self->_write_output;
00243       });
00244   } else {
00245       $self->_write_output;
00246   }
00247 
00248   return 1;
00249 }
00250 
00251 sub _write_output {
00252     my ($self) = @_;
00253 
00254   #Set use_autoincrement to 1 otherwise the GenomicAlignBlockAdaptor will use
00255   #LOCK TABLES which does an implicit commit and prevent any rollback
00256   $self->compara_dba->get_GenomicAlignBlockAdaptor->use_autoincrement(1);
00257 
00258   $DB::single = 1;
00259   foreach my $fp ($self->output) {
00260     if($fp->isa('Bio::EnsEMBL::FeaturePair')) {
00261       $fp->analysis($self->analysis);
00262 
00263       $self->store_featurePair_as_genomicAlignBlock($fp);
00264     }
00265   }
00266 
00267   if($self->debug){printf("%d FeaturePairs found\n", scalar($self->output));}
00268   #print STDERR (time()-$starttime), " secs to write_output\n";
00269 }
00270 
00271 ##########################################
00272 #
00273 # internal methods
00274 #
00275 ##########################################
00276 
00277 sub dumpChunkSetToWorkdir
00278 {
00279   my $self      = shift;
00280   my $chunkSet   = shift;
00281 
00282   my $starttime = time();
00283 
00284   my $fastafile = $self->worker_temp_directory. "chunk_set_". $chunkSet->dbID .".fasta";
00285 
00286   $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00287   return $fastafile if(-e $fastafile);
00288   #print("fastafile = '$fastafile'\n");
00289 
00290   open(OUTSEQ, ">$fastafile")
00291     or $self->throw("Error opening $fastafile for write");
00292   my $output_seq = Bio::SeqIO->new( -fh =>\*OUTSEQ, -format => 'Fasta');
00293   
00294   my $chunk_array = $chunkSet->get_all_DnaFragChunks;
00295   if($self->debug){printf("dumpChunkSetToWorkdir : %s : %d chunks\n", $fastafile, $chunkSet->count());}
00296   
00297   foreach my $chunk (@$chunk_array) {
00298     #rintf("  writing chunk %s\n", $chunk->display_id);
00299     my $bioseq = $chunk->bioseq;
00300     if($chunk->sequence_id==0) {
00301       $self->compara_dba->get_DnaFragChunkAdaptor->update_sequence($chunk);
00302     }
00303 
00304     $output_seq->write_seq($bioseq);
00305   }
00306   close OUTSEQ;
00307   if($self->debug){printf("  %1.3f secs to dump\n", (time()-$starttime));}
00308 
00309   return $fastafile
00310 }
00311 
00312 
00313 sub dumpChunkToWorkdir
00314 {
00315   my $self = shift;
00316   my $chunk = shift;
00317 
00318   my $starttime = time();
00319 
00320   my $fastafile = $self->worker_temp_directory .
00321                   "chunk_" . $chunk->dbID . ".fasta";
00322   $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00323   return $fastafile if(-e $fastafile);
00324   #print("fastafile = '$fastafile'\n");
00325 
00326   if($self->debug){print("dumpChunkToWorkdir : $fastafile\n");}
00327 
00328   $chunk->cache_sequence;
00329   $chunk->dump_to_fasta_file($fastafile);
00330 
00331   if($self->debug){printf("  %1.3f secs to dump\n", (time()-$starttime));}
00332 
00333   return $fastafile
00334 }
00335 
00336 sub store_featurePair_as_genomicAlignBlock
00337 {
00338   my $self = shift;
00339   my $fp   = shift;
00340 
00341   my $qyChunk = undef;
00342   my $dbChunk = undef;
00343   
00344   if($fp->seqname =~ /chunkID(\d*):/) {
00345     my $chunk_id = $1;
00346     #printf("%s => %d\n", $fp->seqname, $chunk_id);
00347     $qyChunk = $self->compara_dba->get_DnaFragChunkAdaptor->
00348                      fetch_by_dbID($chunk_id);
00349   }
00350   if($fp->hseqname =~ /chunkID(\d*):/) {
00351     my $chunk_id = $1;
00352     #printf("%s => %d\n", $fp->hseqname, $chunk_id);
00353     $dbChunk = $self->compara_dba->get_DnaFragChunkAdaptor->
00354                      fetch_by_dbID($chunk_id);
00355   }
00356   unless($qyChunk and $dbChunk) {
00357     warn("unable to determine DnaFragChunk objects from FeaturePair");
00358     return undef;
00359   }
00360 
00361   if($self->debug > 1) {
00362     print("qyChunk : ",$qyChunk->display_id,"\n");
00363     print("dbChunk : ",$dbChunk->display_id,"\n");
00364     print STDOUT $fp->seqname."\t".
00365                  $fp->start."\t".
00366                  $fp->end."\t".
00367                  $fp->strand."\t".
00368                  $fp->hseqname."\t".
00369                  $fp->hstart."\t".
00370                  $fp->hend."\t".
00371                  $fp->hstrand."\t".
00372                  $fp->score."\t".
00373                  $fp->percent_id."\t".
00374                  $fp->cigar_string."\n";
00375   }                 
00376 
00377   $fp->slice($qyChunk->slice);
00378   $fp->hslice($dbChunk->slice);               
00379 
00380   #
00381   # test if I'm getting the indexes right
00382   #
00383   if($self->debug > 2) {
00384     print_simple_align($fp->get_SimpleAlign, 80);
00385 
00386     my $testChunk = new Bio::EnsEMBL::Compara::Production::DnaFragChunk();
00387     $testChunk->dnafrag($qyChunk->dnafrag);
00388     $testChunk->seq_start($qyChunk->seq_start+$fp->start-1);
00389     $testChunk->seq_end($qyChunk->seq_start+$fp->end-1);
00390     my $bioseq = $testChunk->bioseq;
00391     print($bioseq->seq, "\n");
00392   }
00393 
00394 
00395   my $genomic_align1 = new Bio::EnsEMBL::Compara::GenomicAlign;
00396   $genomic_align1->method_link_species_set($self->param('method_link_species_set'));
00397   $genomic_align1->dnafrag($qyChunk->dnafrag);
00398   $genomic_align1->dnafrag_start($qyChunk->seq_start + $fp->start -1);
00399   $genomic_align1->dnafrag_end($qyChunk->seq_start + $fp->end -1);
00400   $genomic_align1->dnafrag_strand($fp->strand);
00401   $genomic_align1->visible(1);
00402   
00403   my $cigar1 = $fp->cigar_string;
00404   $cigar1 =~ s/I/M/g;
00405   $cigar1 = compact_cigar_line($cigar1);
00406   $cigar1 =~ s/D/G/g;
00407   $genomic_align1->cigar_line($cigar1);
00408 
00409   my $genomic_align2 = new Bio::EnsEMBL::Compara::GenomicAlign;
00410   $genomic_align2->method_link_species_set($self->param('method_link_species_set'));
00411   $genomic_align2->dnafrag($dbChunk->dnafrag);
00412   $genomic_align2->dnafrag_start($dbChunk->seq_start + $fp->hstart -1);
00413   $genomic_align2->dnafrag_end($dbChunk->seq_start + $fp->hend -1);
00414   $genomic_align2->dnafrag_strand($fp->hstrand);
00415   $genomic_align2->visible(1);
00416 
00417   my $cigar2 = $fp->cigar_string;
00418   $cigar2 =~ s/D/M/g;
00419   $cigar2 =~ s/I/D/g;
00420   $cigar2 = compact_cigar_line($cigar2);
00421   $cigar2 =~ s/D/G/g;
00422   $genomic_align2->cigar_line($cigar2);
00423 
00424   if($self->debug > 1) {
00425     print("original cigar_line ",$fp->cigar_string,"\n");
00426     print("   $cigar1\n");
00427     print("   $cigar2\n");
00428   }
00429 
00430   my $GAB = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
00431   $GAB->method_link_species_set($self->param('method_link_species_set'));
00432   $GAB->genomic_align_array([$genomic_align1, $genomic_align2]);
00433   $GAB->score($fp->score);
00434   $GAB->perc_id($fp->percent_id);
00435   $GAB->length($fp->alignment_length);
00436   $GAB->level_id(1);
00437 
00438   $self->compara_dba->get_GenomicAlignBlockAdaptor->store($GAB);
00439   
00440   if ($self->debug) {
00441     my $track_sql = "INSERT IGNORE INTO genomic_align_block_job_track ".
00442       "(genomic_align_block_id, analysis_job_id) ".
00443         "VALUES (".$GAB->dbID.",".$self->input_job->dbID.")";
00444     print("$track_sql\n") if($self->debug);
00445     $self->compara_dba->dbc->do($track_sql);
00446   }
00447 
00448   if($self->debug > 2) { print_simple_align($GAB->get_SimpleAlign, 80);}
00449 
00450   return $GAB;
00451 }
00452 
00453 
00454 sub compact_cigar_line
00455 {
00456   my $cigar_line = shift;
00457 
00458   #print("cigar_line '$cigar_line' => ");
00459   my @pieces = ( $cigar_line =~ /(\d*[MDI])/g );
00460   my @new_pieces = ();
00461   foreach my $piece (@pieces) {
00462     $piece =~ s/I/M/;
00463     if (! scalar @new_pieces || $piece =~ /D/) {
00464       push @new_pieces, $piece;
00465       next;
00466     }
00467     if ($piece =~ /\d*M/ && $new_pieces[-1] =~ /\d*M/) {
00468       my ($matches1) = ($piece =~ /(\d*)M/);
00469       my ($matches2) = ($new_pieces[-1] =~ /(\d*)M/);
00470       if (! defined $matches1 || $matches1 eq "") {
00471         $matches1 = 1;
00472       }
00473       if (! defined $matches2 || $matches2 eq "") {
00474         $matches2 = 1;
00475       }
00476       $new_pieces[-1] = $matches1 + $matches2 . "M";
00477     } else {
00478       push @new_pieces, $piece;
00479     }
00480   }
00481   my $new_cigar_line = join("", @new_pieces);
00482   #print(" '$new_cigar_line'\n");
00483   return $new_cigar_line;
00484 }
00485 
00486 
00487 sub print_simple_align
00488 {
00489   my $alignment = shift;
00490   my $aaPerLine = shift;
00491   $aaPerLine=40 unless($aaPerLine and $aaPerLine > 0);
00492   
00493   my ($seq1, $seq2)  = $alignment->each_seq;
00494   my $seqStr1 = "|".$seq1->seq().'|';
00495   my $seqStr2 = "|".$seq2->seq().'|';
00496 
00497   my $enddiff = length($seqStr1) - length($seqStr2);
00498   while($enddiff>0) { $seqStr2 .= " "; $enddiff--; }
00499   while($enddiff<0) { $seqStr1 .= " "; $enddiff++; }
00500 
00501   my $label1 = sprintf("%40s : ", $seq1->id);
00502   my $label2 = sprintf("%40s : ", "");
00503   my $label3 = sprintf("%40s : ", $seq2->id);
00504 
00505   my $line2 = "";
00506   for(my $x=0; $x<length($seqStr1); $x++) {
00507     if(substr($seqStr1,$x,1) eq substr($seqStr2, $x,1)) { $line2.='|'; } else { $line2.=' '; }
00508   }
00509 
00510   my $offset=0;
00511   my $numLines = (length($seqStr1) / $aaPerLine);
00512   while($numLines>0) {
00513     printf("$label1 %s\n", substr($seqStr1,$offset,$aaPerLine));
00514     printf("$label2 %s\n", substr($line2,$offset,$aaPerLine));
00515     printf("$label3 %s\n", substr($seqStr2,$offset,$aaPerLine));
00516     print("\n\n");
00517     $offset+=$aaPerLine;
00518     $numLines--;
00519   }
00520 }
00521 
00522 1;