Archive Ensembl HomeArchive Ensembl Home
PairAligner.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PairAligner
00022 
00023 =cut
00024 
00025 =head1 SYNOPSIS
00026 
00027 =cut
00028 
00029 =head1 DESCRIPTION
00030 
00031 This object is an abstract superclass which must be inherited from.
00032 It uses a runnable which takes sequence as input and returns
00033 FeaturePair objects as output (like Bio::EnsEMBL::Analysis::Runnable::Blastz)
00034 
00035 It adds functionality to read and write to a compara databases.
00036 It takes as input (via input_id or analysis->parameters) DnaFragChunk or DnaFragChunkSet
00037 objects (via dbID reference) and stores GenomicAlignBlock entries.
00038 
00039 The appropriate Bio::EnsEMBL::Analysis object must be passed for
00040 extraction of appropriate parameters. 
00041 
00042 =cut
00043 
00044 =head1 APPENDIX
00045 
00046 The rest of the documentation details each of the object methods. 
00047 Internal methods are usually preceded with a _
00048 
00049 =cut
00050 
00051 package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::PairAligner;
00052 
00053 use strict;
00054 
00055 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00056 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00057 use Bio::EnsEMBL::Compara::GenomicAlign;
00058 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00059 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
00060 use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00061 use Bio::EnsEMBL::Utils::Exception;
00062 use Time::HiRes qw(time gettimeofday tv_interval);
00063 use File::Basename;
00064 
00065 use Bio::EnsEMBL::Analysis::RunnableDB;
00066 use Bio::EnsEMBL::Hive::Process;
00067 
00068 use Bio::EnsEMBL::Utils::SqlHelper;
00069 
00070 our @ISA = qw( Bio::EnsEMBL::Hive::Process );
00071 
00072 
00073 ##########################################
00074 #
00075 # subclass override methods
00076 # 
00077 ##########################################
00078 
00079 sub configure_defaults {
00080   my $self = shift;
00081   return 0;
00082 }
00083 
00084 sub configure_runnable {
00085   my $self = shift;
00086   throw("subclass must implement configure_runnable method\n");
00087 }
00088 
00089 sub get_params {
00090   my $self         = shift;
00091   my $param_string = shift;
00092 
00093   return unless($param_string);
00094   #print("parsing parameter string : ",$param_string,"\n");
00095 
00096   my $params = eval($param_string);
00097   return unless($params);
00098 
00099   if(defined($params->{'qyChunkSetID'})) {
00100     my $chunkset = $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->fetch_by_dbID($params->{'qyChunkSetID'});
00101     $self->query_DnaFragChunkSet($chunkset);
00102   }
00103   if(defined($params->{'qyChunk'})) {
00104     my $qy_chunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->fetch_by_dbID($params->{'qyChunk'});
00105     $self->query_DnaFragChunkSet->add_DnaFragChunk($qy_chunk);
00106   }
00107 
00108   if(defined($params->{'dbChunkSetID'})) {
00109     my $chunkset = $self->{'comparaDBA'}->get_DnaFragChunkSetAdaptor->fetch_by_dbID($params->{'dbChunkSetID'});
00110     $self->db_DnaFragChunkSet($chunkset);
00111   }
00112   if(defined($params->{'dbChunk'})) {
00113     my $db_chunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->fetch_by_dbID($params->{'dbChunk'});
00114     $self->db_DnaFragChunkSet->add_DnaFragChunk($db_chunk);
00115   }
00116 
00117   $self->options($params->{'options'})              if(defined($params->{'options'}));
00118   $self->method_link_type($params->{'method_link'}) if(defined($params->{'method_link'}));
00119   $self->max_alignments($params->{'max_alignments'}) if(defined($params->{'max_alignments'}));
00120   $self->dump_loc($params->{'dump_loc'}) if(defined($params->{'dump_loc'}));
00121 
00122   if (defined($params->{'do_transactions'})) {
00123       $self->{_do_transactions} = $params->{'do_transactions'};
00124   } else {
00125       #default is to do transactions
00126       $self->{_do_transactions} = 1;
00127   }
00128   return;
00129 }
00130 
00131 
00132 ##########################################
00133 #
00134 # getter/setter methods
00135 # 
00136 ##########################################
00137 
00138 sub options {
00139   my $self = shift;
00140   $self->{'_options'} = shift if(@_);
00141   return $self->{'_options'};
00142 }
00143 
00144 sub dump_loc {
00145   my $self = shift;
00146   $self->{'_dump_loc'} = shift if(@_);
00147   return $self->{'_dump_loc'};
00148 }
00149 
00150 sub method_link_type {
00151   my $self = shift;
00152   $self->{'_method_link_type'} = shift if(@_);
00153   return $self->{'_method_link_type'};
00154 }
00155 
00156 sub max_alignments {
00157   my $self = shift;
00158   $self->{'_max_alignments'} = shift if(@_);
00159   return $self->{'_max_alignments'};
00160 }
00161 
00162 sub query_DnaFragChunkSet {
00163   my $self = shift;
00164   $self->{'_query_DnaFragChunkSet'} = shift if(@_);
00165   unless(defined($self->{'_query_DnaFragChunkSet'})) {
00166     $self->{'_query_DnaFragChunkSet'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00167   }
00168   return $self->{'_query_DnaFragChunkSet'};
00169 }
00170 
00171 sub db_DnaFragChunkSet {
00172   my $self = shift;
00173   $self->{'_db_DnaFragChunkSet'} = shift if(@_);
00174   unless(defined($self->{'_db_DnaFragChunkSet'})) {
00175     $self->{'_db_DnaFragChunkSet'} = new Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00176   }
00177   return $self->{'_db_DnaFragChunkSet'};
00178 }
00179 
00180 sub do_transactions {
00181   my $self = shift;
00182   $self->{'_do_transactions'} = shift if(@_);
00183   return $self->{'_do_transactions'};
00184 }
00185 
00186 ##########################################
00187 #
00188 # internal RunnableDB methods that should
00189 # not be subclassed
00190 # 
00191 ##########################################
00192 
00193 =head2 fetch_input
00194 
00195     Title   :   fetch_input
00196     Usage   :   $self->fetch_input
00197     Function:   Fetches input data for repeatmasker from the database
00198     Returns :   none
00199     Args    :   none
00200 
00201 =cut
00202 
00203 sub fetch_input {
00204   my( $self) = @_;
00205 
00206   #
00207   # run subclass configure_defaults method
00208   #
00209   $self->configure_defaults();
00210   
00211   #create a Compara::DBAdaptor which shares the same DBI handle
00212   #with $self->db (Hive DBAdaptor)
00213   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN=>$self->db->dbc);
00214   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00215 
00216   $self->get_params($self->parameters);
00217   $self->get_params($self->input_id);
00218 
00219   throw("Missing qyChunk(s)") unless($self->query_DnaFragChunkSet->count > 0);
00220   throw("Missing dbChunk")    unless($self->db_DnaFragChunkSet->count > 0);
00221   throw("Missing method_link_type") unless($self->method_link_type);
00222   
00223   my ($first_qy_chunk) = @{$self->query_DnaFragChunkSet->get_all_DnaFragChunks};
00224   my ($first_db_chunk) = @{$self->db_DnaFragChunkSet->get_all_DnaFragChunks};
00225   
00226   #
00227   # create method_link_species_set
00228   #
00229   my $mlss = new Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00230   $mlss->method_link_type($self->method_link_type);
00231   if ($first_qy_chunk->dnafrag->genome_db->dbID == $first_db_chunk->dnafrag->genome_db->dbID) {
00232     $mlss->species_set([$first_qy_chunk->dnafrag->genome_db]);
00233   } else {
00234     $mlss->species_set([$first_qy_chunk->dnafrag->genome_db,
00235                         $first_db_chunk->dnafrag->genome_db]);
00236   } 
00237 
00238   $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->store($mlss);
00239   $self->{'method_link_species_set'} = $mlss;
00240 
00241   if ($self->max_alignments) {
00242     my $sth = $self->{'comparaDBA'}->dbc->prepare("SELECT count(*) FROM genomic_align_block".
00243         " WHERE method_link_species_set_id = ".$mlss->dbID);
00244     $sth->execute();
00245     my ($num_alignments) = $sth->fetchrow_array();
00246     $sth->finish();
00247     if ($num_alignments >= $self->max_alignments) {
00248       throw("Too many alignments ($num_alignments) have been stored already for MLSS ".$mlss->dbID."\n".
00249           "  Try changing the parameters or increase the max_alignments option if you think\n".
00250           "  your system can cope with so many alignments.");
00251     }
00252   }
00253 
00254   #
00255   # execute subclass configure_runnable method
00256   #
00257   $self->configure_runnable();
00258   
00259   return 1;
00260 }
00261 
00262 
00263 sub run
00264 {
00265   my $self = shift;
00266 
00267   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(1);  
00268 
00269   my $starttime = time();
00270   foreach my $runnable (@{$self->runnable}) {
00271     throw("Runnable module not set") unless($runnable);
00272     $runnable->run();
00273   }
00274   if($self->debug){printf("%1.3f secs to run %s pairwise\n", (time()-$starttime), $self->method_link_type);}
00275 
00276   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00277   return 1;
00278 }
00279 
00280 sub delete_fasta_dumps_but_these {
00281   my $self = shift;
00282   my $fasta_files_not_to_delete = shift;
00283 
00284   my $work_dir = $self->worker_temp_directory;
00285 
00286   open F, "ls $work_dir|";
00287   while (my $file = <F>) {
00288     chomp $file;
00289     next unless ($file =~ /\.fasta$/);
00290     my $delete = 1;
00291     foreach my $fasta_file (@{$fasta_files_not_to_delete}) {
00292       if ($file eq basename($fasta_file)) {
00293         $delete = 0;
00294         last;
00295       }
00296     }
00297     unlink "$work_dir/$file" if ($delete);
00298   }
00299   close F;
00300 }
00301 
00302 sub write_output {
00303   my( $self) = @_;
00304 
00305   my $starttime = time();
00306 
00307   #since the Blast runnable takes in analysis parameters rather than an
00308   #analysis object, it creates new Analysis objects internally
00309   #(a new one for EACH FeaturePair generated)
00310   #which are a shadow of the real analysis object ($self->analysis)
00311   #The returned FeaturePair objects thus need to be reset to the real analysis object
00312 
00313   #
00314   #Start transaction
00315   #
00316   if ($self->do_transactions)  {
00317       my $compara_conn = $self->{'comparaDBA'}->dbc;
00318       my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00319       $compara_helper->transaction(-CALLBACK => sub {
00320       $self->_write_output;
00321       });
00322   } else {
00323       $self->_write_output;
00324   }
00325 
00326   return 1;
00327 }
00328 
00329 sub _write_output {
00330     my ($self) = @_;
00331 
00332   #Set use_autoincrement to 1 otherwise the GenomicAlignBlockAdaptor will use
00333   #LOCK TABLES which does an implicit commit and prevent any rollback
00334   $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor->use_autoincrement(1);
00335 
00336   foreach my $fp ($self->output) {
00337     if($fp->isa('Bio::EnsEMBL::FeaturePair')) {
00338       $fp->analysis($self->analysis);
00339 
00340       $self->store_featurePair_as_genomicAlignBlock($fp);
00341     }
00342   }
00343 
00344   if($self->debug){printf("%d FeaturePairs found\n", scalar($self->output));}
00345   #print STDERR (time()-$starttime), " secs to write_output\n";
00346 }
00347 
00348 ##########################################
00349 #
00350 # internal methods
00351 #
00352 ##########################################
00353 
00354 sub dumpChunkSetToWorkdir
00355 {
00356   my $self      = shift;
00357   my $chunkSet   = shift;
00358 
00359   my $starttime = time();
00360 
00361   my $fastafile = $self->worker_temp_directory. "chunk_set_". $chunkSet->dbID .".fasta";
00362 
00363   $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00364   return $fastafile if(-e $fastafile);
00365   #print("fastafile = '$fastafile'\n");
00366 
00367   open(OUTSEQ, ">$fastafile")
00368     or $self->throw("Error opening $fastafile for write");
00369   my $output_seq = Bio::SeqIO->new( -fh =>\*OUTSEQ, -format => 'Fasta');
00370   
00371   my $chunk_array = $chunkSet->get_all_DnaFragChunks;
00372   if($self->debug){printf("dumpChunkSetToWorkdir : %s : %d chunks\n", $fastafile, $chunkSet->count());}
00373   
00374   foreach my $chunk (@$chunk_array) {
00375     #rintf("  writing chunk %s\n", $chunk->display_id);
00376     my $bioseq = $chunk->bioseq;
00377     if($chunk->sequence_id==0) {
00378       $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->update_sequence($chunk);
00379     }
00380 
00381     $output_seq->write_seq($bioseq);
00382   }
00383   close OUTSEQ;
00384   if($self->debug){printf("  %1.3f secs to dump\n", (time()-$starttime));}
00385 
00386   return $fastafile
00387 }
00388 
00389 
00390 sub dumpChunkToWorkdir
00391 {
00392   my $self = shift;
00393   my $chunk = shift;
00394 
00395   my $starttime = time();
00396 
00397   my $fastafile = $self->worker_temp_directory .
00398                   "chunk_" . $chunk->dbID . ".fasta";
00399   $fastafile =~ s/\/\//\//g;  # converts any // in path to /
00400   return $fastafile if(-e $fastafile);
00401   #print("fastafile = '$fastafile'\n");
00402 
00403   if($self->debug){print("dumpChunkToWorkdir : $fastafile\n");}
00404 
00405   $chunk->cache_sequence;
00406   $chunk->dump_to_fasta_file($fastafile);
00407 
00408   if($self->debug){printf("  %1.3f secs to dump\n", (time()-$starttime));}
00409 
00410   return $fastafile
00411 }
00412 
00413 sub store_featurePair_as_genomicAlignBlock
00414 {
00415   my $self = shift;
00416   my $fp   = shift;
00417 
00418   my $qyChunk = undef;
00419   my $dbChunk = undef;
00420   
00421   if($fp->seqname =~ /chunkID(\d*):/) {
00422     my $chunk_id = $1;
00423     #printf("%s => %d\n", $fp->seqname, $chunk_id);
00424     $qyChunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->
00425                      fetch_by_dbID($chunk_id);
00426   }
00427   if($fp->hseqname =~ /chunkID(\d*):/) {
00428     my $chunk_id = $1;
00429     #printf("%s => %d\n", $fp->hseqname, $chunk_id);
00430     $dbChunk = $self->{'comparaDBA'}->get_DnaFragChunkAdaptor->
00431                      fetch_by_dbID($chunk_id);
00432   }
00433   unless($qyChunk and $dbChunk) {
00434     warn("unable to determine DnaFragChunk objects from FeaturePair");
00435     return undef;
00436   }
00437 
00438   if($self->debug > 1) {
00439     print("qyChunk : ",$qyChunk->display_id,"\n");
00440     print("dbChunk : ",$dbChunk->display_id,"\n");
00441     print STDOUT $fp->seqname."\t".
00442                  $fp->start."\t".
00443                  $fp->end."\t".
00444                  $fp->strand."\t".
00445                  $fp->hseqname."\t".
00446                  $fp->hstart."\t".
00447                  $fp->hend."\t".
00448                  $fp->hstrand."\t".
00449                  $fp->score."\t".
00450                  $fp->percent_id."\t".
00451                  $fp->cigar_string."\n";
00452   }                 
00453 
00454   $fp->slice($qyChunk->slice);
00455   $fp->hslice($dbChunk->slice);               
00456 
00457   #
00458   # test if I'm getting the indexes right
00459   #
00460   if($self->debug > 2) {
00461     print_simple_align($fp->get_SimpleAlign, 80);
00462 
00463     my $testChunk = new Bio::EnsEMBL::Compara::Production::DnaFragChunk();
00464     $testChunk->dnafrag($qyChunk->dnafrag);
00465     $testChunk->seq_start($qyChunk->seq_start+$fp->start-1);
00466     $testChunk->seq_end($qyChunk->seq_start+$fp->end-1);
00467     my $bioseq = $testChunk->bioseq;
00468     print($bioseq->seq, "\n");
00469   }
00470 
00471 
00472   my $genomic_align1 = new Bio::EnsEMBL::Compara::GenomicAlign;
00473   $genomic_align1->method_link_species_set($self->{'method_link_species_set'});
00474   $genomic_align1->dnafrag($qyChunk->dnafrag);
00475   $genomic_align1->dnafrag_start($qyChunk->seq_start + $fp->start -1);
00476   $genomic_align1->dnafrag_end($qyChunk->seq_start + $fp->end -1);
00477   $genomic_align1->dnafrag_strand($fp->strand);
00478   $genomic_align1->level_id(1);
00479   
00480   my $cigar1 = $fp->cigar_string;
00481   $cigar1 =~ s/I/M/g;
00482   $cigar1 = compact_cigar_line($cigar1);
00483   $cigar1 =~ s/D/G/g;
00484   $genomic_align1->cigar_line($cigar1);
00485 
00486   my $genomic_align2 = new Bio::EnsEMBL::Compara::GenomicAlign;
00487   $genomic_align2->method_link_species_set($self->{'method_link_species_set'});
00488   $genomic_align2->dnafrag($dbChunk->dnafrag);
00489   $genomic_align2->dnafrag_start($dbChunk->seq_start + $fp->hstart -1);
00490   $genomic_align2->dnafrag_end($dbChunk->seq_start + $fp->hend -1);
00491   $genomic_align2->dnafrag_strand($fp->hstrand);
00492   $genomic_align2->level_id(1);
00493 
00494   my $cigar2 = $fp->cigar_string;
00495   $cigar2 =~ s/D/M/g;
00496   $cigar2 =~ s/I/D/g;
00497   $cigar2 = compact_cigar_line($cigar2);
00498   $cigar2 =~ s/D/G/g;
00499   $genomic_align2->cigar_line($cigar2);
00500 
00501   if($self->debug > 1) {
00502     print("original cigar_line ",$fp->cigar_string,"\n");
00503     print("   $cigar1\n");
00504     print("   $cigar2\n");
00505   }
00506 
00507   my $GAB = new Bio::EnsEMBL::Compara::GenomicAlignBlock;
00508   $GAB->method_link_species_set($self->{'method_link_species_set'});
00509   $GAB->genomic_align_array([$genomic_align1, $genomic_align2]);
00510   $GAB->score($fp->score);
00511   $GAB->perc_id($fp->percent_id);
00512   $GAB->length($fp->alignment_length);
00513 
00514   $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor->store($GAB);
00515   
00516   if ($self->debug) {
00517     my $track_sql = "INSERT IGNORE INTO genomic_align_block_job_track ".
00518       "(genomic_align_block_id, analysis_job_id) ".
00519         "VALUES (".$GAB->dbID.",".$self->input_job->dbID.")";
00520     print("$track_sql\n") if($self->debug);
00521     $self->{'comparaDBA'}->dbc->do($track_sql);
00522   }
00523 
00524   if($self->debug > 2) { print_simple_align($GAB->get_SimpleAlign, 80);}
00525 
00526   return $GAB;
00527 }
00528 
00529 
00530 sub compact_cigar_line
00531 {
00532   my $cigar_line = shift;
00533 
00534   #print("cigar_line '$cigar_line' => ");
00535   my @pieces = ( $cigar_line =~ /(\d*[MDI])/g );
00536   my @new_pieces = ();
00537   foreach my $piece (@pieces) {
00538     $piece =~ s/I/M/;
00539     if (! scalar @new_pieces || $piece =~ /D/) {
00540       push @new_pieces, $piece;
00541       next;
00542     }
00543     if ($piece =~ /\d*M/ && $new_pieces[-1] =~ /\d*M/) {
00544       my ($matches1) = ($piece =~ /(\d*)M/);
00545       my ($matches2) = ($new_pieces[-1] =~ /(\d*)M/);
00546       if (! defined $matches1 || $matches1 eq "") {
00547         $matches1 = 1;
00548       }
00549       if (! defined $matches2 || $matches2 eq "") {
00550         $matches2 = 1;
00551       }
00552       $new_pieces[-1] = $matches1 + $matches2 . "M";
00553     } else {
00554       push @new_pieces, $piece;
00555     }
00556   }
00557   my $new_cigar_line = join("", @new_pieces);
00558   #print(" '$new_cigar_line'\n");
00559   return $new_cigar_line;
00560 }
00561 
00562 
00563 sub print_simple_align
00564 {
00565   my $alignment = shift;
00566   my $aaPerLine = shift;
00567   $aaPerLine=40 unless($aaPerLine and $aaPerLine > 0);
00568   
00569   my ($seq1, $seq2)  = $alignment->each_seq;
00570   my $seqStr1 = "|".$seq1->seq().'|';
00571   my $seqStr2 = "|".$seq2->seq().'|';
00572 
00573   my $enddiff = length($seqStr1) - length($seqStr2);
00574   while($enddiff>0) { $seqStr2 .= " "; $enddiff--; }
00575   while($enddiff<0) { $seqStr1 .= " "; $enddiff++; }
00576 
00577   my $label1 = sprintf("%40s : ", $seq1->id);
00578   my $label2 = sprintf("%40s : ", "");
00579   my $label3 = sprintf("%40s : ", $seq2->id);
00580 
00581   my $line2 = "";
00582   for(my $x=0; $x<length($seqStr1); $x++) {
00583     if(substr($seqStr1,$x,1) eq substr($seqStr2, $x,1)) { $line2.='|'; } else { $line2.=' '; }
00584   }
00585 
00586   my $offset=0;
00587   my $numLines = (length($seqStr1) / $aaPerLine);
00588   while($numLines>0) {
00589     printf("$label1 %s\n", substr($seqStr1,$offset,$aaPerLine));
00590     printf("$label2 %s\n", substr($line2,$offset,$aaPerLine));
00591     printf("$label3 %s\n", substr($seqStr2,$offset,$aaPerLine));
00592     print("\n\n");
00593     $offset+=$aaPerLine;
00594     $numLines--;
00595   }
00596 }
00597 
00598 1;