Archive Ensembl HomeArchive Ensembl Home
AlignmentProcessing.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::PairAligner::AlignmentProcessing
00022 
00023 =head1 SYNOPSIS
00024 
00025 Abstract base class of AlignmentChains and AlignmentNets
00026 
00027 =head1 DESCRIPTION
00028 
00029 =cut
00030 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::AlignmentProcessing;
00031 
00032 use vars qw(@ISA);
00033 use strict;
00034 
00035 #use Bio::EnsEMBL::Hive::Process;
00036 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00037 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
00038 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00039 use Bio::EnsEMBL::Utils::SqlHelper;
00040 
00041 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00042 
00043 ############################################################
00044 sub new {
00045   my ($class,@args) = @_;
00046   my $self = $class->SUPER::new(@args);
00047  
00048   return $self;
00049 }
00050 
00051 sub fetch_input {
00052     my ($self) = @_;
00053     
00054     $self->param('query_DnaFrag_hash', {});
00055     $self->param('target_DnaFrag_hash', {});
00056 
00057     unless (defined $self->param('do_transactions')) {
00058     $self->param('do_transactions', 1);
00059     }
00060 }
00061 
00062 
00063 =head2 run
00064 
00065   Arg [1]   : Bio::EnsEMBL::Analysis::RunnableDB
00066   Function  : cycles through all the runnables, calls run and pushes
00067   their output into the RunnableDBs output array
00068   Returntype: array ref
00069   Exceptions: none
00070   Example   : 
00071 
00072 =cut
00073 
00074 sub run{
00075   my ($self) = @_;
00076   foreach my $runnable(@{$self->runnable}){
00077     $runnable->run;
00078     my $converted_output = $self->convert_output($runnable->output);
00079     $self->output($converted_output);
00080     rmdir($runnable->workdir) if (defined $runnable->workdir);
00081   }
00082 }
00083 
00084 
00085 
00086 
00087 =head2 write_output
00088 
00089     Title   :   write_output
00090     Usage   :   $self->write_output()
00091     Function:   Writes contents of $self->output into $self->db
00092     Returns :   1
00093     Args    :   None
00094 
00095 =cut
00096 
00097 sub write_output {
00098   my($self) = @_;
00099 
00100   my @gen_al_groups;
00101 
00102   #
00103   #Start transaction
00104   #
00105   if ($self->param('do_transactions'))  {
00106       my $compara_conn = $self->compara_dba->dbc;
00107       my $compara_helper = Bio::EnsEMBL::Utils::SqlHelper->new(-DB_CONNECTION => $compara_conn);
00108       $compara_helper->transaction(-CALLBACK => sub {
00109       $self->_write_output;
00110       });
00111   } else {
00112       $self->_write_output;
00113   }
00114 
00115   return 1;
00116 
00117 }
00118 
00119 sub _write_output {
00120   my ($self) = @_;
00121 
00122   #Set use_autoincrement to 1 otherwise the GenomicAlignBlockAdaptor will use
00123   #LOCK TABLES which does an implicit commit and prevent any rollback
00124   $self->compara_dba->get_GenomicAlignBlockAdaptor->use_autoincrement(1);
00125 
00126   foreach my $chain (@{$self->output}) {
00127       my $group_id;
00128 
00129       #store first block
00130       my $first_block = shift @$chain;
00131       $self->compara_dba->get_GenomicAlignBlockAdaptor->store($first_block);
00132     
00133       #Set the group_id if one doesn't already exist ie for chains, to be the
00134       #dbID of the first genomic_align_block. For nets,the group_id has already
00135       #been set and is the same as it's chain.
00136       unless (defined($first_block->group_id)) {
00137       $group_id = $first_block->dbID;
00138       $self->compara_dba->get_GenomicAlignBlockAdaptor->store_group_id($first_block, $group_id);
00139       }
00140 
00141       #store the rest of the genomic_align_blocks
00142       foreach my $block (@$chain) {
00143       if (defined $group_id) {
00144           $block->group_id($group_id);
00145       }
00146       $self->compara_dba->get_GenomicAlignBlockAdaptor->store($block);
00147      }
00148   }
00149 }
00150 
00151 ###########################################
00152 # chain sorting
00153 ###########################################
00154 sub sort_chains_by_max_block_score {
00155   my ($self, $chains) = @_;
00156 
00157   # sort the chains by maximum score
00158   my @chain_hashes;
00159   foreach my $chain (@$chains) {
00160     my $chain_hash = { chain => $chain };
00161     foreach my $block (@$chain) {
00162       if (not exists $chain_hash->{qname}) {
00163         $chain_hash->{qname} = $block->seqname;
00164         $chain_hash->{tname} = $block->hseqname;
00165       }
00166       if (not exists $chain_hash->{score} or
00167           $block->score > $chain_hash->{score}) {
00168         $chain_hash->{score} = $block->score;
00169       }
00170     }
00171     push @chain_hashes, $chain_hash;
00172   }
00173   
00174   my @sorted = map { $_->{chain}} sort {
00175     $b->{score} <=> $a->{score} 
00176     or $a->{qname} cmp $b->{qname}
00177     or $a->{tname} cmp $b->{tname}
00178   } @chain_hashes;
00179 
00180   return \@sorted;
00181 }
00182 
00183 
00184 ###########################################
00185 # feature splitting
00186 ###########################################
00187 sub split_feature {
00188   my ($self, $f, $max_gap) = @_;
00189 
00190   my @split_dafs;
00191   
00192   my $need_to_split = 0;
00193 
00194   my @pieces = split(/(\d*[MDI])/, $f->cigar_string);
00195   foreach my $piece ( @pieces ) {
00196     next if ($piece !~ /^(\d*)([MDI])$/);
00197     my $num = ($1 or 1);
00198     my $type = $2;
00199 
00200     if (($type eq "I" or $type eq "D") and $num >= $max_gap) {
00201       $need_to_split = 1;
00202       last;
00203     }
00204   }
00205   
00206   if ($need_to_split) {
00207     my (@new_feats);
00208     foreach my $ug (sort {$a->start <=> $b->start} $f->ungapped_features) {
00209       if (@new_feats) {
00210         my ($dist, $hdist);
00211 
00212         my $last_ug = $new_feats[-1]->[-1];
00213 
00214         if ($ug->end < $last_ug->start) {
00215           # blocks in reverse orienation
00216           $dist = $last_ug->start - $ug->end - 1;
00217         } else {
00218           # blocks in forward orienatation
00219           $dist = $ug->start - $last_ug->end - 1;
00220         }
00221         if ($ug->hend < $last_ug->hstart) {
00222           # blocks in reverse orienation
00223           $hdist = $last_ug->hstart - $ug->hend - 1;
00224         } else {
00225           # blocks in forward orienatation
00226           $hdist = $ug->hstart - $last_ug->hend - 1;
00227         }
00228 
00229         if ($dist >= $max_gap or $hdist >= $max_gap) {
00230           push @new_feats, [];
00231         }
00232       } else {
00233         push @new_feats, [];
00234       }
00235       push @{$new_feats[-1]}, $ug;
00236     }
00237     
00238     foreach my $mini_list (@new_feats) {
00239       push @split_dafs, Bio::EnsEMBL::DnaDnaAlignFeature->new(-features => $mini_list);
00240     }
00241 
00242   } else {
00243     @split_dafs = ($f)
00244   }  
00245 
00246   return @split_dafs;
00247 }
00248 
00249 ############################################
00250 # cigar conversion
00251 ############################################
00252 
00253 sub compara_cigars_from_daf_cigar {
00254   my ($self, $daf_cigar) = @_;
00255 
00256   my ($q_cigar_line, $t_cigar_line, $align_length);
00257 
00258   my @pieces = split(/(\d*[MDI])/, $daf_cigar);
00259 
00260   my ($q_counter, $t_counter) = (0,0);
00261 
00262   foreach my $piece ( @pieces ) {
00263 
00264     next if ($piece !~ /^(\d*)([MDI])$/);
00265     
00266     my $num = ($1 or 1);
00267     my $type = $2;
00268     
00269     if( $type eq "M" ) {
00270       $q_counter += $num;
00271       $t_counter += $num;
00272       
00273     } elsif( $type eq "D" ) {
00274       $q_cigar_line .= (($q_counter == 1) ? "" : $q_counter)."M";
00275       $q_counter = 0;
00276       $q_cigar_line .= (($num == 1) ? "" : $num)."D";
00277       $t_counter += $num;
00278       
00279     } elsif( $type eq "I" ) {
00280       $q_counter += $num;
00281       $t_cigar_line .= (($t_counter == 1) ? "" : $t_counter)."M";
00282       $t_counter = 0;
00283       $t_cigar_line .= (($num == 1) ? "" : $num)."D";
00284     }
00285     $align_length += $num;
00286   }
00287 
00288   $q_cigar_line .= (($q_counter == 1) ? "" : $q_counter)."M"
00289       if $q_counter;
00290   $t_cigar_line .= (($t_counter == 1) ? "" : $t_counter)."M"
00291       if $t_counter;
00292   
00293   return ($q_cigar_line, $t_cigar_line, $align_length);
00294 }
00295 
00296 
00297 sub daf_cigar_from_compara_cigars {
00298   my ($self, $q_cigar, $t_cigar) = @_;
00299 
00300   my (@q_pieces, @t_pieces);
00301   foreach my $piece (split(/(\d*[MDGI])/, $q_cigar)) {
00302     next if ($piece !~ /^(\d*)([MDGI])$/);
00303 
00304     my $num = $1; $num = 1 if $num eq "";
00305     my $type = $2; $type = 'D' if $type ne 'M';
00306 
00307     if ($num > 0) {
00308       push @q_pieces, { num  => $num,
00309                         type => $type, 
00310                       };
00311     }
00312   }
00313   foreach my $piece (split(/(\d*[MDGI])/, $t_cigar)) {
00314     next if ($piece !~ /^(\d*)([MDGI])$/);
00315     
00316     my $num = $1; $num = 1 if $num eq "";
00317     my $type = $2; $type = 'D' if $type ne 'M';
00318 
00319     if ($num > 0) {
00320       push @t_pieces, { num  => $num,
00321                         type => $type,
00322                       };
00323     }
00324   }
00325 
00326   my $daf_cigar = "";
00327 
00328   while(@q_pieces and @t_pieces) {
00329     # should never be left with a q piece and no target pieces, or vice versa
00330     my $q = shift @q_pieces;
00331     my $t = shift @t_pieces;
00332 
00333     if ($q->{num} == $t->{num}) {
00334       if ($q->{type} eq 'M' and $t->{type} eq 'M') {
00335         $daf_cigar .= ($q->{num} > 1 ? $q->{num} : "") . 'M';
00336       } elsif ($q->{type} eq 'M' and $t->{type} eq 'D') {
00337         $daf_cigar .= ($q->{num} > 1 ? $q->{num} : "") . 'I';
00338       } elsif ($q->{type} eq 'D' and $t->{type} eq 'M') {
00339         $daf_cigar .= ($q->{num} > 1 ? $q->{num} : "") . 'D';
00340       } else {
00341         # must be a delete in both seqs; warn and ignore
00342         warn("The following cigars have a simultaneous gap:\n" . 
00343              $q_cigar . "\n". 
00344              $t_cigar . "\n");
00345       }
00346     } elsif ($q->{num} > $t->{num}) {
00347       if ($q->{type} ne 'M') {
00348         warn("The following cigars are strange:\n" . 
00349              $q_cigar . "\n". 
00350              $t_cigar . "\n");
00351       }
00352       
00353       if ($t->{type} eq 'M') {
00354         $daf_cigar .= ($t->{num} > 1 ? $t->{num} : "") . 'M';
00355       } elsif ($t->{type} eq 'D') {
00356         $daf_cigar .= ($t->{num} > 1 ? $t->{num} : "") . 'I';
00357       } 
00358 
00359       unshift @q_pieces, { 
00360         type => 'M',
00361         num  => $q->{num} - $t->{num}, 
00362       };
00363 
00364     } else {
00365       # $t->{num} > $q->{num}
00366       if ($t->{type} ne 'M') {
00367         warn("The following cigars are strange:\n" . 
00368              $q_cigar . "\n". 
00369              $t_cigar . "\n");
00370       }
00371       
00372       if ($q->{type} eq 'M') {
00373         $daf_cigar .= ($q->{num} > 1 ? $q->{num} : "") . 'M';
00374       } elsif ($q->{type} eq 'D') {
00375         $daf_cigar .= ($q->{num} > 1 ? $q->{num} : "") . 'D';
00376       } 
00377       unshift @t_pieces, { 
00378         type => 'M',
00379         num  => $t->{num} - $q->{num},
00380       };
00381     } 
00382   }
00383 
00384   # final sanity checks
00385 
00386   if (@q_pieces or @t_pieces) {
00387     warn("Left with dangling pieces in the following cigars:\n" .
00388           $q_cigar . "\n". 
00389           $t_cigar . "\n");
00390     return undef;
00391   }
00392   
00393   my $last_type;
00394   foreach my $piece (split(/(\d*[MDI])/, $daf_cigar)) {
00395     next if not $piece;
00396     my ($type) = ($piece =~ /\d*([MDI])/);
00397 
00398     if (defined $last_type and
00399        (($last_type eq 'I' and $type eq 'D') or
00400         ($last_type eq 'D' and $type eq 'I'))) {
00401 
00402       warn("Adjacent Insert/Delete in the following cigars:\n" .
00403            $q_cigar . "\n". 
00404            $t_cigar . "\n".
00405            $daf_cigar . "\n");
00406 
00407       return undef;
00408     }
00409     $last_type = $type;
00410   }
00411   
00412   return $daf_cigar;
00413 }
00414 
00415 
00416 sub convert_output {
00417   my ($self, $chains_of_dafs, $t_visible) = @_; 
00418 
00419   my (@chains_of_blocks);
00420   
00421   $t_visible = 1 if (!defined $t_visible);
00422 
00423   foreach my $chain_of_dafs (@$chains_of_dafs) {
00424     my @chain_of_blocks;
00425 
00426     foreach my $raw_daf (sort {$a->start <=> $b->start} @$chain_of_dafs) {
00427       my @split_dafs;
00428       if ($self->param('max_gap')) {
00429         @split_dafs = $self->split_feature($raw_daf, $self->param('max_gap'));
00430       } else {
00431         @split_dafs = ($raw_daf);
00432       }
00433 
00434       foreach my $daf (@split_dafs) {
00435         my ($q_cigar, $t_cigar, $al_len) = 
00436             $self->compara_cigars_from_daf_cigar($daf->cigar_string);
00437         
00438         my $q_dnafrag = $self->param('query_DnaFrag_hash')->{$daf->seqname};
00439         my $t_dnafrag = $self->param('target_DnaFrag_hash')->{$daf->hseqname};
00440         
00441         my $out_mlss = $self->param('output_MethodLinkSpeciesSet');
00442 
00443         my $q_genomic_align = Bio::EnsEMBL::Compara::GenomicAlign->new
00444             (-dnafrag        => $q_dnafrag,
00445              -dnafrag_start  => $daf->start,
00446              -dnafrag_end    => $daf->end,
00447              -dnafrag_strand => $daf->strand,
00448              -cigar_line     => $q_cigar,
00449          -visible        => 1,      #always set to true
00450              -method_link_species_set => $out_mlss);
00451   
00452         my $t_genomic_align = Bio::EnsEMBL::Compara::GenomicAlign->new
00453             (-dnafrag        => $t_dnafrag,
00454              -dnafrag_start  => $daf->hstart,
00455              -dnafrag_end    => $daf->hend,
00456              -dnafrag_strand => $daf->hstrand,
00457              -cigar_line     => $t_cigar,
00458          -visible        => $t_visible, #will be false for example, self alignments
00459              -method_link_species_set => $out_mlss);
00460 
00461         my $gen_al_block = Bio::EnsEMBL::Compara::GenomicAlignBlock->new
00462             (-genomic_align_array     => [$q_genomic_align, $t_genomic_align],
00463              -score                   => $daf->score,
00464              -length                  => $al_len,
00465              -method_link_species_set => $out_mlss,
00466          -group_id                => $daf->group_id,
00467          -level_id                => $daf->level_id ? $daf->level_id : 1);
00468         push @chain_of_blocks, $gen_al_block;
00469       }
00470     }
00471 
00472     push @chains_of_blocks, \@chain_of_blocks;
00473   }
00474     
00475   return \@chains_of_blocks;
00476 }
00477 
00478 sub cleanse_output {
00479   my ($self, $chains) = @_;
00480 
00481   # need to "cleanse" the of its original database attachments, so 
00482   # that it is stored as a fresh blocks. This involves touching the
00483   # object's privates, but more efficent than creating brand-new
00484   # blocks from scratch
00485   # NB don't undef group_id - I want to keep the chain group_id for the net.
00486 
00487   foreach my $chain (@{$chains}) {
00488     foreach my $gab (@{$chain}) {
00489 
00490       $gab->{'adaptor'} = undef;
00491       $gab->{'dbID'} = undef;
00492       $gab->{'method_link_species_set_id'} = undef;
00493       $gab->method_link_species_set($self->param('output_MethodLinkSpeciesSet'));
00494       foreach my $ga (@{$gab->get_all_GenomicAligns}) {
00495         $ga->{'adaptor'} = undef;
00496         $ga->{'dbID'} = undef;
00497         $ga->{'method_link_species_set_id'} = undef;
00498         $ga->method_link_species_set($self->param('output_MethodLinkSpeciesSet'));
00499       }
00500     }
00501   }
00502 
00503 }
00504 
00505 
00506 ###################################
00507 # redundant alignment deletion
00508 
00509 sub delete_alignments {
00510   my ($self, $mlss, $qy_dnafrag, $tg_dnafrag) = @_;
00511 
00512   my $dbc = $self->compara_dba->dbc;
00513   my $sql = "SELECT ga1.genomic_align_block_id, ga1.genomic_align_id, ga2.genomic_align_id
00514       FROM genomic_align ga1, genomic_align ga2
00515       WHERE ga1.genomic_align_block_id=ga2.genomic_align_block_id
00516       AND ga1.genomic_align_id != ga2.genomic_align_id
00517       AND ga1.dnafrag_id = ?
00518       AND ga1.method_link_species_set_id = ?";
00519   if (defined $tg_dnafrag) {
00520     $sql .= " AND ga2.dnafrag_id = ?";
00521   }
00522 
00523 
00524   my $sth = $dbc->prepare($sql);
00525   if (defined $tg_dnafrag) {
00526     $sth->execute( $qy_dnafrag->dbID, $mlss->dbID, $tg_dnafrag->dbID);
00527   } else {
00528     $sth->execute( $qy_dnafrag->dbID, $mlss->dbID);
00529   }
00530 
00531   my $nb_gabs = 0;
00532   my @gabs;
00533   while (my $aref = $sth->fetchrow_arrayref) {
00534     my ($gab_id, $ga_id1, $ga_id2) = @$aref;
00535     push @gabs, [$gab_id, $ga_id1, $ga_id2];
00536     $nb_gabs++;
00537   }
00538 
00539   my $sql_gab = "delete from genomic_align_block where genomic_align_block_id in ";
00540   my $sql_ga = "delete from genomic_align where genomic_align_id in ";
00541 
00542   for (my $i=0; $i < scalar @gabs; $i=$i+20000) {
00543     my (@gab_ids, @ga1_ids, @ga2_ids);
00544     for (my $j = $i; ($j < scalar @gabs && $j < $i+20000); $j++) {
00545       push @gab_ids, $gabs[$j][0];
00546       push @ga1_ids, $gabs[$j][1];
00547       push @ga2_ids, $gabs[$j][2];
00548     }
00549     my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
00550     my $sql_ga_to_exec1 = $sql_ga . "(" . join(",", @ga1_ids) . ")";
00551     my $sql_ga_to_exec2 = $sql_ga . "(" . join(",", @ga2_ids) . ")";
00552 
00553     foreach my $sql ($sql_gab_to_exec,$sql_ga_to_exec1,$sql_ga_to_exec2) {
00554       my $sth = $dbc->prepare($sql);
00555       $sth->execute;
00556       $sth->finish;
00557     }
00558   }
00559 }
00560 
00561 
00562 ###################################
00563 
00564 sub output{
00565     my ($self, $output) = @_;
00566     if(!$self->param('output')){
00567     $self->param('output',[]);
00568     }
00569     if($output){
00570     my $this_output = $self->param('output');
00571     if(ref($output) ne 'ARRAY'){
00572         throw('Must pass RunnableDB:output an array ref not a '.$output);
00573     }
00574     push(@{$this_output}, @$output);
00575     $self->param('output', $this_output);
00576     }
00577     return $self->param('output');
00578 }
00579 
00580 1;