Archive Ensembl HomeArchive Ensembl Home
FilterDuplicates.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::PairAligner::FilterDuplicates
00022 
00023 =cut
00024 
00025 =head1 SYNOPSIS
00026 
00027 =cut
00028 
00029 =head1 DESCRIPTION
00030 
00031 This analysis/RunnableDB is designed to run after all GenomicAlignBlock entries for a 
00032 specific MethodLinkSpeciesSet has been completed and filters out all duplicate entries
00033 which can result from jobs being rerun or from regions of overlapping chunks generating
00034 the same HSP hits.  It takes as input (on the input_id string) 
00035 
00036 =cut
00037 
00038 =head1 APPENDIX
00039 
00040 The rest of the documentation details each of the object methods.
00041 Internal methods are usually preceded with a _
00042 
00043 =cut
00044 
00045 package Bio::EnsEMBL::Compara::RunnableDB::PairAligner::FilterDuplicates;
00046 
00047 use strict;
00048 use Time::HiRes qw(time gettimeofday tv_interval);
00049 
00050 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00051 use Bio::EnsEMBL::DBLoader;
00052 
00053 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00054 use Bio::EnsEMBL::Compara::Production::DnaFragChunk;
00055 use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00056 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
00057 
00058 use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
00059 
00060 use Bio::EnsEMBL::Analysis::RunnableDB;
00061 
00062 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00063 
00064 =head2 fetch_input
00065 
00066     Title   :   fetch_input
00067     Usage   :   $self->fetch_input
00068     Function:   prepares global variables and DB connections
00069     Returns :   none
00070     Args    :   none
00071 
00072 =cut
00073 
00074 sub fetch_input {
00075   my( $self) = @_;
00076 
00077   unless (defined $self->param('window_size')) {
00078       $self->param('window_size',1000000);  
00079   }
00080 
00081   my $meta_container = $self->compara_dba->get_MetaContainer;
00082 
00083   #Read chunk_size from meta table
00084   unless (defined $self->param('chunk_size')) {
00085       my $key = "chunk_" . $self->param('method_link_species_set_id');
00086       my @option_list = $meta_container->list_value_by_key($key);
00087       my $option = $option_list[0][0];
00088       my $chunks = eval $option;
00089       $self->param('chunks', $chunks);
00090 
00091       if ($self->param('reference_species')) {
00092       $self->param('chunk_size', $chunks->{'ref'}{'chunk_size'});
00093       $self->param('overlap', $chunks->{'ref'}{'overlap'});
00094       } else {
00095       $self->param('chunk_size', $chunks->{'non_ref'}{'chunk_size'});
00096       $self->param('overlap', $chunks->{'non_ref'}{'overlap'});
00097       }
00098 
00099   }
00100 
00101   throw("No dnafrag_id specified") unless defined($self->param('dnafrag_id'));
00102   throw("Window size (".$self->param('window_size').")must be > 0") if (!$self->param('window_size') or $self->param('window_size') <= 0);
00103   $self->print_params;
00104 
00105   $self->compara_dba->dbc->disconnect_when_inactive(0);
00106     
00107   return 1;
00108 }
00109 
00110 
00111 sub run
00112 {
00113   my $self = shift;
00114   $self->filter_duplicates;
00115   return 1;
00116 }
00117 
00118 
00119 sub write_output 
00120 {
00121   my $self = shift;
00122 
00123   my $output_id = $self->input_id;
00124 
00125   print("output_id = $output_id\n");
00126   #$self->input_id($output_id);
00127   return 1;
00128 }
00129 
00130 
00131 ######################################
00132 #
00133 # subroutines
00134 #
00135 #####################################
00136 
00137 
00138 sub print_params {
00139   my $self = shift;
00140 
00141   print(" params:\n");
00142   print("   method_link_species_set_id : ", $self->param('method_link_species_set_id'),"\n");
00143 }
00144 
00145 
00146 sub filter_duplicates {
00147   my $self = shift;
00148 
00149   my $overlap = $self->param('overlap');
00150   my $chunk_size = $self->param('chunk_size');
00151   my $window_size = $self->param('window_size');
00152   $self->param('overlap_count', 0);
00153   $self->param('identical_count', 0);
00154   $self->param('gab_count', 0);
00155   $self->param('truncate_count', 0);
00156   $self->param('not_truncate_count', 0);
00157   $self->param('delete_hash', {}); #all the genomic_align_blocks that need to be deleted
00158 
00159   my $mlss = $self->compara_dba->get_MethodLinkSpeciesSetAdaptor->fetch_by_dbID($self->param('method_link_species_set_id'));
00160 #  my ($gdb1, $gdb2) = @{$mlss->species_set};
00161 #  if($gdb1->dbID > $gdb2->dbID) {
00162 #    my $tmp = $gdb1; $gdb1=$gdb2; $gdb2=$tmp;
00163 #  }
00164 
00165   my $GAB_DBA = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00166   # create a list of dnafrag here in case we want to make this runnable working with an input_id 
00167   # containing a list of dnafrag_ids
00168   my $dnafrag_list = [$self->compara_dba->get_DnaFragAdaptor->fetch_by_dbID($self->param('dnafrag_id'))];
00169 
00170   foreach my $dnafrag (@{$dnafrag_list}) {
00171     my $seq_region_start = $self->param('seq_region_start');
00172     $seq_region_start = 1 unless (defined $seq_region_start);
00173     #printf("dnafrag (%d)%s:%s len=%d\n", $dnafrag->dbID, $dnafrag->coord_system_name, $dnafrag->name, $dnafrag->length);
00174     my $seq_region_end = $self->param('seq_region_end');
00175     $seq_region_end = $dnafrag->length unless (defined $seq_region_end);
00176 
00177     if ($dnafrag->is_reference) {
00178     #find identical matches over all the dnafrag
00179     $self->find_identical_matches($seq_region_start, $seq_region_end, $window_size, $mlss, $dnafrag);
00180     
00181     #find edge artefacts only in the overlap regions
00182     if (defined $overlap && $overlap > 0) {
00183         $self->find_edge_artefacts($seq_region_start, $seq_region_end, $overlap, $chunk_size, $mlss, $dnafrag);
00184     }
00185     } else {
00186     #get correct start and end if non_reference eg haplotype. 
00187     #NB cannot overwrite by defining seq_region_start and seq_region_end
00188     if (!$dnafrag->is_reference) {
00189         my $slice_adaptor = $dnafrag->genome_db->db_adaptor->get_SliceAdaptor;
00190         my $slices = $slice_adaptor->fetch_by_region_unique($dnafrag->coord_system_name, $dnafrag->name);
00191         foreach my $slice (@$slices) {
00192         my $seq_region_start = $slice->start;
00193         my $seq_region_end = $slice->end;
00194 
00195         #find identical matches over all the dnafrag
00196         $self->find_identical_matches($seq_region_start, $seq_region_end, $window_size, $mlss, $dnafrag);
00197         
00198         #find edge artefacts only in the overlap regions
00199         if (defined $overlap && $overlap > 0) {
00200             $self->find_edge_artefacts($seq_region_start, $seq_region_end, $overlap, $chunk_size, $mlss, $dnafrag);
00201         }
00202         }
00203     }
00204     }
00205     
00206   }
00207 
00208   my @del_list = values(%{$self->param('delete_hash')});
00209 
00210   my $sql_ga = "delete ignore from genomic_align where genomic_align_id in ";
00211   my $sql_gab = "delete ignore from genomic_align_block where genomic_align_block_id in ";
00212 
00213   for (my $i=0; $i < scalar @del_list; $i=$i+1000) {
00214       my (@gab_ids, @ga_ids);
00215       for (my $j = $i; ($j < scalar @del_list && $j < $i+1000); $j++) {
00216       my $gab = $del_list[$j];
00217       push @gab_ids, $gab->dbID;
00218       foreach my $ga (@{$gab->genomic_align_array}) {
00219           push @ga_ids, $ga->dbID;
00220       }
00221       }
00222       my $sql_gab_to_exec = $sql_gab . "(" . join(",", @gab_ids) . ")";
00223       my $sql_ga_to_exec = $sql_ga . "(" . join(",", @ga_ids) . ")";
00224       
00225       foreach my $sql ($sql_ga_to_exec,$sql_gab_to_exec) {
00226       my $sth = $self->compara_dba->dbc->prepare($sql);
00227       $sth->execute;
00228       $sth->finish;
00229        }
00230   }
00231 
00232 #   foreach my $gab (@del_list) {
00233 #     #print("DELETE "); print_gab($gab);
00234 #     foreach my $ga (@{$gab->genomic_align_array}) {
00235 #       $sth_genomic_align_group->execute($ga->dbID);
00236 #       $sth_genomic_align->execute($ga->dbID);
00237 #     }
00238 #     $sth_genomic_align_block->execute($gab->dbID);
00239 #   }
00240 
00241 #   $sth_genomic_align_group->finish;
00242 #   $sth_genomic_align->finish;
00243 #   $sth_genomic_align_block->finish;
00244 
00245   printf("%d gabs to delete\n", scalar(keys(%{$self->param('delete_hash')})));
00246   printf("found %d equal GAB pairs\n", $self->param('identical_count'));
00247   printf("found %d overlapping GABs\n", $self->param('overlap_count'));
00248   printf("%d GABs loadled\n", $self->param('gab_count'));
00249   printf("%d TRUNCATE gabs\n", $self->param('truncate_count'));
00250   printf("%d not TRUNCATE gabs\n", $self->param('not_truncate_count'));
00251 }
00252 
00253 #Remove identical matches over all the dnafrag to remove matches either from 
00254 #overlaps or because a job has been run more than once
00255 sub find_identical_matches {
00256     my ($self, $region_start, $seq_region_end, $window_size, $mlss, $dnafrag) = @_;
00257 
00258     my $GAB_DBA = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00259 
00260     while($region_start <= $seq_region_end) {
00261     my  $region_end = $region_start + $window_size;
00262 
00263     my $genomic_align_block_list = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_DnaFrag
00264       ($mlss, $dnafrag, $region_start, $region_end);
00265     
00266     printf STDERR "IDENTICAL MATCHES: dnafrag %s %s %d:%d has %d GABs\n", $dnafrag->coord_system_name, $dnafrag->name, 
00267           $region_start, $region_end, scalar(@$genomic_align_block_list);
00268 
00269     if ($self->debug) {
00270         foreach my $gab (@{$genomic_align_block_list}) {
00271         $self->assign_jobID_to_genomic_align_block($gab);
00272         }
00273     }
00274 
00275     my $sort_time = time();
00276     # first sort the list for processing
00277     my @sorted_GABs = sort sort_alignments @$genomic_align_block_list;
00278     $self->param('gab_count', $self->param('gap_count')+scalar(@sorted_GABs));
00279     
00280     # remove all the equal duplicates from the list
00281     $self->removed_equals_from_genomic_align_block_list(\@sorted_GABs);
00282     
00283         $region_start = $region_end;
00284     }
00285 }
00286 
00287 #Remove matches spanning the overlap using "in_chunk_overlap" mode which 
00288 #checks just the region of the overlap and not the whole dnafrag
00289 sub find_edge_artefacts {
00290     my ($self, $region_start, $seq_region_end, $overlap, $chunk_size, $mlss, $dnafrag) = @_;
00291 
00292     my $GAB_DBA = $self->compara_dba->get_GenomicAlignBlockAdaptor;
00293 
00294     $region_start += $chunk_size - $overlap;
00295     while($region_start <= $seq_region_end) {
00296        my $region_end = $region_start + $overlap - 1;
00297 
00298        my $genomic_align_block_list = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_DnaFrag
00299         ($mlss, $dnafrag, $region_start, $region_end);
00300        
00301        printf STDERR "EDGE ARTEFACTS: dnafrag %s %s %d:%d has %d GABs\n", $dnafrag->coord_system_name, $dnafrag->name, $region_start, $region_end, scalar(@$genomic_align_block_list);
00302        
00303        if ($self->debug) {
00304        foreach my $gab (@{$genomic_align_block_list}) {
00305            $self->assign_jobID_to_genomic_align_block($gab);
00306        }
00307        }
00308 
00309        # first sort the list for processing
00310        my @sorted_GABs = sort sort_alignments @$genomic_align_block_list;
00311        $self->param('gab_count', $self->param('gab_count')+scalar(@sorted_GABs));
00312 
00313        # now process remaining list (still sorted) for overlaps
00314        $self->remove_edge_artifacts_from_genomic_align_block_list(\@sorted_GABs, $region_start, $region_end);
00315        
00316         $region_start += $chunk_size - $overlap;
00317     }
00318 
00319 }
00320 
00321 sub sort_alignments{
00322   # $a,$b are GenomicAlignBlock objects
00323   my $a_ref = $a->reference_genomic_align;
00324   my ($a_other) = @{$a->get_all_non_reference_genomic_aligns};
00325   my $b_ref = $b->reference_genomic_align;
00326   my ($b_other) = @{$b->get_all_non_reference_genomic_aligns};
00327 
00328   return 
00329     ($a_other->dnafrag_id <=> $b_other->dnafrag_id) ||
00330     ($a_ref->dnafrag_strand <=> $b_ref->dnafrag_strand) ||
00331     ($a_other->dnafrag_strand <=> $b_other->dnafrag_strand) ||
00332     ($a_ref->dnafrag_start <=> $b_ref->dnafrag_start) ||
00333     ($a_ref->dnafrag_end <=> $b_ref->dnafrag_end) ||
00334     ($a_other->dnafrag_start <=> $b_other->dnafrag_start) ||
00335     ($a_other->dnafrag_end <=> $b_other->dnafrag_end) ||
00336     ($a->score <=> $b->score) ||
00337     ($a->dbID <=> $b->dbID);
00338 }
00339 
00340 
00341 sub assign_jobID_to_genomic_align_block {
00342   my $self = shift;
00343   my $gab = shift;
00344 
00345   my $sql = "SELECT analysis_job_id FROM genomic_align_block_job_track ".
00346                   "WHERE genomic_align_block_id=?";
00347   my $sth = $self->compara_dba->prepare($sql);
00348   $sth->execute($gab->dbID);
00349   my ($job_id) = $sth->fetchrow_array();
00350   $sth->finish;
00351   $gab->{'analysis_job_id'} = $job_id;
00352 }
00353 
00354 
00355 sub remove_deletes_from_list {
00356   my $self = shift;
00357   my $genomic_align_block_list = shift;
00358 
00359   my @new_list;  
00360   foreach my $gab (@$genomic_align_block_list) {
00361     push @new_list, $gab unless($self->param('delete_hash')->{$gab->dbID});
00362   }
00363   @$genomic_align_block_list = @new_list;
00364   return $genomic_align_block_list; 
00365 }
00366 
00367 
00368 sub removed_equals_from_genomic_align_block_list {
00369   my $self = shift;
00370   my $genomic_align_block_list = shift;
00371   my $region_start = shift;
00372   my $region_end = shift;
00373   
00374   return unless(scalar(@$genomic_align_block_list));
00375   if($self->debug > 2) {
00376     for(my $index=0; $index<(scalar(@$genomic_align_block_list)); $index++) {
00377       my $gab1 = $genomic_align_block_list->[$index];
00378       print_gab($gab1);
00379     }
00380   }
00381   for(my $index=0; $index<(scalar(@$genomic_align_block_list)); $index++) {
00382     my $gab1 = $genomic_align_block_list->[$index];
00383     next if($self->param('delete_hash')->{$gab1->dbID}); #already deleted so skip it
00384     
00385     for(my $index2=$index+1; $index2<(scalar(@$genomic_align_block_list)); $index2++) {
00386       my $gab2 = $genomic_align_block_list->[$index2];
00387       last if($gab2->reference_genomic_align->dnafrag_start > 
00388               $gab1->reference_genomic_align->dnafrag_start);
00389 
00390       next if($self->param('delete_hash')->{$gab2->dbID}); #already deleted so skip it
00391 
00392       if(genomic_align_blocks_identical($gab1, $gab2)) {
00393         if ($self->debug) {
00394           if($gab1->{'analysis_job_id'} == $gab2->{'analysis_job_id'}) {
00395             printf("WARNING!!!!!! identical GABs dbID:%d,%d  SAME JOB:%d,%d\n", 
00396                    $gab1->dbID, $gab2->dbID, 
00397                    $gab1->{'analysis_job_id'},$gab2->{'analysis_job_id'},);
00398             print("  "); print_gab($gab1);
00399             print("  "); print_gab($gab2);
00400           }
00401         }
00402         if($gab1->dbID < $gab2->dbID) {
00403           $self->param('delete_hash')->{$gab2->dbID} = $gab2;
00404         } else {
00405           $self->param('delete_hash')->{$gab1->dbID} = $gab1;
00406         }
00407         $self->param('identical_count', $self->param('identical_count')+1);        
00408         
00409         if($self->debug > 1) {
00410           if($gab1->{'analysis_job_id'} == $gab2->{'analysis_job_id'}) {
00411             printf("  EQUAL - SAME JOB (gab_ids %d %d) (jobs %d,%d)\n", 
00412                  $gab1->dbID, $gab2->dbID, 
00413                  $gab1->{'analysis_job_id'},$gab2->{'analysis_job_id'},);         
00414           }
00415           else {
00416             printf("  EQUAL - DIFFERENT JOB (gab_ids %d %d) (jobs %d,%d)\n",
00417                  $gab1->dbID, $gab2->dbID, 
00418                  $gab1->{'analysis_job_id'},$gab2->{'analysis_job_id'},);         
00419           }
00420           print("      "); print_gab($gab1);
00421           print("  del "); print_gab($gab2);
00422         }
00423       }
00424     }
00425   }
00426 
00427   $self->remove_deletes_from_list($genomic_align_block_list);
00428 }
00429 
00430 
00431 sub remove_edge_artifacts_from_genomic_align_block_list {
00432   my $self = shift;
00433   my $genomic_align_block_list = shift;
00434   my $region_start = shift;
00435   my $region_end = shift;
00436   
00437   return unless(scalar(@$genomic_align_block_list));
00438   
00439   for(my $index=0; $index<(scalar(@$genomic_align_block_list)); $index++) {
00440     my $gab1 = $genomic_align_block_list->[$index];
00441     next if($self->param('delete_hash')->{$gab1->dbID}); #already deleted so skip it
00442 
00443     for(my $index2=$index+1; $index2<(scalar(@$genomic_align_block_list)); $index2++) {
00444       my $gab2 = $genomic_align_block_list->[$index2];
00445       last if($gab2->reference_genomic_align->dnafrag_start > 
00446               $gab1->reference_genomic_align->dnafrag_end);
00447       next if($self->param('delete_hash')->{$gab2->dbID}); #already deleted so skip it
00448 
00449       if(genomic_align_blocks_overlap($gab1, $gab2)) {
00450         $self->param('overlap_count', $self->param('overlap_count')+1);
00451 
00452         unless($self->process_overlap_for_chunk_edge_truncation($gab1, $gab2, $region_start, $region_end)) {
00453           if($self->debug) {
00454             if($gab1->{'analysis_job_id'} == $gab2->{'analysis_job_id'}) {
00455               printf("SAME JOB overlaping GABs %d %d\n", $gab1->dbID, $gab2->dbID);
00456             }
00457             else {
00458               printf("DIFFERENT JOB overlaping GABs %d %d\n", $gab1->dbID, $gab2->dbID);
00459             }
00460             print("  "); print_gab($gab1);
00461             print("  "); print_gab($gab2);
00462           }
00463         }
00464       } 
00465     }
00466   }
00467   #printf("found %d identical, %d overlapping GABs\n", $self->param('identical_count'), $self->param('overlap_count'));  
00468 
00469   $self->remove_deletes_from_list($genomic_align_block_list);
00470 }
00471 
00472 
00473 sub print_gab {
00474   my $gab = shift;
00475   
00476   my ($gab_1, $gab_2) = @{$gab->genomic_align_array};
00477   printf(" id(%d)  %s:(%d)%d-%d    %s:(%d)%d-%d  score=%d\n", 
00478          $gab->dbID,
00479          $gab_1->dnafrag->name, $gab_1->dnafrag_strand, $gab_1->dnafrag_start, $gab_1->dnafrag_end,
00480          $gab_2->dnafrag->name, $gab_2->dnafrag_strand, $gab_2->dnafrag_start, $gab_2->dnafrag_end,
00481          $gab->score);
00482 }
00483 
00484 
00485 sub genomic_align_blocks_identical {
00486   my ($gab1, $gab2) = @_;
00487   
00488   my ($gab1_1, $gab1_2) = sort {$a->dnafrag_id <=> $b->dnafrag_id} @{$gab1->genomic_align_array};
00489   my ($gab2_1, $gab2_2) = sort {$a->dnafrag_id <=> $b->dnafrag_id} @{$gab2->genomic_align_array};
00490 
00491   return 0 if(($gab1_1->dnafrag_id != $gab2_1->dnafrag_id) or ($gab1_2->dnafrag_id != $gab2_2->dnafrag_id));
00492   return 0 if(($gab1_1->dnafrag_strand != $gab2_1->dnafrag_strand) or ($gab1_2->dnafrag_strand != $gab2_2->dnafrag_strand));
00493 
00494   return 0 if(($gab1_1->dnafrag_start != $gab2_1->dnafrag_start) or ($gab1_1->dnafrag_end != $gab2_1->dnafrag_end));  
00495   return 0 if(($gab1_2->dnafrag_start != $gab2_2->dnafrag_start) or ($gab1_2->dnafrag_end != $gab2_2->dnafrag_end));  
00496 
00497   return 0 if($gab1->score != $gab2->score);
00498 
00499   return 0 if($gab1_1->cigar_line ne $gab2_1->cigar_line);  
00500   return 0 if($gab1_2->cigar_line ne $gab2_2->cigar_line);
00501   
00502   return 1;
00503 }
00504 
00505 
00506 sub genomic_align_blocks_overlap {
00507   my ($gab1, $gab2) = @_;
00508   
00509   my ($gab1_1, $gab1_2) = sort {$a->dnafrag_id <=> $b->dnafrag_id} @{$gab1->genomic_align_array};
00510   my ($gab2_1, $gab2_2) = sort {$a->dnafrag_id <=> $b->dnafrag_id} @{$gab2->genomic_align_array};
00511   
00512   return 0 if(($gab1_1->dnafrag_id != $gab2_1->dnafrag_id) or ($gab1_2->dnafrag_id != $gab2_2->dnafrag_id));
00513   return 0 if(($gab1_1->dnafrag_strand != $gab2_1->dnafrag_strand) or ($gab1_2->dnafrag_strand != $gab2_2->dnafrag_strand));
00514 
00515   return 0 if(($gab1_1->dnafrag_end < $gab2_1->dnafrag_start) or ($gab1_1->dnafrag_start > $gab2_1->dnafrag_end));  
00516   return 0 if(($gab1_2->dnafrag_end < $gab2_2->dnafrag_start) or ($gab1_2->dnafrag_start > $gab2_2->dnafrag_end));  
00517   
00518   return 1;
00519 }
00520 
00521 
00522 sub process_overlap_for_chunk_edge_truncation {
00523   my ($self, $gab1, $gab2, $region_start, $region_end) = @_;
00524    
00525   my $aligns_1_1 = $gab1->reference_genomic_align;
00526   my $aligns_1_2 = $gab1->get_all_non_reference_genomic_aligns->[0];
00527   my $aligns_2_1 = $gab2->reference_genomic_align;
00528   my $aligns_2_2 = $gab2->get_all_non_reference_genomic_aligns->[0];
00529     
00530   #first test if this overlapping pair is such that one of them crosses
00531   #one of the region boundaries and the other one does not
00532   #Processings is done such that we walk through the overlap regions of 
00533   #one genome, which in the GAB is the 'reference' so only need to test the
00534   #reference for region_edge artifacts.
00535   #IF both genomes are chunked and overlapped, then this needs to be run twice
00536   #1) with genome1 on the genome1 chunking regions
00537   #2) with genome2 on the genome2 chunking regions
00538   return undef 
00539     unless((($aligns_1_1->dnafrag_start < $region_start) and
00540             ($aligns_2_1->dnafrag_start >= $region_start))
00541            or
00542            (($aligns_1_1->dnafrag_start >= $region_start) and
00543             ($aligns_2_1->dnafrag_start < $region_start))
00544            or
00545            (($aligns_1_1->dnafrag_end > $region_end) and
00546             ($aligns_2_1->dnafrag_end <= $region_end))
00547            or
00548            (($aligns_1_1->dnafrag_end <= $region_end) and
00549             ($aligns_2_1->dnafrag_end > $region_end)));
00550             
00551 
00552   if(
00553       (($aligns_1_1->dnafrag_strand == $aligns_1_2->dnafrag_strand)
00554         and(
00555           (($aligns_1_1->dnafrag_start == $aligns_2_1->dnafrag_start) and 
00556            ($aligns_1_2->dnafrag_start == $aligns_2_2->dnafrag_start)) 
00557           or
00558           (($aligns_1_1->dnafrag_end   == $aligns_2_1->dnafrag_end) and 
00559            ($aligns_1_2->dnafrag_end   == $aligns_2_2->dnafrag_end))
00560         )
00561       ) 
00562       or
00563       (($aligns_1_1->dnafrag_strand != $aligns_1_2->dnafrag_strand)
00564         and(
00565           (($aligns_1_1->dnafrag_start == $aligns_2_1->dnafrag_start) and 
00566            ($aligns_1_2->dnafrag_end   == $aligns_2_2->dnafrag_end)) 
00567           or
00568           (($aligns_1_1->dnafrag_end   == $aligns_2_1->dnafrag_end) and 
00569            ($aligns_1_2->dnafrag_start == $aligns_2_2->dnafrag_start))
00570         )
00571       ) 
00572     )   
00573   {
00574     if($gab1->score > $gab2->score) {
00575       $self->param('delete_hash')->{$gab2->dbID} = $gab2;
00576     } else {
00577       $self->param('delete_hash')->{$gab1->dbID} = $gab1;
00578     }
00579     $self->param('truncate_count', $self->param('truncate_count')+1);  
00580     if ($self->debug) {
00581       if($gab1->{'analysis_job_id'} == $gab2->{'analysis_job_id'}) {
00582         printf("TRUNCATE GABs %d %d\n", $gab2->dbID, $gab1->dbID);
00583         if($self->param('delete_hash')->{$gab1->dbID}) { print("  DEL ");} else{ print("      ");}
00584         print_gab($gab1);
00585         if($self->param('delete_hash')->{$gab2->dbID}) { print("  DEL ");} else{ print("      ");}
00586         print_gab($gab2);
00587       } 
00588     }
00589   }
00590   else {
00591     $self->param('not_truncate_count', $self->param('not_truncate_count')+1);
00592     if ($self->debug) {
00593       if($gab1->{'analysis_job_id'} == $gab2->{'analysis_job_id'}) {
00594         printf("overlaping GABs %d %d - not truncate\n", $gab2->dbID, $gab1->dbID);
00595         print("      "); print_gab($gab1);
00596         print("      "); print_gab($gab2);
00597       }
00598     }
00599   }
00600 }
00601 
00602 1;