Archive Ensembl HomeArchive Ensembl Home
FilterStack.pm
Go to the documentation of this file.
00001 #
00002 # You may distribute this module under the same terms as perl itself
00003 #
00004 # POD documentation - main docs before the code
00005 
00006 =pod 
00007 
00008 =head1 NAME
00009 
00010 Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::FilterStack
00011 
00012 =cut
00013 
00014 =head1 SYNOPSIS
00015 
00016 my $db       = Bio::EnsEMBL::Compara::DBAdaptor->new($locator);
00017 my $runnable = Bio::EnsEMBL::Pipeline::RunnableDB::FilterStack->new (
00018                                                     -db      => $db,
00019                                                     -input_id   => $input_id
00020                                                     -analysis   => $analysis );
00021 $runnable->fetch_input(); #reads from DB
00022 $runnable->run();
00023 $runnable->write_output(); #writes to DB
00024 
00025 =cut
00026 
00027 =head1 DESCRIPTION
00028 
00029 This is designed to run after Chaining and Netting has been done on a PairWise analysis. It will perform a sort of pseudo softmasking on regions of large numbers of overlapping genomic_aligns on the non-reference species. It marks regions where there are more than "threshold" genomic_aligns and will remove all genomic_aligns and genomic_align_blocks that are entirely within this region. It takes as input (on the input_id string).
00030 
00031 =cut
00032 
00033 =head1 CONTACT
00034 
00035 Describe contact details here
00036 
00037 =cut
00038 
00039 =head1 APPENDIX
00040 
00041 The rest of the documentation details each of the object methods.
00042 Internal methods are usually preceded with a _
00043 
00044 =cut
00045 
00046 package Bio::EnsEMBL::Compara::Production::GenomicAlignBlock::FilterStack;
00047 
00048 use strict;
00049 use Time::HiRes qw(time gettimeofday tv_interval);
00050 use Bio::EnsEMBL::Utils::Exception qw( throw warning verbose );
00051 
00052 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00053 use Bio::EnsEMBL::DBLoader;
00054 
00055 use Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor;
00056 use Bio::EnsEMBL::Compara::Production::DnaFragChunk;
00057 use Bio::EnsEMBL::Compara::Production::DnaFragChunkSet;
00058 use Bio::EnsEMBL::Compara::GenomicAlignBlock;
00059 
00060 use Bio::EnsEMBL::Hive::DBSQL::AnalysisJobAdaptor;
00061 
00062 use Bio::EnsEMBL::Pipeline::RunnableDB;
00063 our @ISA = qw(Bio::EnsEMBL::Pipeline::RunnableDB);
00064 
00065 
00066 =head2 fetch_input
00067 
00068     Title   :   fetch_input
00069     Usage   :   $self->fetch_input
00070     Function:   prepares global variables and DB connections
00071     Returns :   none
00072     Args    :   none
00073 
00074 =cut
00075 
00076 sub fetch_input {
00077   my( $self) = @_;
00078 
00079   #
00080   # parameters which can be set either via
00081   # $self->parameters OR
00082   # $self->input_id
00083   #
00084   $self->{'threshold'}                  = undef;
00085   $self->{'method_link_species_set_id'} = undef;
00086   $self->{'options'} = undef;
00087 
00088   #default height. Only prune stacks that are above this height initially
00089   $self->{'height'} = 40; 
00090 
00091   $self->debug(0);
00092 
00093   $self->get_params($self->parameters);
00094   $self->get_params($self->input_id);
00095 
00096   $self->print_params;
00097 
00098   #create a Compara::DBAdaptor which shares the same DBI handle
00099   #with the Pipeline::DBAdaptor that is based into this runnable
00100   $self->{'comparaDBA'} = Bio::EnsEMBL::Compara::Production::DBSQL::DBAdaptor->new(-DBCONN => $self->db->dbc);
00101   $self->{'comparaDBA'}->dbc->disconnect_when_inactive(0);
00102 
00103   # get DnaCollection
00104   throw("must specify 'collection_name' to identify DnaCollection") 
00105     unless(defined($self->{'collection_name'}));
00106   $self->{'collection'} = $self->{'comparaDBA'}->get_DnaCollectionAdaptor->fetch_by_set_description($self->{'collection_name'});
00107   throw("unable to find DnaCollection with name : ". $self->{'collection_name'})
00108     unless(defined($self->{'collection'}));
00109 
00110   return 1;
00111 }
00112 
00113 
00114 sub run
00115 {
00116   my $self = shift;
00117   $self->filter_stack;
00118   return 1;
00119 }
00120 
00121 
00122 sub write_output 
00123 {
00124   my $self = shift;
00125 
00126   my $output_id = $self->input_id;
00127 
00128   print("output_id = $output_id\n");
00129   $self->input_id($output_id);
00130   return 1;
00131 }
00132 
00133 
00134 ######################################
00135 #
00136 # subroutines
00137 #
00138 #####################################
00139 
00140 sub get_params {
00141   my $self         = shift;
00142   my $param_string = shift;
00143 
00144   return unless($param_string);
00145   print("parsing parameter string : ",$param_string,"\n");
00146   
00147   my $params = eval($param_string);
00148   return unless($params);
00149 
00150   foreach my $key (keys %$params) {
00151     print("  $key : ", $params->{$key}, "\n");
00152   }
00153   # from analysis parameters
00154   $self->{'method_link'} = $params->{'method_link'} 
00155     if(defined($params->{'method_link'}));
00156 
00157   $self->{'query_genome_db_id'} = $params->{'query_genome_db_id'} 
00158     if(defined($params->{'query_genome_db_id'}));
00159 
00160   $self->{'target_genome_db_id'} = $params->{'target_genome_db_id'} 
00161     if(defined($params->{'target_genome_db_id'}));
00162   
00163   $self->{'height'} = $params->{'height'}
00164     if(defined($params->{'height'}));
00165   
00166   # from job input_id
00167   $self->{'collection_name'} = $params->{'collection_name'} 
00168     if(defined($params->{'collection_name'}));
00169 
00170   return;
00171 
00172 }
00173 
00174 
00175 sub print_params {
00176   my $self = shift;
00177 
00178   print(" params:\n");
00179   print("   method_link : ", $self->{'method_link'},"\n");
00180   print("   query_genome_db_id : ", $self->{'query_genome_db_id'},"\n");
00181   print("   target_genome_db_id : ", $self->{'target_genome_db_id'},"\n");
00182   print("   height : ", $self->{'height'},"\n");
00183   print("   collection_name : ", $self->{'collection_name'},"\n");
00184 
00185 }
00186 
00187 
00188 sub filter_stack {
00189     my $self = shift;
00190     
00191     my $height = $self->{'height'};
00192     my $dna_collection  = $self->{'collection'};
00193     
00194     $self->{'delete_group'} = {}; #all the genomic_align_blocks that need to be deleted
00195     my $mlss = $self->{'comparaDBA'}->get_MethodLinkSpeciesSetAdaptor->fetch_by_method_link_type_genome_db_ids($self->{'method_link'}, [$self->{'query_genome_db_id'}, $self->{'target_genome_db_id'}]);
00196     
00197     
00198     my $GAB_DBA = $self->{'comparaDBA'}->get_GenomicAlignBlockAdaptor;
00199     # create a list of dnafrag here in case we want to make this runnable working with an input_id 
00200     # containing a list of dnafrag_ids
00201 
00202     #my $dnafrag_list = [$self->{'comparaDBA'}->get_DnaFragAdaptor->fetch_by_dbID($self->{'dnafrag_id'})];
00203     
00204     #my $dnafrag_id_list = $dna_collection->get_all_dnafrag_ids;
00205     my $dna_list = $dna_collection->get_all_dna_objects;
00206 
00207     foreach my $dna_object (@{$dna_list}) {
00208     my $dnafrag_chunk_array;
00209     
00210     if ($dna_object->isa('Bio::EnsEMBL::Compara::Production::DnaFragChunkSet')) {
00211         $dnafrag_chunk_array = $dna_object->get_all_DnaFragChunks;
00212     } else {
00213         $dnafrag_chunk_array = [$dna_object];
00214     }
00215     foreach my $new_dna_object (@$dnafrag_chunk_array) {
00216         my $delete_group = {};
00217         my $dnafrag = $new_dna_object->dnafrag;
00218         
00219         my $genomic_align_block_list = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_DnaFrag($mlss, $dnafrag);
00220         
00221         #printf("dnafrag %s %s has %d GABs\n", $dnafrag->coord_system_name, $dnafrag->name, scalar(@$genomic_align_block_list));
00222         
00223         #print $dnafrag->name, ": ", scalar(@$genomic_align_block_list), "\n";
00224         
00225         if ($self->debug) {
00226         foreach my $gab (@{$genomic_align_block_list}) {
00227             $self->assign_jobID_to_genomic_align_block($gab);
00228         }
00229         }
00230         
00231         @$genomic_align_block_list = sort {
00232         $a->reference_genomic_align->dnafrag_start <=>
00233           $b->reference_genomic_align->dnafrag_start} @$genomic_align_block_list;
00234         my $max_end = 0;
00235         my $blocks = [];
00236         foreach my $this_genomic_align_block (@$genomic_align_block_list) {
00237           #found all overlapping blocks
00238         if ($this_genomic_align_block->reference_genomic_align->dnafrag_start > $max_end) {
00239             if (@$blocks > 0) {
00240             $delete_group = $self->find_stack_coverage($blocks, $height, $delete_group);
00241             }
00242             $blocks = [];
00243         }
00244         push(@$blocks, $this_genomic_align_block);
00245         if ($this_genomic_align_block->reference_genomic_align->dnafrag_end > $max_end) {
00246             $max_end = $this_genomic_align_block->reference_genomic_align->dnafrag_end;
00247         }
00248         }
00249         if (@$blocks > 0) {
00250         $delete_group = $self->find_stack_coverage($blocks, $height, $delete_group);
00251         }
00252         $self->delete_alignments($GAB_DBA, $mlss, $delete_group);
00253     }
00254     }
00255 }
00256 
00257 sub assign_jobID_to_genomic_align_block {
00258   my $self = shift;
00259   my $gab = shift;
00260 
00261   my $sql = "SELECT analysis_job_id FROM genomic_align_block_job_track ".
00262                   "WHERE genomic_align_block_id=?";
00263   my $sth = $self->{'comparaDBA'}->prepare($sql);
00264   $sth->execute($gab->dbID);
00265   my ($job_id) = $sth->fetchrow_array();
00266   $sth->finish;
00267   $gab->{'analysis_job_id'} = $job_id;
00268 }
00269 
00270 sub find_stack_coverage {
00271     my ($self, $blocks, $height, $delete_group) = @_;
00272 
00273     #find coverage ie min start and max end of overlapping blocks
00274     my ($min_start, $max_end);
00275     foreach my $this_block (@$blocks) {
00276     if (!$min_start or $min_start > $this_block->reference_genomic_align->dnafrag_start) {
00277         $min_start = $this_block->reference_genomic_align->dnafrag_start;
00278     }
00279     if (!$max_end or $max_end < $this_block->reference_genomic_align->dnafrag_end) {
00280         $max_end = $this_block->reference_genomic_align->dnafrag_end;
00281     }
00282     }
00283     
00284     #print "update_coverage min_start $min_start max_end $max_end\n";
00285     
00286     #find number of overlapping blocks for each position in coverage
00287     my @coverage_by_pos;
00288     foreach my $this_block (@$blocks) {
00289     my $start = $this_block->reference_genomic_align->dnafrag_start;
00290     my $end = $this_block->reference_genomic_align->dnafrag_end;
00291     for (my $a = $start; $a <= $end; $a++) {
00292         $coverage_by_pos[$a-$min_start]++;
00293     }
00294     }
00295     
00296     my $threshold_limits;
00297     my $threshold_limit;
00298 
00299     #loop through each position in coverage
00300     for (my $a = 0; $a < @coverage_by_pos; $a++) {
00301     $coverage_by_pos[$a] ||= 0;
00302     
00303     #if number of gabs at this position is above threshold
00304     if ($coverage_by_pos[$a] > $height) {
00305         #find start of threshold coverage
00306         if (!defined $threshold_limit->{min}) {
00307         $threshold_limit->{min} = $a+$min_start;
00308         }
00309     } else {
00310         #coverage has fallen below threshold so store max as previous value if
00311         #min has already been stored.
00312         if (defined($threshold_limit->{min})) {
00313         $threshold_limit->{max} = $a+$min_start-1;
00314         push @$threshold_limits, $threshold_limit;
00315         #unset threshold_limit 
00316         undef($threshold_limit);
00317         }
00318     }
00319     }
00320     
00321     #if the last item of coverage_by_pos is above the threshold, I won't have found it before in previous loop
00322     if (defined($threshold_limit->{min}) && !defined($threshold_limit->{max} && $coverage_by_pos[-1] > $height)) {
00323     $threshold_limit->{max} = @coverage_by_pos+$min_start-1;
00324     push @$threshold_limits, $threshold_limit;
00325     undef($threshold_limit);
00326     }
00327 
00328     #remove all blocks that are entirely covered by the threshold limits.
00329     foreach my $limit (@$threshold_limits) {
00330     foreach my $this_block (@$blocks) {
00331         my $start = $this_block->reference_genomic_align->dnafrag_start;
00332         my $end = $this_block->reference_genomic_align->dnafrag_end;
00333         if ($start >= $limit->{min} && $end <= $limit->{max}) {
00334         #print "delete gab $start $end " . $this_block->dbID . "\n";
00335         push @{$delete_group->{$this_block->group_id}}, $this_block;
00336         }
00337     }
00338     }
00339     return $delete_group;
00340 }
00341 
00342 
00343 sub delete_alignments {
00344     my ($self, $GAB_DBA, $mlss, $delete_group) = @_;
00345     my @del_list;
00346     
00347     foreach my $group_id (keys %{$delete_group}) {
00348     my $group_gabs = $GAB_DBA->fetch_all_by_MethodLinkSpeciesSet_GroupID($mlss, $group_id);
00349         
00350     #Only delete a gab if all the gabs of a group are in $delete_group
00351     if (@$group_gabs == @{$delete_group->{$group_id}}) {
00352         #check the gab->dbIDs match
00353         my $error = 0;
00354         @$group_gabs = sort {$a->dbID<=>$b->dbID} @$group_gabs;
00355         @{$delete_group->{$group_id}} = sort {$a->dbID<=>$b->dbID} @{$delete_group->{$group_id}};
00356         for (my $i = 0; $i < @$group_gabs; $i++) {
00357         #Hopefully this should never happen
00358         if ($group_gabs->[$i]->dbID != $delete_group->{$group_id}->[$i]->dbID) {
00359             warn "Inconsistent genomic_align_blocks in group $group_id \n";
00360             $error = 1;
00361         }
00362         }
00363         if (!$error) {
00364         push @del_list, @$group_gabs;
00365         }
00366     }
00367     }
00368 
00369     #assume not many of these
00370     if (@del_list > 0) {
00371     my @gab_ids = map {$_->dbID} @del_list;    
00372     my $sql_genomic_align = "DELETE FROM genomic_align WHERE genomic_align_block_id IN (" . join(",", @gab_ids) . ");";
00373     my $sql_genomic_align_block = "DELETE FROM genomic_align_block WHERE genomic_align_block_id in (" . join(",", @gab_ids) . ");";
00374     
00375     #foreach my $gab (@del_list) {
00376     #   print("DELETE "); 
00377     #   print_gab($gab);
00378     #}
00379     my $sth_genomic_align = $self->{'comparaDBA'}->prepare($sql_genomic_align);
00380     $sth_genomic_align->execute;
00381     $sth_genomic_align->finish;
00382     my $sth_genomic_align_block = $self->{'comparaDBA'}->prepare($sql_genomic_align_block);
00383     $sth_genomic_align_block->execute;
00384     $sth_genomic_align->execute;
00385     }
00386     printf("%d gabs to delete\n", scalar(@del_list));
00387 }
00388 
00389 sub print_gab {
00390   my $gab = shift;
00391   
00392   my ($gab_1, $gab_2) = @{$gab->genomic_align_array};
00393   printf(" id(%d)  %s:(%d)%d-%d    %s:(%d)%d-%d  score=%d\n", 
00394          $gab->dbID,
00395          $gab_1->dnafrag->name, $gab_1->dnafrag_strand, $gab_1->dnafrag_start, $gab_1->dnafrag_end,
00396          $gab_2->dnafrag->name, $gab_2->dnafrag_strand, $gab_2->dnafrag_start, $gab_2->dnafrag_end,
00397          $gab->score);
00398 }
00399 
00400 
00401 1;