Archive Ensembl HomeArchive Ensembl Home
FindPairwiseOverlaps.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::Production::EPOanchors::FindPairwiseOverlaps;
00002 
00003 use strict;
00004 use Data::Dumper;
00005 use Bio::EnsEMBL::Hive::Process;
00006 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00007 use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;
00008 use Bio::EnsEMBL::Registry;
00009     
00010 
00011 our @ISA = qw(Bio::EnsEMBL::Hive::Process);
00012 
00013 
00014 sub fetch_input {
00015     my ($self) = @_;
00016     # $reference_species_dba is the reference species for the pairwise alignments
00017     my $reference_species_dba = new Bio::EnsEMBL::DBSQL::DBAdaptor( %{ $self->param('reference_db') } );
00018     # $compara_dba is the pairwise alignments db
00019     my $compara_dba = new Bio::EnsEMBL::Compara::DBSQL::DBAdaptor( %{ $self->param('compara_pairwise_db') } );
00020     $self->param('compara_dba', $compara_dba);
00021     $self->param('reference_dba', $reference_species_dba);
00022     my $reference_genome_db_id = $self->param('reference_genome_db_id');
00023     my $ref_genome_db = $compara_dba->get_GenomedbAdaptor()->fetch_by_dbID($reference_genome_db_id);
00024     $self->param('ref_genome_db', $ref_genome_db);
00025     my $non_reference_genome_dbIDs = $self->param('genome_db_ids');
00026     my $ref_slice_adaptor = Bio::EnsEMBL::Registry->get_adaptor($self->param('ref_genome_db')->name, "core", "Slice");
00027     my $ref_dnafrag = $compara_dba->get_DnaFragAdaptor()->fetch_by_dbID($self->param('ref_dnafrag_id'));
00028     $self->param('ref_dnafrag', $ref_dnafrag);
00029     $self->param('dnafrag_chunks', eval{ $self->param('dnafrag_chunks') });
00030     my $genomic_align_block_adaptor = $compara_dba->get_GenomicAlignBlockAdaptor;
00031     my $method_link_species_set_adaptor = $compara_dba->get_MethodLinkSpeciesSetAdaptor;
00032     my $ref_slice = $ref_slice_adaptor->fetch_by_region($ref_dnafrag->coord_system_name,
00033                         $ref_dnafrag->name, @{ $self->param('dnafrag_chunks') });
00034     $self->param('ref_slice_adaptor', $ref_slice_adaptor);
00035     my (@multi_gab_overlaps, @mlss);
00036     my $method_type_genome_db_ids = $self->param('method_type_genome_db_ids');
00037     foreach my $this_method_link_type( keys %{ $method_type_genome_db_ids } ){
00038         foreach my $non_reference_genome_dbID( @{ $method_type_genome_db_ids->{$this_method_link_type} }){
00039             my $mlss = $method_link_species_set_adaptor->fetch_by_method_link_type_genome_db_ids(
00040                 $this_method_link_type, [ $reference_genome_db_id, $non_reference_genome_dbID ]);
00041             my $gabs = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice(
00042                     $mlss, $ref_slice);
00043         
00044             foreach my $genomic_align_block( @{ $gabs } ){
00045                 my $restricted_gab = $genomic_align_block->restrict_between_reference_positions( @{ $self->param('dnafrag_chunks') } );
00046                 next if $restricted_gab->length < $self->param('min_anchor_size');
00047                 push( @multi_gab_overlaps, [
00048                         $restricted_gab->reference_genomic_align->dnafrag_start,
00049                         $restricted_gab->reference_genomic_align->dnafrag_end,
00050                         $non_reference_genome_dbID ] );
00051             }   
00052             push(@mlss, $mlss);
00053         }
00054     }
00055     $self->param('mlss', \@mlss);
00056     # @multi_genomic_align_blocks is a list of [ref_dnafrag_start,ref_dnafrag_end,nonref-species-id] 
00057     # for each genomic_align_block associated with the reference dnafrag (and the particular method_link_species_set_id(s)) 
00058     $self->param('overlapping_gabs', [ sort {$a->[0] <=> $b->[0]} @multi_gab_overlaps ]);
00059 }
00060 
00061 sub run {
00062     my ($self) = @_;
00063     my($overlap_index_ranges, $reference_positions, $genomic_aligns_on_ref_slice, $synteny_region_jobs) = 
00064     ([],[],[],[]);
00065     my $max_size_diff = $self->param('max_frag_diff');
00066     my $overlapping_gabs = $self->param('overlapping_gabs');
00067     for(my$i=0;$i<@{ $overlapping_gabs }-1;$i++) { # find the overlapping gabs for a ref-dnafrag chunk 
00068         my $temp_end = $overlapping_gabs->[$i]->[1];
00069         for(my$j=$i+1;$j<@{ $overlapping_gabs };$j++) { 
00070             if($temp_end >= $overlapping_gabs->[$j]->[0]) {
00071                 $temp_end = $temp_end > $overlapping_gabs->[$j]->[1] ? $temp_end : $overlapping_gabs->[$j]->[1];
00072             }
00073             else {
00074                 push(@$overlap_index_ranges, [$i, --$j]); 
00075                 # @$overlaps_index_ranges contains the index ranges for overlapping gabs within the chunk
00076                 $i = $j;
00077                 last;
00078             }
00079         }
00080     }
00081     return unless( @$overlap_index_ranges);
00082     for(my$k=0;$k<@$overlap_index_ranges;$k++) {
00083         my(%bases, @bases);
00084         for(my$l=$overlap_index_ranges->[$k]->[0];$l<=$overlap_index_ranges->[$k]->[1];$l++) { 
00085         # loop through the index positions for $overlapping_gabs
00086             for(my$m=$overlapping_gabs->[$l]->[0];$m<=$overlapping_gabs->[$l]->[1];$m++) {
00087                 $bases{$m}{$overlapping_gabs->[$l]->[2]}++; 
00088                 #count the number of non_ref org hits per base
00089             }
00090         }
00091         foreach my $base(sort {$a <=> $b} keys %bases) {
00092             if((keys %{$bases{$base}}) >= $self->param('min_number_of_org_hits_per_base')) {
00093                 push(@bases, $base);
00094             }
00095         }   
00096         if(@bases){
00097             if( $bases[-1] - $bases[0] >= $self->param('min_anchor_size') ){
00098                 push( @$reference_positions, [ $bases[0], $bases[-1] ] );
00099             }
00100         }
00101     }
00102     my $genomic_align_block_adaptor = $self->param('compara_dba')->get_GenomicAlignBlockAdaptor;
00103     my $this_method_link_species_set_id = $self->param('method_link_species_set_id');
00104     foreach my $coord_pair( @$reference_positions ){
00105         my $ref_sub_slice =  $self->param('ref_slice_adaptor')->fetch_by_region(
00106                     $self->param('ref_dnafrag')->coord_system_name,
00107                     $self->param('ref_dnafrag')->name,
00108                     @$coord_pair);
00109 
00110         # get a unique id for the synteny_region
00111         my $sth = $self->dbc->prepare("INSERT INTO synteny_region (method_link_species_set_id) VALUES (?)");
00112         $sth->execute( $this_method_link_species_set_id );
00113         my $synteny_region_id = $sth->{'mysql_insertid'};
00114         push @$synteny_region_jobs, { 'synteny_region_id' => $synteny_region_id };
00115 
00116         foreach my $mlss( @{ $self->param('mlss') } ){
00117             my $gabs = $genomic_align_block_adaptor->fetch_all_by_MethodLinkSpeciesSet_Slice($mlss, $ref_sub_slice);
00118             my %non_ref_dnafrags;
00119             foreach my $gab(@$gabs){
00120                 my $rgab = $gab->restrict_between_reference_positions( @$coord_pair );
00121                 my $restricted_non_reference_genomic_aligns = $rgab->get_all_non_reference_genomic_aligns;
00122                 my $temp_start = 0;
00123                 foreach my $non_ref_genomic_align (@$restricted_non_reference_genomic_aligns) {
00124                     my $non_ref_dnafrag = $non_ref_genomic_align->dnafrag;
00125                     my $uniq_id = join(":", $non_ref_dnafrag->dbID, $non_ref_genomic_align->dnafrag_strand);    
00126                     # get the dnafrag start and end coords for all the non_ref genomic_aligns 
00127                     # which have the same dnafrag_id and strand direction
00128                     push(@{ $non_ref_dnafrags{ $uniq_id } }, [ $non_ref_genomic_align->dnafrag_start,
00129                                     $non_ref_genomic_align->dnafrag_end ]);
00130                 }
00131             }
00132             foreach my $uniq_key(keys %non_ref_dnafrags){
00133                 $non_ref_dnafrags{$uniq_key} = [ sort {$a->[0] <=> $b->[0]} @{ $non_ref_dnafrags{$uniq_key} } ];
00134                 my ($non_ref_dnafrag_id, $non_ref_strand) = split(":", $uniq_key);
00135                 my ($nr_start, $nr_end) = ( @{ $non_ref_dnafrags{$uniq_key} }[0]->[0], @{ $non_ref_dnafrags{$uniq_key} }[-1]->[1] );
00136                 # if the difference of the start and end positions of the first and last non-ref frags is much greater ($max_size_diff)
00137                 # than the length of the ref frag then they should be split into there component frags
00138                 if($nr_end - $nr_start > ($coord_pair->[1] - $coord_pair->[0] ) * $max_size_diff) {
00139                     foreach my $aligned_frag(@{ $non_ref_dnafrags{$uniq_key} }){
00140                         next if ($aligned_frag->[1] - $aligned_frag->[0] > ($coord_pair->[1] - $coord_pair->[0] ) * $max_size_diff); 
00141                         push( @$genomic_aligns_on_ref_slice, {
00142                             synteny_region_id => $synteny_region_id,    
00143                             dnafrag_id        => $non_ref_dnafrag_id,
00144                             dnafrag_start     => $aligned_frag->[0],
00145                             dnafrag_end       => $aligned_frag->[1],
00146                             dnafrag_strand    => $non_ref_strand,
00147                         });
00148                     }
00149                 }
00150                 else {
00151                     push( @$genomic_aligns_on_ref_slice, {
00152                         synteny_region_id => $synteny_region_id,
00153                         dnafrag_id        => $non_ref_dnafrag_id,
00154                         dnafrag_start     => $nr_start,
00155                         dnafrag_end       => $nr_end,
00156                         dnafrag_strand    => $non_ref_strand,
00157                     });
00158                 }
00159             }
00160         }
00161         push( @$genomic_aligns_on_ref_slice, {
00162             synteny_region_id => $synteny_region_id,
00163             dnafrag_id        => $self->param('ref_dnafrag_id'),
00164             dnafrag_start     => $coord_pair->[0],
00165             dnafrag_end       => $coord_pair->[1],
00166             dnafrag_strand    => $ref_sub_slice->strand,
00167             } );    
00168     }
00169     $self->param('synteny_region_jobs', $synteny_region_jobs);
00170     $self->param('genomic_aligns_on_ref_slice', $genomic_aligns_on_ref_slice);
00171 }   
00172 
00173 sub write_output {
00174     my ($self) = @_;
00175     return unless $self->param('synteny_region_jobs');
00176     $self->dataflow_output_id( $self->param('synteny_region_jobs'), 2);
00177     $self->dataflow_output_id( $self->param('genomic_aligns_on_ref_slice'), 3);
00178 }
00179 
00180 1;
00181