Archive Ensembl HomeArchive Ensembl Home
RangeRegistry.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Mapper::RangeRegistry
00024 
00025 =head1 SYNOPSIS
00026 
00027   use Bio::EnsEMBL::Mapper::RangeRegistry;
00028 
00029   $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
00030 
00031   # Get a fixed width chunk around the range of intereset.  This
00032   # will be used if any registration is actually necessary.
00033   $chunk_start = ( $start >> 20 ) << 20 + 1;
00034   $chunk_end = ( ( $end >> 20 ) + 1 ) << 20;
00035 
00036   # Check if any registration is necessary for the range.  If it is
00037   # register a large chunked area instead and return a listref of
00038   # unregistered areas that need to be loaded.
00039   if (
00040     $pairs = $rr->check_and_register(
00041       $id, $start, $end, $chunk_start, $chunk_end
00042     ) )
00043   {
00044     foreach my $pair (@$pairs) {
00045       my ( $pair_start, $pair_end ) = @$pair;
00046       # Fetch mappings for these regions from the assembly table and
00047       # load them into the mapper.
00048       ...;
00049     }
00050   } else {
00051     # The range ($start - $end) is already registered
00052     ...;
00053   }
00054 
00055   # Check if any registration is necessary.  If it is register the
00056   # region and return a listref of pairs that need to be loaded.
00057   if ( $pairs = $rr->check_and_register( $id, $start, $end ) ) {
00058     ...;
00059   }
00060 
00061 =head1 DESCRIPTION
00062 
00063 This module maintains an internal list of registered regions and is
00064 used to quickly ascertain if and what regions of a provided range need
00065 registration.
00066 
00067 =head1 METHODS
00068 
00069 =cut
00070 
00071 package Bio::EnsEMBL::Mapper::RangeRegistry;
00072 
00073 use strict;
00074 
00075 use Bio::EnsEMBL::Utils::Exception qw(throw);
00076 use integer;
00077 
00078 =head2 new
00079 
00080   Arg [1]    : none
00081   Example    : my $rr = Bio::EnsEMBL::Mapper::RangeRegistry->new();
00082   Description: Creates a new RangeRegistry object
00083   Returntype : Bio::EnsEMBL::Mapper::RangeRegistry
00084   Exceptions : none
00085   Caller     : AssemblyMapperAdaptor
00086   Status     : Stable
00087 
00088 =cut
00089 
00090 sub new {
00091   my ($proto) = @_;
00092 
00093   my $class = ref($proto) || $proto;
00094 
00095   return bless( { 'registry' => {} }, $class );
00096 }
00097 
00098 sub flush {
00099   my ($self) = @_;
00100   $self->{'registry'} = {};
00101 }
00102 
00103 =head2 check_and_register
00104 
00105   Arg [1]    : string $id
00106                The id of the range to be checked/registered (e.g. a sequenceid)
00107   Arg [2]    : int $start
00108                The start of the range to be checked
00109   Arg [3]    : int $end
00110                The end of the range to be checked
00111   Arg [4]    : (optional) int $rstart
00112                The start of the range to be registered
00113                if the checked range was not fully registered
00114   Arg [5]    : (optional) int $rend
00115                The end of the range to be registerd
00116                if the checked range was not fully registered
00117   Example    : $ranges=$rr->check_and_register('X',500,600, 1,1_000_000));
00118   Description: Checks the range registry to see if the entire range
00119                denoted by ($id : $start-$end) is already registered.  If
00120                it already is, then undef is returned.  If it is not then
00121                the range specified by $rstart and $rend is registered, and
00122                a list of regions that were required to completely register
00123                $rstart-$rend is returned.  If $rstart and $rend are not
00124                defined they default to $start and $end respectively.
00125 
00126                The reason there is a single call to do both the checking and
00127                registering is to reduce the overhead. Much of the work to
00128                check if a range is registered is the same as registering a
00129                region around that range.
00130   Returntype : undef or listref of [start,end] range pairs
00131   Exceptions : throw if rstart is greater than start
00132                throw if rend is less than end
00133                throw if end is less than start
00134                throw if id, start, or end are not defined
00135   Caller     : AssemblyMapperAdaptor
00136   Status     : Stable
00137 
00138 =cut
00139 
00140 #"constants"
00141 my $START = 0;
00142 my $END   = 1;
00143 
00144 sub check_and_register {
00145   my ( $self, $id, $start, $end, $rstart, $rend ) = @_;
00146 
00147   $rstart = $start if ( !defined($rstart) );
00148   $rend   = $end   if ( !defined($rend) );
00149 
00150   #
00151   # Sanity checks
00152   #
00153   if ( !defined($id) || !defined($start) || !defined($end) ) {
00154     throw("ID, start, end arguments are required");
00155   }
00156 
00157   # The following was commented out due to Ensembl Genomes requirements
00158   # for bacterial genomes.
00159   #
00160   ## if ( $start > $end ) {
00161   ##   throw(   "start argument [$start] must be less than "
00162   ##          . "(or equal to) end argument [$end]" );
00163   ## }
00164   ##
00165   ## if ( $rstart > $rend ) {
00166   ##   throw(   "rstart argument [$rstart] must be less than "
00167   ##          . "(or equal to) rend argument [$rend]  argument" );
00168   ## }
00169 
00170   if ( $rstart > $start ) {
00171     throw("rstart [$rstart] must be less than or equal to start [$start]");
00172   }
00173 
00174   if ( $rend < $end ) {
00175     throw("rend [$rend] must be greater than or equal to end [$end]");
00176   }
00177 
00178   my $reg = $self->{'registry'};
00179   my $list = $reg->{$id} ||= [];
00180 
00181   my @gap_pairs;
00182 
00183   my $len = scalar(@$list);
00184 
00185   if ( $len == 0 ) {
00186     # this is the first request for this id, return a gap pair for the
00187     # entire range and register it as seen
00188     $list->[0] = [ $rstart, $rend ];
00189     return [ [ $rstart, $rend ] ];
00190   }
00191 
00192   #####
00193   # loop through the list of existing ranges recording any "gaps" where
00194   # the existing range does not cover part of the requested range
00195   #
00196 
00197   my $start_idx = 0;
00198   my $end_idx   = $#$list;
00199   my ( $mid_idx, $range );
00200 
00201   # binary search the relevant pairs
00202   # helps if the list is big
00203   while ( ( $end_idx - $start_idx ) > 1 ) {
00204     $mid_idx = ( $start_idx + $end_idx ) >> 1;
00205     $range   = $list->[$mid_idx];
00206 
00207     if ( $range->[1] < $rstart ) {
00208       $start_idx = $mid_idx;
00209     } else {
00210       $end_idx = $mid_idx;
00211     }
00212   }
00213 
00214   my ( $gap_start, $gap_end, $r_idx, $rstart_idx, $rend_idx );
00215   $gap_start = $rstart;
00216 
00217   for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
00218     my ( $pstart, $pend ) = @{ $list->[$CUR] };
00219 
00220     # no work needs to be done at all if we find a range pair that
00221     # entirely overlaps the requested region
00222     if ( $pstart <= $start && $pend >= $end ) {
00223       return undef;
00224     }
00225 
00226     # find adjacent or overlapping regions already registered
00227     if ( $pend >= ( $rstart - 1 ) && $pstart <= ( $rend + 1 ) ) {
00228       if ( !defined($rstart_idx) ) {
00229         $rstart_idx = $CUR;
00230       }
00231       $rend_idx = $CUR;
00232     }
00233 
00234     if ( $pstart > $rstart ) {
00235       $gap_end = ( $rend < $pstart ) ? $rend : $pstart - 1;
00236       push @gap_pairs, [ $gap_start, $gap_end ];
00237     }
00238 
00239     $gap_start = ( $rstart > $pend ) ? $rstart : $pend + 1;
00240 
00241     #    if($pstart > $rend && !defined($r_idx)) {
00242     if ( $pend >= $rend && !defined($r_idx) ) {
00243       $r_idx = $CUR;
00244       last;
00245     }
00246   } ## end for ( my $CUR = $start_idx...
00247 
00248   # do we have to make another gap?
00249   if ( $gap_start <= $rend ) {
00250     push @gap_pairs, [ $gap_start, $rend ];
00251   }
00252 
00253   #
00254   # Merge the new range into the registered list
00255   #
00256   if ( defined($rstart_idx) ) {
00257     my ( $new_start, $new_end );
00258 
00259     if ( $rstart < $list->[$rstart_idx]->[0] ) {
00260       $new_start = $rstart;
00261     } else {
00262       $new_start = $list->[$rstart_idx]->[0];
00263     }
00264 
00265     if ( $rend > $list->[$rend_idx]->[1] ) {
00266       $new_end = $rend;
00267     } else {
00268       $new_end = $list->[$rend_idx]->[1];
00269     }
00270 
00271     splice( @$list, $rstart_idx,
00272             $rend_idx - $rstart_idx + 1,
00273             [ $new_start, $new_end ] );
00274 
00275   } elsif ( defined($r_idx) ) {
00276     splice( @$list, $r_idx, 0, [ $rstart, $rend ] );
00277   } else {
00278     push( @$list, [ $rstart, $rend ] );
00279   }
00280 
00281   return \@gap_pairs;
00282 } ## end sub check_and_register
00283 
00284 # overlap size is just added to make RangeRegistry generally more useful
00285 
00286 =head2 overlap_size
00287 
00288   Arg [1]    : string $id
00289   Arg [2]    : int $start
00290   Arg [3]    : int $end
00291   Example    : my $overlap_size = $rr->( "chr1", 1, 100_000_000 )
00292   Description: finds out how many bases of the given range are registered
00293   Returntype : int
00294   Exceptions : none
00295   Caller     : general
00296   Status     : Stable
00297 
00298 =cut
00299 
00300 sub overlap_size {
00301   my ( $self, $id, $start, $end ) = @_;
00302 
00303   my $overlap = 0;
00304 
00305   if ( $start > $end ) { return 0 }
00306 
00307   my $reg = $self->{'registry'};
00308   my $list = $reg->{$id} ||= [];
00309 
00310   my $len = scalar(@$list);
00311 
00312   if ( $len == 0 ) {
00313     # this is the first request for this id, return a gap pair for the
00314     # entire range and register it as seen
00315     return 0;
00316   }
00317 
00318   #####
00319   # loop through the list of existing ranges recording any "gaps" where
00320   # the existing range does not cover part of the requested range
00321   #
00322 
00323   my $start_idx = 0;
00324   my $end_idx   = $#$list;
00325   my ( $mid_idx, $range );
00326 
00327   # binary search the relevant pairs
00328   # helps if the list is big
00329   while ( ( $end_idx - $start_idx ) > 1 ) {
00330     $mid_idx = ( $start_idx + $end_idx ) >> 1;
00331     $range   = $list->[$mid_idx];
00332     if ( $range->[1] < $start ) {
00333       $start_idx = $mid_idx;
00334     } else {
00335       $end_idx = $mid_idx;
00336     }
00337   }
00338 
00339   for ( my $CUR = $start_idx ; $CUR < $len ; $CUR++ ) {
00340     my ( $pstart, $pend ) = @{ $list->[$CUR] };
00341 
00342     if ( $pstart > $end ) {
00343       # No more interesting ranges here.
00344       last;
00345     }
00346 
00347     # no work needs to be done at all if we find a range pair that
00348     # entirely overlaps the requested region
00349     if ( $pstart <= $start && $pend >= $end ) {
00350       $overlap = $end - $start + 1;
00351       last;
00352     }
00353 
00354     my $mstart = ( $start < $pstart ? $pstart : $start );
00355     my $mend   = ( $end < $pend     ? $end    : $pend );
00356 
00357     if ( $mend - $mstart >= 0 ) {
00358       $overlap += ( $mend - $mstart + 1 );
00359     }
00360   }
00361 
00362   return $overlap;
00363 } ## end sub overlap_size
00364 
00365 
00366 # low level function to access the ranges
00367 # only use for read access
00368 
00369 sub get_ranges {
00370   my ( $self, $id ) = @_;
00371 
00372   return $self->{'registry'}->{$id};
00373 }
00374 
00375 1;
00376