Archive Ensembl HomeArchive Ensembl Home
Mapper.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Mapper
00024 
00025 =head1 SYNOPSIS
00026 
00027   $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
00028 
00029   # add a coodinate mapping - supply two pairs or coordinates
00030   $map->add_map_coordinates(
00031     $contig_id, $contig_start, $contig_end, $contig_ori,
00032     $chr_name,  chr_start,     $chr_end
00033   );
00034 
00035   # map from one coordinate system to another
00036   my @coordlist =
00037     $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
00038 
00039 =head1 DESCRIPTION
00040 
00041 Generic mapper to provide coordinate transforms between two disjoint
00042 coordinate systems. This mapper is intended to be 'context neutral' - in
00043 that it does not contain any code relating to any particular coordinate
00044 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
00045 
00046 =head1 METHODS
00047 
00048 =cut
00049 
00050 package Bio::EnsEMBL::Mapper;
00051 use strict;
00052 use integer;
00053 
00054 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
00055 use Bio::EnsEMBL::Mapper::Pair;
00056 use Bio::EnsEMBL::Mapper::IndelPair;
00057 use Bio::EnsEMBL::Mapper::Unit;
00058 use Bio::EnsEMBL::Mapper::Coordinate;
00059 use Bio::EnsEMBL::Mapper::IndelCoordinate;
00060 use Bio::EnsEMBL::Mapper::Gap;
00061 
00062 use Bio::EnsEMBL::Utils::Exception qw(throw);
00063 
00064 # use Data::Dumper;
00065 
00066 =head2 new
00067 
00068   Arg [1]    : string $from
00069                The name of the 'from' coordinate system
00070   Arg [2]    : string $to
00071                The name of the 'to' coordinate system
00072   Arg [3]    : (optional) Bio::EnsEMBL::CoordSystem $from_cs
00073                The 'from' coordinate system
00074   Arg [4]    : (optional) Bio::EnsEMBL::CoordSystem $to_cs
00075   Example    : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
00076   Description: Constructor.  Creates a new Bio::EnsEMBL::Mapper object.
00077   Returntype : Bio::EnsEMBL::Mapper
00078   Exceptions : none
00079   Caller     : general
00080 
00081 =cut
00082 
00083 sub new {
00084   my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
00085 
00086   if ( !defined($to) || !defined($from) ) {
00087     throw("Must supply 'to' and 'from' tags");
00088   }
00089 
00090   my $class = ref($proto) || $proto;
00091 
00092   my $self = bless( { "_pair_$from" => {},
00093                       "_pair_$to"   => {},
00094                       'pair_count'  => 0,
00095                       'to'          => $to,
00096                       'from'        => $from,
00097                       'to_cs'       => $to_cs,
00098                       'from_cs'     => $from_cs
00099                     },
00100                     $class );
00101 
00102   # do sql to get any componente with muliple assemblys.
00103 
00104   return $self;
00105 }
00106 
00107 =head2 flush
00108 
00109   Args       : none
00110   Example    : none
00111   Description: removes all cached information out of this mapper
00112   Returntype : none
00113   Exceptions : none
00114   Caller     : AssemblyMapper, ChainedAssemblyMapper
00115 
00116 =cut
00117 
00118 sub flush {
00119   my $self = shift;
00120   my $from = $self->from();
00121   my $to = $self->to();
00122 
00123   $self->{"_pair_$from"} = {};
00124   $self->{"_pair_$to"} = {};
00125 
00126   $self->{'pair_count'} = 0;
00127 }
00128 
00129 
00130 
00131 =head2 map_coordinates
00132 
00133     Arg  1      string $id
00134                 id of 'source' sequence
00135     Arg  2      int $start
00136                 start coordinate of 'source' sequence
00137     Arg  3      int $end
00138                 end coordinate of 'source' sequence
00139     Arg  4      int $strand
00140                 raw contig orientation (+/- 1)
00141     Arg  5      string $type
00142                 nature of transform - gives the type of
00143                 coordinates to be transformed *from*
00144     Function    generic map method
00145     Returntype  array of Bio::EnsEMBL::Mapper::Coordinate
00146                 and/or   Bio::EnsEMBL::Mapper::Gap
00147     Exceptions  none
00148     Caller      Bio::EnsEMBL::Mapper
00149 
00150 =cut
00151 
00152 sub map_coordinates {
00153   my ( $self, $id, $start, $end, $strand, $type ) = @_;
00154 
00155   unless (    defined($id)
00156            && defined($start)
00157            && defined($end)
00158            && defined($strand)
00159            && defined($type) )
00160   {
00161     throw("Expecting 5 arguments");
00162   }
00163 
00164   # special case for handling inserts:
00165   if ( $start == $end + 1 ) {
00166     return $self->map_insert( $id, $start, $end, $strand, $type );
00167   }
00168 
00169   if ( !$self->{'_is_sorted'} ) { $self->_sort() }
00170 
00171   my $hash = $self->{"_pair_$type"};
00172 
00173   my ( $from, $to, $cs );
00174 
00175   if ( $type eq $self->{'to'} ) {
00176     $from = 'to';
00177     $to   = 'from';
00178     $cs   = $self->{'from_cs'};
00179   } else {
00180     $from = 'from';
00181     $to   = 'to';
00182     $cs   = $self->{'to_cs'};
00183   }
00184 
00185   unless ( defined $hash ) {
00186     throw("Type $type is neither to or from coordinate systems");
00187   }
00188 
00189   if ( !defined $hash->{ uc($id) } ) {
00190     # one big gap!
00191     my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
00192     return $gap;
00193   }
00194 
00195   my $last_used_pair;
00196   my @result;
00197 
00198   my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
00199   my $lr = $hash->{ uc($id) };
00200 
00201   $start_idx = 0;
00202   $end_idx   = $#{$lr};
00203 
00204   # binary search the relevant pairs
00205   # helps if the list is big
00206   while ( ( $end_idx - $start_idx ) > 1 ) {
00207     $mid_idx    = ( $start_idx + $end_idx ) >> 1;
00208     $pair       = $lr->[$mid_idx];
00209     $self_coord = $pair->{$from};
00210     if ( $self_coord->{'end'} < $start ) {
00211       $start_idx = $mid_idx;
00212     } else {
00213       $end_idx = $mid_idx;
00214     }
00215   }
00216 
00217   my $rank              = 0;
00218   my $orig_start        = $start;
00219   my $last_target_coord = undef;
00220   for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
00221     $pair = $lr->[$i];
00222     my $self_coord   = $pair->{$from};
00223     my $target_coord = $pair->{$to};
00224 
00225     #
00226     # But not the case for haplotypes!! need to test for this case???
00227     # so removing this till a better solution is found
00228     #
00229     #
00230     #     if($self_coord->{'start'} < $start){
00231     #       $start = $orig_start;
00232     #       $rank++;
00233     #     }
00234 
00235     if ( defined($last_target_coord)
00236          and $target_coord->{'id'} ne $last_target_coord )
00237     {
00238       if ( $self_coord->{'start'} < $start )
00239       {    # i.e. the same bit is being mapped to another assembled bit
00240         $start = $orig_start;
00241       }
00242     } else {
00243       $last_target_coord = $target_coord->{'id'};
00244     }
00245 
00246     # if we haven't even reached the start, move on
00247     if ( $self_coord->{'end'} < $orig_start ) {
00248       next;
00249     }
00250 
00251     # if we have over run, break
00252     if ( $self_coord->{'start'} > $end ) {
00253       last;
00254     }
00255 
00256     if ( $start < $self_coord->{'start'} ) {
00257       # gap detected
00258       my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
00259                                     $self_coord->{'start'} - 1, $rank );
00260 
00261       push( @result, $gap );
00262       $start = $gap->{'end'} + 1;
00263     }
00264     my ( $target_start, $target_end, $target_ori );
00265     my $res;
00266     if ( exists $pair->{'indel'} ) {
00267       # When next pair is an IndelPair and not a Coordinate, create the
00268       # new mapping Coordinate, the IndelCoordinate.
00269       $target_start = $target_coord->{'start'};
00270       $target_end   = $target_coord->{'end'};
00271 
00272       #create a Gap object
00273       my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
00274         ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
00275       #create the Coordinate object
00276       my $coord =
00277         Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
00278               $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
00279       #and finally, the IndelCoordinate object with
00280       $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
00281     } else {
00282       # start is somewhere inside the region
00283       if ( $pair->{'ori'} == 1 ) {
00284         $target_start =
00285           $target_coord->{'start'} +
00286           ( $start - $self_coord->{'start'} );
00287       } else {
00288         $target_end =
00289           $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
00290       }
00291 
00292       # Either we are enveloping this map or not.  If yes, then end
00293       # point (self perspective) is determined solely by target.  If
00294       # not we need to adjust.
00295 
00296       if ( $end > $self_coord->{'end'} ) {
00297         # enveloped
00298         if ( $pair->{'ori'} == 1 ) {
00299           $target_end = $target_coord->{'end'};
00300         } else {
00301           $target_start = $target_coord->{'start'};
00302         }
00303       } else {
00304         # need to adjust end
00305         if ( $pair->{'ori'} == 1 ) {
00306           $target_end =
00307             $target_coord->{'start'} +
00308             ( $end - $self_coord->{'start'} );
00309         } else {
00310           $target_start =
00311             $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
00312         }
00313       }
00314 
00315       $res =
00316         Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
00317                      $target_start, $target_end, $pair->{'ori'}*$strand,
00318                      $cs, $rank );
00319 
00320     } ## end else [ if ( exists $pair->{'indel'...})]
00321 
00322     push( @result, $res );
00323 
00324     $last_used_pair = $pair;
00325     $start          = $self_coord->{'end'} + 1;
00326 
00327   } ## end for ( my $i = $start_idx...)
00328 
00329   if ( !defined $last_used_pair ) {
00330     my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
00331     push( @result, $gap );
00332 
00333   } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
00334     # gap at the end
00335     my $gap =
00336       Bio::EnsEMBL::Mapper::Gap->new(
00337                                   $last_used_pair->{$from}->{'end'} + 1,
00338                                   $end, $rank );
00339     push( @result, $gap );
00340   }
00341 
00342   if ( $strand == -1 ) {
00343     @result = reverse(@result);
00344   }
00345 
00346   return @result;
00347 } ## end sub map_coordinates
00348 
00349 
00350 
00351 =head2 map_insert
00352 
00353   Arg [1]    : string $id
00354   Arg [2]    : int $start - start coord. Since this is an insert should always
00355                be one greater than end.
00356   Arg [3]    : int $end - end coord. Since this is an insert should always
00357                be one less than start.
00358   Arg [4]    : int $strand (0, 1, -1)
00359   Arg [5]    : string $type - the coordinate system name the coords are from.
00360   Arg [6]    : boolean $fastmap - if specified, this is being called from
00361                the fastmap call. The mapping done is not any faster for
00362                inserts, but the return value is different.
00363   Example    : 
00364   Description: This is in internal function which handles the special mapping
00365                case for inserts (start = end +1).  This function will be called
00366                automatically by the map function so there is no reason to
00367                call it directly.
00368   Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
00369   Exceptions : none
00370   Caller     : map_coordinates()
00371 
00372 =cut
00373 
00374 sub map_insert {
00375   my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_;
00376 
00377   # swap start/end and map the resultant 2bp coordinate
00378   ($start, $end) =($end,$start);
00379 
00380   my @coords = $self->map_coordinates($id, $start, $end, $strand, $type);
00381 
00382   if(@coords == 1) {
00383     my $c = $coords[0];
00384     # swap start and end to convert back into insert
00385     ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
00386   } else {
00387     throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
00388 
00389     # adjust coordinates, remove gaps
00390     my ($c1, $c2);
00391     if($strand == -1) {
00392       ($c2,$c1) = @coords;
00393     } else {
00394       ($c1, $c2) = @coords;
00395     }
00396     @coords = ();
00397 
00398     if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
00399       # insert is after first coord
00400       if($c1->{'strand'} * $strand == -1) {
00401         $c1->{'end'}--;
00402       } else {
00403         $c1->{'start'}++;
00404       }
00405       @coords = ($c1);
00406     }
00407     if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
00408       # insert is before second coord
00409       if($c2->{'strand'} * $strand == -1) {
00410         $c2->{'start'}++;
00411       } else {
00412         $c2->{'end'}--;
00413       }
00414       if($strand == -1) {
00415         unshift @coords, $c2;
00416       } else {
00417         push @coords, $c2;
00418       }
00419     }
00420   }
00421 
00422   if($fastmap) {
00423     return undef if(@coords != 1);
00424     my $c = $coords[0];
00425     return ($c->{'id'}, $c->{'start'}, $c->{'end'},
00426             $c->{'strand'}, $c->{'coord_system'});
00427   }
00428 
00429   return @coords;
00430 }
00431 
00432 
00433 
00434 
00435 
00436 
00437 =head2 fastmap
00438 
00439     Arg  1      string $id
00440                 id of 'source' sequence
00441     Arg  2      int $start
00442                 start coordinate of 'source' sequence
00443     Arg  3      int $end
00444                 end coordinate of 'source' sequence
00445     Arg  4      int $strand
00446                 raw contig orientation (+/- 1)
00447     Arg  5      int $type
00448                 nature of transform - gives the type of
00449                 coordinates to be transformed *from*
00450     Function    inferior map method. Will only do ungapped unsplit mapping.
00451                 Will return id, start, end strand in a list.
00452     Returntype  list of results
00453     Exceptions  none
00454     Caller      Bio::EnsEMBL::AssemblyMapper
00455 
00456 =cut
00457 
00458 sub fastmap {
00459    my ($self, $id, $start, $end, $strand, $type) = @_;
00460 
00461    my ($from, $to, $cs);
00462 
00463    if($end+1 == $start) {
00464      return $self->map_insert($id, $start, $end, $strand, $type, 1);
00465    }
00466 
00467    if( ! $self->{'_is_sorted'} ) { $self->_sort() }
00468 
00469    if($type eq $self->{'to'}) {
00470      $from = 'to';
00471      $to = 'from';
00472      $cs = $self->{'from_cs'};
00473    } else {
00474      $from = 'from';
00475      $to = 'to';
00476      $cs = $self->{'to_cs'};
00477    }
00478 
00479    my $hash = $self->{"_pair_$type"} or
00480        throw("Type $type is neither to or from coordinate systems");
00481 
00482 
00483    my $pairs = $hash->{uc($id)};
00484 
00485    foreach my $pair (@$pairs) {
00486      my $self_coord   = $pair->{$from};
00487      my $target_coord = $pair->{$to};
00488 
00489      # only super easy mapping is done 
00490      if( $start < $self_coord->{'start'} ||
00491          $end > $self_coord->{'end'} ) {
00492        next;
00493      }
00494 
00495      if( $pair->{'ori'} == 1 ) {
00496        return ( $target_coord->{'id'},
00497                 $target_coord->{'start'}+$start-$self_coord->{'start'},
00498                 $target_coord->{'start'}+$end-$self_coord->{'start'},
00499                 $strand, $cs );
00500      } else {
00501        return ( $target_coord->{'id'},
00502                 $target_coord->{'end'} - ($end - $self_coord->{'start'}),
00503                 $target_coord->{'end'} - ($start - $self_coord->{'start'}),
00504                 -$strand, $cs );
00505      }
00506    }
00507 
00508    return ();
00509 }
00510 
00511 
00512 
00513 =head2 add_map_coordinates
00514 
00515     Arg  1      int $id
00516                 id of 'source' sequence
00517     Arg  2      int $start
00518                 start coordinate of 'source' sequence
00519     Arg  3      int $end
00520                 end coordinate of 'source' sequence
00521     Arg  4      int $strand
00522                 relative orientation of source and target (+/- 1)
00523     Arg  5      int $id
00524                 id of 'target' sequence
00525     Arg  6      int $start
00526                 start coordinate of 'target' sequence
00527     Arg  7      int $end
00528                 end coordinate of 'target' sequence
00529     Function    Stores details of mapping between
00530                 'source' and 'target' regions.
00531     Returntype  none
00532     Exceptions  none
00533     Caller      Bio::EnsEMBL::Mapper
00534 
00535 =cut
00536 
00537 sub add_map_coordinates {
00538   my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
00539        $chr_name, $chr_start, $chr_end )
00540     = @_;
00541 
00542   unless (    defined($contig_id)
00543            && defined($contig_start)
00544            && defined($contig_end)
00545            && defined($contig_ori)
00546            && defined($chr_name)
00547            && defined($chr_start)
00548            && defined($chr_end) )
00549   {
00550     throw("7 arguments expected");
00551   }
00552 
00553   if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
00554     throw("Cannot deal with mis-lengthed mappings so far");
00555   }
00556 
00557   my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
00558                                               $contig_end );
00559   my $to =
00560     Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
00561 
00562   my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
00563 
00564   # place into hash on both ids
00565   my $map_to   = $self->{'to'};
00566   my $map_from = $self->{'from'};
00567 
00568   push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } },    $pair );
00569   push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
00570 
00571   $self->{'pair_count'}++;
00572   $self->{'_is_sorted'} = 0;
00573 } ## end sub add_map_coordinates
00574 
00575 
00576 =head2 add_indel_coordinates
00577 
00578     Arg  1      int $id
00579                 id of 'source' sequence
00580     Arg  2      int $start
00581                 start coordinate of 'source' sequence
00582     Arg  3      int $end
00583                 end coordinate of 'source' sequence
00584     Arg  4      int $strand
00585                 relative orientation of source and target (+/- 1)
00586     Arg  5      int $id
00587                 id of 'targe' sequence
00588     Arg  6      int $start
00589                 start coordinate of 'targe' sequence
00590     Arg  7      int $end
00591                 end coordinate of 'targe' sequence
00592     Function    stores details of mapping between two regions:
00593                 'source' and 'target'. Returns 1 if the pair was added, 0 if it
00594                 was already in. Used when adding an indel
00595     Returntype  int 0,1
00596     Exceptions  none
00597     Caller      Bio::EnsEMBL::Mapper
00598 
00599 =cut
00600 
00601 sub add_indel_coordinates{
00602   my ($self, $contig_id, $contig_start, $contig_end, 
00603       $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
00604 
00605   unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
00606      && defined($contig_ori) && defined($chr_name) && defined($chr_start)
00607      && defined($chr_end)) {
00608     throw("7 arguments expected");
00609   }
00610 
00611   #we need to create the IndelPair object to add to both lists, to and from
00612   my $from =
00613     Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
00614   my $to   =
00615     Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
00616 
00617   my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
00618 
00619   # place into hash on both ids
00620   my $map_to = $self->{'to'};
00621   my $map_from = $self->{'from'};
00622 
00623   push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
00624   push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
00625 
00626   $self->{'pair_count'}++;
00627 
00628   $self->{'_is_sorted'} = 0;
00629   return 1;
00630 }
00631 
00632 
00633 =head2 map_indel
00634 
00635   Arg [1]    : string $id
00636   Arg [2]    : int $start - start coord. Since this is an indel should always
00637                be one greater than end.
00638   Arg [3]    : int $end - end coord. Since this is an indel should always
00639                be one less than start.
00640   Arg [4]    : int $strand (0, 1, -1)
00641   Arg [5]    : string $type - the coordinate system name the coords are from.
00642   Example    : @coords = $mapper->map_indel();
00643   Description: This is in internal function which handles the special mapping
00644                case for indels (start = end +1). It will be used to map from
00645                a coordinate system with a gap to another that contains an
00646                insertion. It will be mainly used by the Variation API.
00647   Returntype : Bio::EnsEMBL::Mapper::Unit objects
00648   Exceptions : none
00649   Caller     : general
00650 
00651 =cut
00652 
00653 sub map_indel {
00654   my ( $self, $id, $start, $end, $strand, $type ) = @_;
00655 
00656   # swap start/end and map the resultant 2bp coordinate
00657   ( $start, $end ) = ( $end, $start );
00658 
00659   if ( !$self->{'_is_sorted'} ) { $self->_sort() }
00660 
00661   my $hash = $self->{"_pair_$type"};
00662 
00663   my ( $from, $to, $cs );
00664 
00665   if ( $type eq $self->{'to'} ) {
00666     $from = 'to';
00667     $to   = 'from';
00668     $cs   = $self->{'from_cs'};
00669   } else {
00670     $from = 'from';
00671     $to   = 'to';
00672     $cs   = $self->{'to_cs'};
00673   }
00674 
00675   unless ( defined $hash ) {
00676     throw("Type $type is neither to or from coordinate systems");
00677   }
00678   my @indel_coordinates;
00679 
00680   my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
00681   my $lr = $hash->{ uc($id) };
00682 
00683   $start_idx = 0;
00684   $end_idx   = $#{$lr};
00685 
00686   # binary search the relevant pairs
00687   # helps if the list is big
00688   while ( ( $end_idx - $start_idx ) > 1 ) {
00689     $mid_idx    = ( $start_idx + $end_idx ) >> 1;
00690     $pair       = $lr->[$mid_idx];
00691     $self_coord = $pair->{$from};
00692     if ( $self_coord->{'end'} <= $start ) {
00693       $start_idx = $mid_idx;
00694     } else {
00695       $end_idx = $mid_idx;
00696     }
00697   }
00698 
00699   for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
00700     $pair = $lr->[$i];
00701     my $self_coord   = $pair->{$from};
00702     my $target_coord = $pair->{$to};
00703 
00704     if ( exists $pair->{'indel'} ) {
00705       #need to return unit coordinate
00706       my $to =
00707         Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
00708                                          $target_coord->{'start'},
00709                                          $target_coord->{'end'}, );
00710       push @indel_coordinates, $to;
00711       last;
00712     }
00713   }
00714 
00715   return @indel_coordinates;
00716 } ## end sub map_indel
00717 
00718 
00719 =head2 add_Mapper
00720 
00721     Arg  1      Bio::EnsEMBL::Mapper $mapper2
00722     Example     $mapper->add_Mapper($mapper2)
00723     Function    add all the map coordinates from $mapper to this mapper.
00724                 This object will contain mapping pairs from both the old
00725                 object and $mapper2.
00726     Returntype  int 0,1
00727     Exceptions  throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
00728                 are incompatible
00729     Caller      $mapper->methodname()
00730 
00731 =cut
00732 
00733 sub add_Mapper{
00734   my ($self, $mapper) = @_;
00735 
00736   my $mapper_to = $mapper->{'to'};
00737   my $mapper_from = $mapper->{'from'};
00738   if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
00739     throw("Trying to add an incompatible Mapper");
00740   }
00741 
00742   my $count_a = 0;
00743   foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
00744     push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
00745         @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
00746     $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
00747   }
00748   my $count_b = 0;
00749   foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
00750     push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
00751         @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
00752     $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
00753   }
00754 
00755   if ($count_a == $count_b) {
00756     $self->{'pair_count'} += $count_a;
00757   } else {
00758     throw("Trying to add a funny Mapper");
00759   }
00760 
00761   $self->{'_is_sorted'} = 0;
00762   return 1;
00763 }
00764 
00765 
00766 
00767 =head2 list_pairs
00768 
00769     Arg  1      int $id
00770                 id of 'source' sequence
00771     Arg  2      int $start
00772                 start coordinate of 'source' sequence
00773     Arg  3      int $end
00774                 end coordinate of 'source' sequence
00775     Arg  4      string $type
00776                 nature of transform - gives the type of
00777                 coordinates to be transformed *from*
00778     Function    list all pairs of mappings in a region
00779     Returntype  list of Bio::EnsEMBL::Mapper::Pair
00780     Exceptions  none
00781     Caller      Bio::EnsEMBL::Mapper
00782 
00783 =cut
00784 
00785 sub list_pairs {
00786   my ( $self, $id, $start, $end, $type ) = @_;
00787 
00788   if ( !$self->{'_is_sorted'} ) { $self->_sort() }
00789 
00790   if ( !defined $type ) {
00791     throw("Expected 4 arguments");
00792   }
00793 
00794   if ( $start > $end ) {
00795     throw(   "Start is greater than end "
00796            . "for id $id, start $start, end $end\n" );
00797   }
00798 
00799   my $hash = $self->{"_pair_$type"};
00800 
00801   my ( $from, $to );
00802 
00803   if ( $type eq $self->{'to'} ) {
00804     $from = 'to';
00805     $to   = 'from';
00806   } else {
00807     $from = 'from';
00808     $to   = 'to';
00809   }
00810 
00811   unless ( defined $hash ) {
00812     throw("Type $type is neither to or from coordinate systems");
00813   }
00814 
00815   my @list;
00816 
00817   unless ( exists $hash->{ uc($id) } ) {
00818     return ();
00819   }
00820 
00821   @list = @{ $hash->{ uc($id) } };
00822 
00823   my @output;
00824   if ( $start == -1 && $end == -1 ) {
00825     return @list;
00826   } else {
00827 
00828     foreach my $p (@list) {
00829 
00830       if ( $p->{$from}->{'end'} < $start ) {
00831         next;
00832       }
00833       if ( $p->{$from}->{'start'} > $end ) {
00834         last;
00835       }
00836       push( @output, $p );
00837     }
00838     return @output;
00839   }
00840 } ## end sub list_pairs
00841 
00842 
00843 =head2 to
00844 
00845     Arg  1      Bio::EnsEMBL::Mapper::Unit $id
00846                 id of 'source' sequence
00847     Function    accessor method form the 'source'
00848                 and 'target' in a Mapper::Pair
00849     Returntype  Bio::EnsEMBL::Mapper::Unit
00850     Exceptions  none
00851     Caller      Bio::EnsEMBL::Mapper
00852 
00853 =cut
00854 
00855 sub to {
00856   my ( $self, $value ) = @_;
00857 
00858   if ( defined($value) ) {
00859     $self->{'to'} = $value;
00860   }
00861 
00862   return $self->{'to'};
00863 }
00864 
00865 =head2 from
00866 
00867     Arg  1      Bio::EnsEMBL::Mapper::Unit $id
00868                 id of 'source' sequence
00869     Function    accessor method form the 'source'
00870                 and 'target' in a Mapper::Pair
00871     Returntype  Bio::EnsEMBL::Mapper::Unit
00872     Exceptions  none
00873     Caller      Bio::EnsEMBL::Mapper
00874 
00875 =cut
00876 sub from {
00877   my ( $self, $value ) = @_;
00878 
00879   if ( defined($value) ) {
00880     $self->{'from'} = $value;
00881   }
00882 
00883   return $self->{'from'};
00884 }
00885 
00886 
00887 #  _dump
00888 #
00889 #     Arg  1      *FileHandle $fh
00890 #     Function    convenience dump function
00891 #                 possibly useful for debugging
00892 #     Returntype  none
00893 #     Exceptions  none
00894 #     Caller      internal
00895 #
00896 
00897 sub _dump{
00898    my ($self,$fh) = @_;
00899 
00900    if( !defined $fh ) {
00901        $fh = \*STDERR;
00902    }
00903 
00904    foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) {
00905        print $fh "From Hash $id\n";
00906        foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) {
00907        print $fh "    ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
00908        }
00909    }
00910 
00911 }
00912 
00913 
00914 # _sort
00915 #
00916 #    Function    sort function so that all
00917 #                mappings are sorted by
00918 #                chromosome start
00919 #    Returntype  none
00920 #    Exceptions  none
00921 #    Caller      internal
00922 #
00923 
00924 sub _sort {
00925   my ($self) = @_;
00926 
00927   my $to   = $self->{'to'};
00928   my $from = $self->{'from'};
00929 
00930   foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
00931     @{ $self->{"_pair_$from"}->{$id} } =
00932       sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
00933       @{ $self->{"_pair_$from"}->{$id} };
00934   }
00935 
00936   foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
00937     @{ $self->{"_pair_$to"}->{$id} } =
00938       sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
00939       @{ $self->{"_pair_$to"}->{$id} };
00940   }
00941 
00942   $self->_merge_pairs();
00943   $self->_is_sorted(1);
00944 }
00945 
00946 # this function merges pairs that are adjacent into one
00947 sub _merge_pairs {
00948   my $self = shift;
00949 
00950   my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
00951 
00952   my $map_to = $self->{'to'};
00953   my $map_from = $self->{'from'};
00954   $self->{'pair_count'} = 0;
00955 
00956   for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
00957     $lr = $self->{"_pair_$map_to"}->{$key}; 
00958     
00959     my $i = 0;
00960     my $next = 1;
00961     my $length = $#{$lr};
00962     while( $next <= $length ) {
00963       $current_pair = $lr->[$i];
00964       $next_pair = $lr->[$next];
00965       $del_pair = undef;
00966       
00967       if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
00968       #necessary to modify the merge function to not merge indels
00969       $next++;
00970       $i++;
00971       }
00972       else{
00973       # duplicate filter
00974       if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'} 
00975          and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) {
00976         $del_pair = $next_pair;
00977       } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
00978           ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
00979           ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
00980           
00981           if( $current_pair->{'ori'} == 1 ) {
00982           # check forward strand merge
00983           if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
00984               # normal merge with previous element
00985               $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
00986               $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
00987               $del_pair = $next_pair;
00988           }
00989           } else {
00990           # check backward strand merge
00991           if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
00992               # yes its a merge
00993               $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
00994               $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
00995               $del_pair = $next_pair;
00996           }
00997           }
00998       }
00999       
01000       if( defined $del_pair ) {
01001           splice( @$lr, $next, 1 );
01002           $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
01003           for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
01004           if( $lr_from->[$j] == $del_pair ) {
01005               splice( @$lr_from, $j, 1 );
01006               last;
01007           }
01008           }
01009           $length--;
01010           if( $length < $next ) { last; }
01011       }
01012       else {
01013           $next++;
01014           $i++;
01015       }
01016       }  
01017       
01018    }
01019     $self->{'pair_count'} += scalar( @$lr );
01020  }
01021 }
01022 
01023 
01024 # _is_sorted
01025 #
01026 #    Arg  1      int $sorted
01027 #    Function    toggle for whether the (internal)
01028 #                map data are sorted
01029 #    Returntype  int
01030 #    Exceptions  none
01031 #    Caller      internal
01032 #
01033 
01034 sub _is_sorted {
01035    my ($self, $value) = @_;
01036    $self->{'_is_sorted'} = $value if (defined($value));
01037    return $self->{'_is_sorted'};
01038 }
01039 
01040 
01041 1;