Archive Ensembl HomeArchive Ensembl Home
TranscriptMapper.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 TranscriptMapper - A utility class used to perform coordinate conversions
00024 between a number of coordinate systems relating to transcripts
00025 
00026 =head1 SYNOPSIS
00027 
00028   my $trmapper = Bio::EnsEMBL::TranscriptMapper->new($transcript);
00029 
00030   @coords = $trmapper->cdna2genomic( 123, 554 );
00031 
00032   @coords = $trmapper->genomic2cdna( 141, 500, -1 );
00033 
00034   @coords = $trmapper->genomic2cds( 141, 500, -1 );
00035 
00036   @coords = $trmapper->pep2genomic( 10, 60 );
00037 
00038   @coords = $trmapper->genomic2pep( 123, 400, 1 );
00039 
00040 =head1 DESCRIPTION
00041 
00042 This is a utility class which can be used to perform coordinate conversions
00043 between a number of coordinate systems relating to transcripts.
00044 
00045 =head1 METHODS
00046 
00047 =cut
00048 
00049 package Bio::EnsEMBL::TranscriptMapper;
00050 
00051 use strict;
00052 use warnings;
00053 
00054 use Bio::EnsEMBL::Utils::Exception qw(throw);
00055 
00056 use Bio::EnsEMBL::Mapper;
00057 use Bio::EnsEMBL::Mapper::Gap;
00058 use Bio::EnsEMBL::Mapper::Coordinate;
00059 
00060 
00061 
00062 =head2 new
00063 
00064   Arg [1]    : Bio::EnsEMBL::Transcript $transcript
00065                The transcript for which a TranscriptMapper should be created.
00066   Example    : $trans_mapper = Bio::EnsEMBL::TranscriptMapper->new($transcript)
00067   Description: Creates a TranscriptMapper object which can be used to perform
00068                various coordinate transformations relating to transcripts.
00069                Note that the TranscriptMapper uses the transcript state at the
00070                time of creation to perform the conversions, and that a new
00071                TranscriptMapper must be created if the Transcript is altered.
00072                'Genomic' coordinates are coordinates which are relative to the
00073                slice that the Transcript is on.
00074   Returntype : Bio::EnsEMBL::TranscriptMapper
00075   Exceptions : throws if a transcript is not an argument
00076   Caller     : Transcript::get_TranscriptMapper
00077   Status     : Stable
00078 
00079 =cut
00080 
00081 sub new {
00082   my $caller = shift;
00083   my $transcript = shift;
00084 
00085   my $class = ref($caller) || $caller;
00086 
00087   if(!ref($transcript) || !$transcript->isa('Bio::EnsEMBL::Transcript')) {
00088     throw("Transcript argument is required.");
00089   }
00090 
00091 
00092   my $exons = $transcript->get_all_Exons();
00093   my $start_phase;
00094   if(@$exons) {
00095     $start_phase = $exons->[0]->phase;
00096   } else {
00097     $start_phase = -1;
00098   }
00099 
00100   # Create a cdna <-> genomic mapper and load it with exon coords
00101   my $mapper = _load_mapper($transcript,$start_phase);
00102 
00103   my $self = bless({'exon_coord_mapper' => $mapper,
00104                     'start_phase'       => $start_phase,
00105                     'cdna_coding_start' => $transcript->cdna_coding_start(),
00106                     'cdna_coding_end'   => $transcript->cdna_coding_end()},
00107                    $class);
00108 }
00109 
00110 
00111 =head2 _load_mapper
00112 
00113   Arg [1]    : Bio::EnsEMBL::Transcript $transcript
00114                The transcript for which a mapper should be created.
00115   Example    : my $mapper = _load_mapper($transcript);
00116   Description: loads the mapper
00117   Returntype : Bio::EnsEMBL::Mapper
00118   Exceptions : none
00119   Caller     : Internal
00120   Status     : Stable
00121 
00122 =cut
00123 
00124 sub _load_mapper {
00125   my $transcript = shift;
00126   my $start_phase = shift;
00127 
00128   my $mapper = Bio::EnsEMBL::Mapper->new( 'cdna', 'genomic');
00129 
00130   my $edits_on = $transcript->edits_enabled();
00131   my @edits;
00132 
00133   if($edits_on) {
00134     @edits = @{$transcript->get_all_SeqEdits()};
00135     @edits = sort {$a->start() <=> $b->start()} @edits;
00136   }
00137 
00138   my $edit_shift = 0;
00139 
00140   my $cdna_start = undef;
00141 
00142   my $cdna_end = 0;
00143 
00144 
00145   foreach my $ex (@{$transcript->get_all_Exons}) {
00146     my $gen_start = $ex->start();
00147     my $gen_end   = $ex->end();
00148 
00149     $cdna_start = $cdna_end + 1;
00150     $cdna_end   = $cdna_start + $ex->length() - 1;
00151 
00152     my $strand = $ex->strand();
00153 
00154     # add deletions and insertions into pairs when SeqEdits turned on
00155     # ignore mismatches (i.e. treat as matches)
00156     if($edits_on) {
00157       while(@edits && $edits[0]->start() + $edit_shift <= $cdna_end) {
00158 
00159         my $edit = shift(@edits);
00160         my $len_diff = $edit->length_diff();
00161 
00162         if($len_diff) {
00163           # break pair into two parts, finish first pair just before edit
00164 
00165           my $prev_cdna_end   = $edit->start() + $edit_shift - 1;
00166           my $prev_cdna_start = $cdna_start;
00167           my $prev_len        = $prev_cdna_end - $prev_cdna_start + 1;
00168 
00169           my $prev_gen_end;
00170           my $prev_gen_start;
00171           if($strand == 1) {
00172             $prev_gen_start = $gen_start;
00173             $prev_gen_end   = $gen_start + $prev_len - 1;
00174           } else {
00175             $prev_gen_start = $gen_end - $prev_len + 1;
00176             $prev_gen_end   = $gen_end;
00177           }
00178 
00179           if($prev_len > 0) { # only create map pair if not boundary case
00180             $mapper->add_map_coordinates
00181               ('cdna', $prev_cdna_start, $prev_cdna_end, $strand,
00182                'genome', $prev_gen_start,$prev_gen_end);
00183           }
00184 
00185           $cdna_start = $prev_cdna_end  + 1;
00186 
00187           if($strand == 1) {
00188             $gen_start  = $prev_gen_end + 1;
00189           } else {
00190             $gen_end    = $prev_gen_start - 1;
00191           }
00192 
00193           $cdna_end  += $len_diff;
00194 
00195           if($len_diff > 0) {
00196             # insert in cdna, shift cdna coords along
00197             $cdna_start += $len_diff;
00198           } else {
00199             # delete in cdna (insert in genomic), shift genomic coords along
00200 
00201             if($strand == 1) {
00202               $gen_start  -= $len_diff;
00203             } else {
00204               $gen_end    += $len_diff;
00205             }
00206           }
00207 
00208           $edit_shift += $len_diff;
00209         }
00210       }
00211     }
00212 
00213     my $pair_len = $cdna_end - $cdna_start + 1;
00214 
00215     if($pair_len > 0) {
00216       $mapper->add_map_coordinates('cdna', $cdna_start, $cdna_end, $strand,
00217                                    'genome', $gen_start, $gen_end);
00218     }
00219   }
00220 
00221   return $mapper;
00222 }
00223 
00224 
00225 =head2 cdna2genomic
00226 
00227   Arg [1]    : $start
00228                The start position in cdna coordinates
00229   Arg [2]    : $end
00230                The end position in cdna coordinates
00231   Example    : @cdna_coords = $transcript_mapper->cdna2genomic($start, $end);
00232   Description: Converts cdna coordinates to genomic coordinates.  The
00233                return value is a list of coordinates and gaps.
00234   Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
00235                Bio::EnsEMBL::Mapper::Gap objects
00236   Exceptions : throws if no start or end
00237   Caller     : general
00238   Status     : Stable
00239 
00240 =cut
00241 
00242 
00243 sub cdna2genomic {
00244   my ($self,$start,$end) = @_;
00245 
00246   if( !defined $end ) {
00247     throw("Must call with start/end");
00248   }
00249 
00250   my $mapper = $self->{'exon_coord_mapper'};
00251 
00252   return  $mapper->map_coordinates( 'cdna', $start, $end, 1, "cdna" );
00253 
00254 }
00255 
00256 
00257 =head2 genomic2cdna
00258 
00259   Arg [1]    : $start
00260                The start position in genomic coordinates
00261   Arg [2]    : $end
00262                The end position in genomic coordinates
00263   Arg [3]    : $strand
00264                The strand of the genomic coordinates (default value 1)
00265   Example    : @coords = $trans_mapper->genomic2cdna($start, $end, $strnd);
00266   Description: Converts genomic coordinates to cdna coordinates.  The
00267                return value is a list of coordinates and gaps.  Gaps
00268                represent intronic or upstream/downstream regions which do
00269                not comprise this transcripts cdna.  Coordinate objects
00270                represent genomic regions which map to exons (utrs included).
00271   Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
00272                Bio::EnsEMBL::Mapper::Gap objects
00273   Exceptions : throws if start, end or strand not defined
00274   Caller     : general
00275   Status     : Stable
00276 
00277 =cut
00278 
00279 sub genomic2cdna {
00280   my ($self, $start, $end, $strand) = @_;
00281 
00282   unless(defined $start && defined $end && defined $strand) {
00283     throw("start, end and strand arguments are required\n");
00284   }
00285 
00286   my $mapper = $self->{'exon_coord_mapper'};
00287 
00288   return $mapper->map_coordinates("genome", $start, $end, $strand,"genomic");
00289 
00290 }
00291 
00292 
00293 =head2 cds2genomic
00294 
00295   Arg [1]    : int $start
00296                start position in cds coords
00297   Arg [2]    : int $end
00298                end position in cds coords
00299   Example    : @genomic_coords = $transcript_mapper->cds2genomic(69, 306);
00300   Description: Converts cds coordinates into genomic coordinates.  The
00301                coordinates returned are relative to the same slice that the
00302                transcript used to construct this TranscriptMapper was on.
00303   Returntype : list of Bio::EnsEMBL::Mapper::Gap and
00304                Bio::EnsEMBL::Mapper::Coordinate objects
00305   Exceptions : throws if no end
00306   Caller     : general
00307   Status     : at risk
00308 
00309 =cut
00310 
00311 sub cds2genomic {
00312   my ( $self, $start, $end ) = @_;
00313 
00314   if ( !( defined($start) && defined($end) ) ) {
00315     throw("Must call with start and end");
00316   }
00317 
00318   # Move start end into translate cDNA coordinates now.
00319   $start = $start +( $self->{'cdna_coding_start'} - 1 ) ;
00320   $end = $end + ( $self->{'cdna_coding_start'} - 1 );
00321 
00322   return $self->cdna2genomic( $start, $end );
00323 }
00324 
00325 =head2 pep2genomic
00326 
00327   Arg [1]    : int $start
00328                start position in peptide coords
00329   Arg [2]    : int $end
00330                end position in peptide coords
00331   Example    : @genomic_coords = $transcript_mapper->pep2genomic(23, 102);
00332   Description: Converts peptide coordinates into genomic coordinates.  The
00333                coordinates returned are relative to the same slice that the
00334                transcript used to construct this TranscriptMapper was on.
00335   Returntype : list of Bio::EnsEMBL::Mapper::Gap and
00336                Bio::EnsEMBL::Mapper::Coordinate objects
00337   Exceptions : throws if no end
00338   Caller     : general
00339   Status     : Stable
00340 
00341 =cut
00342 
00343 sub pep2genomic {
00344   my ( $self, $start, $end ) = @_;
00345 
00346   if ( !( defined($start) && defined($end) ) ) {
00347     throw("Must call with start and end");
00348   }
00349 
00350   # Take possible N-padding at beginning of CDS into account.
00351   my $start_phase = $self->{'start_phase'};
00352   my $shift = ( $start_phase > 0 ) ? $start_phase : 0;
00353 
00354   # Move start end into translate cDNA coordinates now.
00355   $start = 3*$start - 2 + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
00356   $end = 3*$end + ( $self->{'cdna_coding_start'} - 1 ) - $shift;
00357 
00358   return $self->cdna2genomic( $start, $end );
00359 }
00360 
00361 
00362 =head2 genomic2cds
00363 
00364   Arg [1]    : int $start
00365                The genomic start position
00366   Arg [2]    : int $end
00367                The genomic end position
00368   Arg [3]    : int $strand
00369                The genomic strand
00370   Example    : @cds_coords = $trans_mapper->genomic2cds($start, $end, $strand);
00371   Description: Converts genomic coordinates into CDS coordinates of the
00372                transcript that was used to create this transcript mapper.
00373   Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
00374                Bio::EnsEMBL::Mapper::Gap objects
00375   Exceptions : throw if start, end or strand not defined
00376   Caller     : general
00377   Status     : Stable
00378 
00379 =cut
00380 
00381 sub genomic2cds {
00382  my ($self, $start, $end, $strand) = @_;
00383 
00384  if(!defined($start) || !defined($end) || !defined($strand)) {
00385    throw("start, end and strand arguments are required");
00386  }
00387 
00388  if($start > $end + 1) {
00389    throw("start arg must be less than or equal to end arg + 1");
00390  }
00391 
00392  my $cdna_cstart = $self->{'cdna_coding_start'};
00393  my $cdna_cend   = $self->{'cdna_coding_end'};
00394 
00395  #this is a pseudogene if there is no coding region
00396  if(!defined($cdna_cstart)) {
00397    #return a gap of the entire requested region, there is no CDS
00398    return Bio::EnsEMBL::Mapper::Gap->new($start,$end);
00399  }
00400 
00401  my @coords = $self->genomic2cdna($start, $end, $strand);
00402 
00403  my @out;
00404 
00405  foreach my $coord (@coords) {
00406    if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
00407      push @out, $coord;
00408    } else {
00409      my $start = $coord->start;
00410      my $end   = $coord->end;
00411 
00412      if($coord->strand == -1 || $end < $cdna_cstart || $start > $cdna_cend) {
00413        #is all gap - does not map to peptide
00414        push @out, Bio::EnsEMBL::Mapper::Gap->new($start,$end);
00415      } else {
00416        #we know area is at least partially overlapping CDS
00417     
00418        my $cds_start = $start - $cdna_cstart + 1;
00419        my $cds_end   = $end   - $cdna_cstart + 1;
00420 
00421        if($start < $cdna_cstart) {
00422          #start of coordinates are in the 5prime UTR
00423          push @out, Bio::EnsEMBL::Mapper::Gap->new($start, $cdna_cstart-1);
00424 
00425          #start is now relative to start of CDS
00426          $cds_start = 1;
00427        }
00428     
00429        my $end_gap = undef;
00430        if($end > $cdna_cend) {
00431          #end of coordinates are in the 3prime UTR
00432          $end_gap = Bio::EnsEMBL::Mapper::Gap->new($cdna_cend + 1, $end);
00433          #adjust end to relative to CDS start
00434          $cds_end = $cdna_cend - $cdna_cstart + 1;
00435        }
00436 
00437        #start and end are now entirely in CDS and relative to CDS start
00438        $coord->start($cds_start);
00439        $coord->end($cds_end);
00440 
00441        push @out, $coord;
00442 
00443        if($end_gap) {
00444          #push out the region which was in the 3prime utr
00445          push @out, $end_gap;
00446        }
00447      }  
00448    }
00449  }
00450 
00451  return @out;
00452 
00453 }
00454 
00455 
00456 =head2 genomic2pep
00457 
00458   Arg [1]    : $start
00459                The start position in genomic coordinates
00460   Arg [2]    : $end
00461                The end position in genomic coordinates
00462   Arg [3]    : $strand
00463                The strand of the genomic coordinates
00464   Example    : @pep_coords = $transcript->genomic2pep($start, $end, $strand);
00465   Description: Converts genomic coordinates to peptide coordinates.  The
00466                return value is a list of coordinates and gaps.
00467   Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and
00468                Bio::EnsEMBL::Mapper::Gap objects
00469   Exceptions : throw if start, end or strand not defined
00470   Caller     : general
00471   Status     : Stable
00472 
00473 =cut
00474 
00475 sub genomic2pep {
00476  my ($self, $start, $end, $strand) = @_;
00477 
00478  unless(defined $start && defined $end && defined $strand) {
00479    throw("start, end and strand arguments are required");
00480  }
00481 
00482  my @coords = $self->genomic2cds($start, $end, $strand);
00483 
00484  my @out;
00485 
00486  my $start_phase = $self->{'start_phase'};
00487 
00488  #take into account possible N padding at beginning of CDS
00489  my $shift = ($start_phase > 0) ? $start_phase : 0;
00490 
00491  foreach my $coord (@coords) {
00492    if($coord->isa('Bio::EnsEMBL::Mapper::Gap')) {
00493      push @out, $coord;
00494    } else {
00495 
00496      #start and end are now entirely in CDS and relative to CDS start
00497 
00498      #convert to peptide coordinates
00499      my $pep_start = int(($coord->start + $shift + 2) / 3);
00500      my $pep_end   = int(($coord->end   + $shift + 2) / 3);
00501      $coord->start($pep_start);
00502      $coord->end($pep_end);
00503 
00504      push @out, $coord;
00505    }    
00506  }
00507 
00508  return @out;
00509 }
00510 
00511 
00512 1;