Archive Ensembl HomeArchive Ensembl Home
Upstream.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Upstream - Object that defines an upstream region
00024 
00025 =head1 SYNOPSIS
00026 
00027   use Bio::EnsEMBL::Upstream;
00028 
00029   my $upstream = Bio::EnsEMBL::Upstream->new(
00030     -transcript => $transcript,
00031     -length     => 2000           # bp
00032   );
00033 
00034   # Retrieve coordinates of upstream region
00035   my $upstream_region_start = $upstream->upstart;
00036   my $upstream_region_end   = $upstream->upend;
00037 
00038   # Retrieve coordinates in 'downstream' first intron
00039   my $intron_region_start = $upstream->downstart;
00040   my $intron_region_end   = $upstream->downend;
00041 
00042   # Coordinates are returned in the same scheme as the input transcript.
00043   # However, the coordinates of an upstream region can be transformed to
00044   # any other scheme using a slice
00045 
00046   $upstream->transform($slice);
00047 
00048   # Coordinates can be retrieved in scheme in the same manner as the
00049   # above.
00050 
00051 =head1 DESCRIPTION
00052 
00053 An object that determines the upstream region of a transcript.  Such a
00054 region is non-coding and ensures that other genes or transcripts are
00055 not present.  Ultimately, these objects can be used to looking for
00056 promoter elements.  To this end, it is also possible to derive a region
00057 downstream of the first exon, within the first intron and where promoter
00058 elements sometimes are found.
00059 
00060 =head1 METHODS
00061 
00062 =cut
00063 
00064 package Bio::EnsEMBL::Upstream;
00065 
00066 use strict;
00067 use vars qw(@ISA);
00068 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00069 use Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor;
00070 
00071 use Bio::EnsEMBL::Utils::Exception qw(throw);
00072 use Bio::EnsEMBL::Utils::Argument  qw(rearrange);
00073 
00074 @ISA = qw(Bio::EnsEMBL::Feature);
00075 
00076 
00077 =head2 new
00078 
00079   Arg [transcript] : (optional) Bio::EnsEMBL::Transcript
00080   Arg [length]     : (optional) int $length
00081   Example    : $upstream = Bio::EnsEMBL::Upstream->new(-transcript => $transcript, 
00082                                -length => 2000);
00083   Description: Creates a new upstream object
00084   Returntype : Bio::EnsEMBL::Upstream
00085   Exceptions : none
00086   Caller     : Bio::EnsEMBL::Transcript, general
00087   Status     : Stable
00088 
00089 =cut
00090 
00091 sub new {
00092   my ($class, @args) = @_;
00093   my $self = {};
00094   
00095   bless $self, $class;
00096   
00097   my ($transcript, 
00098       $length) = rearrange([qw(TRANSCRIPT
00099                    LENGTH
00100                   )],@args);
00101     
00102   $self->transcript($transcript)    if defined $transcript;
00103   $self->length($length)        if $length;
00104   
00105   return $self
00106 }
00107 
00108 =head2 transcript
00109 
00110   Arg        : (optional) Bio::EnsEMBL::Transcript
00111   Example    : $self->transcript($transcript);
00112   Description: Getter/setter for transcript object
00113   Returntype : Bio::EnsEMBL::Transcript
00114   Exceptions : Throws if argument is not undefined 
00115                or a Bio::EnsEMBL::Transcript
00116   Caller     : $self->new, $self->_derive_coords, 
00117                $self->_first_coding_Exon
00118   Status     : Stable
00119 
00120 =cut
00121 
00122 
00123 sub transcript {
00124   my $self = shift;
00125   
00126   if (@_){
00127     $self->{_transcript} = shift;
00128     
00129     if (defined $self->{_transcript}) {
00130       throw("Transcript is not a Bio::EnsEMBL::Transcript") 
00131     if (! $self->{_transcript}->isa("Bio::EnsEMBL::Transcript"));
00132       $self->_flush_cache;
00133     }
00134   }
00135   
00136   return $self->{_transcript}
00137 }
00138 
00139 =head2 length
00140 
00141   Arg        : (optional) int $length
00142   Example    : $self->length(2000); # bp
00143   Description: Getter/setter for upstream region length.
00144   Returntype : int
00145   Exceptions : Throws if length is requested before it has been set.
00146   Caller     : $self->new, $self->_derive_coords
00147   Status     : Stable
00148 
00149 =cut
00150 
00151 sub length {
00152   my $self = shift;
00153   
00154   if (@_){
00155     $self->{_length} = shift;
00156     $self->_flush_cache;
00157   }
00158   
00159   throw("Region length has not been set.")
00160     unless $self->{_length};
00161   
00162   return $self->{_length}
00163 }
00164 
00165 =head2 _flush_cache
00166 
00167   Arg        : none
00168   Example    : $self->_flush_cache;
00169   Description: Empties cached coordinates (called when 
00170            coordinate scheme or region length has changed).
00171   Returntype : none
00172   Exceptions : none
00173   Caller     : $self->length, $self->transform
00174   Status     : Stable
00175 
00176 =cut
00177 
00178 sub _flush_cache {
00179   my $self = shift;
00180   
00181   $self->upstart(undef);
00182   $self->upend(undef);
00183   $self->downstart(undef);
00184   $self->downend(undef);
00185 }
00186 
00187 =head2 upstart
00188 
00189   Arg        : none
00190   Example    : $self->upstart;
00191   Description: Returns the start coordinate of the region 
00192                upstream of the transcript.  This coordinate 
00193                is always the furthest from the translation 
00194                initiation codon, whereas upend always abutts 
00195                the translation initiation codon.
00196   Returntype : int
00197   Exceptions : none
00198   Caller     : general
00199   Status     : Stable
00200 
00201 =cut
00202 
00203 sub upstart {
00204   my $self = shift;
00205   
00206   if (@_) {
00207     $self->{_upstart} = shift @_;
00208     return
00209   }
00210 
00211   if (! defined $self->{_upstart}) {
00212     $self->_derive_coords('up');
00213   }
00214 
00215   return $self->{_upstart}
00216 }
00217 
00218 =head2 upend
00219 
00220   Arg        : none
00221   Example    : $self->upend;
00222   Description: Returns the end coordinate of the region 
00223                upstream of the transcript.  This coordinate 
00224                always always abutts the translation 
00225                initiation codon, whereas upstart always 
00226                returns the coorindate furthest from the 
00227                translation initiation codon.
00228   Returntype : int
00229   Exceptions : none
00230   Caller     : general
00231   Status     : Stable
00232 
00233 =cut
00234 
00235 sub upend {
00236   my $self = shift;
00237     
00238   if (@_) {
00239     $self->{_upend} = shift @_;
00240     return
00241   }
00242 
00243   if (! defined $self->{_upend}) {
00244     $self->_derive_coords('up');
00245   }
00246 
00247   return $self->{_upend}
00248 }
00249 
00250 =head2 downstart
00251 
00252   Arg        : none
00253   Example    : $self->downstart;
00254   Description: Returns the start coordinate of the region 
00255                in the first intron of the transcript.  This 
00256                coordinate is always closest to the first 
00257                exon (irregardless of strand).
00258   Returntype : int
00259   Exceptions : none
00260   Caller     : general
00261   Status     : Stable
00262 
00263 =cut
00264 
00265 sub downstart {
00266   my $self = shift;
00267     
00268   if (@_) {
00269     $self->{_downstart} = shift @_;
00270     return
00271   }
00272 
00273   if (! defined $self->{_downstart}) {
00274     $self->_derive_coords('down');
00275   }
00276 
00277   return $self->{_downstart}
00278 }
00279 
00280 =head2 downend
00281 
00282   Arg        : none
00283   Example    : $self->downend;
00284   Description: Returns the end coordinate of the region 
00285                in the first intron of the transcript.  This 
00286                coordinate is always furthest from the first 
00287                exon (irregardless of strand).
00288   Returntype : int
00289   Exceptions : none
00290   Caller     : general
00291   Status     : Stable
00292 
00293 =cut
00294 
00295 sub downend {
00296   my $self = shift;
00297 
00298   if (@_) {
00299     $self->{_downend} = shift @_;
00300     return
00301   }
00302 
00303   if (! defined $self->{_downend}) {
00304     $self->_derive_coords('down');
00305   }
00306 
00307   return $self->{_downend}
00308 }
00309 
00310 =head2 transform
00311 
00312   Arg        : 
00313   Example    : 
00314   Description: Not yet implemented
00315   Returntype : 
00316   Exceptions : 
00317   Caller     : 
00318   Status     : At Risk
00319 
00320 =cut
00321 
00322 
00323 # Over-riding inherited class.  As yet unimplemented.
00324 
00325 sub transform {
00326   my $self = shift;
00327 
00328   throw("No transform method implemented for " . $self);
00329 }
00330 
00331 =head2 derive_upstream_coords
00332 
00333   Arg        : none
00334   Example    : my ($upstart, $upend) 
00335                        = $self->derive_upstream_coords;
00336   Description: Derives upstream coordinates (for 
00337                compatability with older scripts).
00338   Returntype : arrayref
00339   Exceptions : none
00340   Caller     : general
00341   Status     : Stable
00342 
00343 =cut
00344 
00345 sub derive_upstream_coords {
00346   my $self = shift;
00347 
00348   return [$self->upstart, $self->upend]
00349 }
00350 
00351 =head2 derive_downstream_coords
00352 
00353   Arg        : none
00354   Example    : my ($downstart, $downend) 
00355                        = $self->derive_downstream_coords;
00356   Description: Derives downstream coordinates (for 
00357                compatability with older scripts).
00358   Returntype : arrayref
00359   Exceptions : none
00360   Caller     : general
00361   Status     : Stable
00362 
00363 =cut
00364 
00365 sub derive_downstream_coords {
00366   my $self = shift;
00367 
00368   return [$self->downstart, $self->downend]
00369 }
00370 
00371 =head2 _derive_coords
00372 
00373   Arg        : string $direction (either 'up' or 'down').
00374   Example    : $self->_derive_coords('up');
00375   Description: Determines the coordinates of either upstream 
00376                or downstream region.
00377   Returntype : none
00378   Exceptions : Throws if argument is not either 'up' or 'down'
00379   Caller     : $self->upstart, $self->upend, $self->downstart, 
00380                $self->downend
00381   Status     : Stable
00382 
00383 =cut
00384 
00385 sub _derive_coords {
00386   my ($self, $direction) = @_;
00387 
00388   # Check direction
00389   throw("Must specify either \'up\' of \'down\'-stream direction to derive coords.")
00390     unless (($direction eq 'up')||($direction eq 'down'));
00391 
00392   # Put things in easily accessible places.
00393   my $core_db_slice_adaptor = $self->transcript->slice->adaptor;
00394   my $region_length = $self->length;
00395 
00396   # Whatever coord system the gene is currently is, transform to the toplevel.
00397   my $transcript = $self->transcript->transform('toplevel');
00398 
00399   # Use our transformed transcript to determine the upstream region coords.
00400   # End should always be just before the coding start (like ATG), including 3' UTR.
00401   # Start is the outer limit of the region upstream (furthest from ATG).
00402 
00403   my $region_start;
00404   my $region_end;
00405 
00406   if ($transcript->strand == 1){
00407     if ($direction eq 'up'){
00408       $region_end = $transcript->coding_region_start - 1;
00409       $region_start = $region_end - $region_length;
00410     } elsif ($direction eq 'down'){
00411       $region_end = $self->_first_coding_Exon->end + 1;
00412       $region_start = $region_end + $region_length;
00413     }
00414   } elsif ($transcript->strand == -1) {
00415     if ($direction eq 'up'){
00416       $region_end = $transcript->coding_region_end + 1;
00417       $region_start = $region_end + $region_length;
00418 
00419     } elsif ($direction eq 'down'){
00420       $region_end = $self->_first_coding_Exon->start - 1;
00421       $region_start = $region_end - $region_length;
00422     }
00423   }
00424 
00425   # Trim the upstream/downstream region to remove extraneous coding sequences 
00426   # from other genes and/or transcripts.
00427     
00428   my ($slice_low_coord, $slice_high_coord) = sort {$a <=> $b} ($region_start, $region_end);
00429 
00430   my $region_slice 
00431       = $core_db_slice_adaptor->fetch_by_region($transcript->slice->coord_system->name,
00432                         $transcript->slice->seq_region_name, 
00433                         $slice_low_coord, 
00434                         $slice_high_coord);
00435 
00436   if ($transcript->strand == 1) {
00437     if ($direction eq 'up') {
00438       $region_start += $self->_bases_to_trim('left_end', $region_slice);
00439     } elsif ($direction eq 'down') {
00440       $region_start -= $self->_bases_to_trim('right_end', $region_slice);
00441     }
00442   } elsif ($transcript->strand == -1) {
00443     if ($direction eq 'up') {
00444       $region_start -= $self->_bases_to_trim('right_end', $region_slice);
00445     } elsif ($direction eq 'down') {
00446       $region_start += $self->_bases_to_trim('left_end', $region_slice);
00447     }
00448   }
00449 
00450   # Always return start < end
00451 
00452   ($region_start, $region_end) = sort {$a <=> $b} ($region_start, $region_end);
00453 
00454   if ($direction eq 'up') {
00455     $self->upstart($region_start);
00456     $self->upend($region_end);
00457   } elsif ($direction eq 'down') {
00458     $self->downstart($region_start);
00459     $self->downend($region_end);
00460   }
00461 }
00462 
00463 =head2 _bases_to_trim
00464 
00465   Arg        : string $end_to_trim (either 'right_end' or 
00466                'left_end').
00467   Arg        : Bio::EnsEMBL::Slice
00468   Example    : $self->_derive_coords('right_end', $slice);
00469   Description: Finds exons from other genes/transcripts that
00470                invade our upstream/downstream slice and 
00471                returns the number of bases that should be 
00472                truncated from the appropriate end of the 
00473                upstream/downstream region.
00474   Returntype : in
00475   Exceptions : Throws if argument is not either 'right_end' 
00476                or 'left_end'
00477   Caller     : $self->_derive_coords
00478   Status     : Stable
00479 
00480 =cut
00481 
00482 # Method to look for coding regions that invade the upstream region.  For
00483 # now, this method returns the number of bases to trim.  I doesn't yet
00484 # do anything special if an exon is completely swallowed (truncates at 
00485 # the end of the overlapping exon and discards any non-coding sequence 
00486 # further upstream) or overlaps the 'wrong' end of the region (cases where 
00487 # two alternate exons share one end of sequence - does this happen?).
00488 
00489 # The input argument 'end' defines the end of the slice that should be 
00490 # truncated. 
00491 
00492 sub _bases_to_trim {
00493     my ($self, $end_to_trim, $slice) = @_;
00494 
00495     throw "Slice end argument must be either left_end or right_end" 
00496     unless ($end_to_trim eq 'right_end' || $end_to_trim eq 'left_end'); 
00497 
00498     my @overlap_coords;
00499     my $slice_length = $slice->length;
00500     my $right_trim = 0;
00501     my $left_trim  = 0;
00502 
00503     foreach my $exon (@{$slice->get_all_Exons}){
00504       next if $exon->stable_id eq $self->_first_coding_Exon->stable_id;
00505 
00506       my $start = $exon->start;
00507       my $end   = $exon->end;
00508 
00509       # Choose from four possible exon arrangements
00510 
00511       #  -----|********************|----- Slice
00512       #  --|=========================|--- Exon arrangement 1
00513       #  ----------|======|-------------- Exon arrangement 2
00514       #  --|=======|--------------------- Exon arrangement 3
00515       #  -------------------|=========|-- Exon arrangement 4
00516 
00517 
00518       if ($start <=  0 && $end >= $slice_length) {     # exon arrangement 1
00519     $right_trim = $slice_length - 1;
00520     $left_trim  = $slice_length - 1;
00521     last;
00522 
00523       } elsif ($start >= 0 && $end <= $slice_length) { # exon arrangement 2
00524     my $this_right_trim = ($slice_length - $start) + 1;
00525 
00526     $right_trim = $this_right_trim 
00527       if $this_right_trim > $right_trim;
00528 
00529     $left_trim  = $end 
00530       if $end > $left_trim;
00531 
00532       } elsif ($start <= 0 && $end < $slice_length) {  # exon arrangement 3
00533     $right_trim = $slice_length; # a bit draconian
00534     $left_trim  = $end 
00535       if $end > $left_trim;
00536 
00537       } elsif ($start > 0 && $end >= $slice_length) {  # exon arrangement 4
00538     my $this_right_trim = ($slice_length - $start) + 1;
00539 
00540     $right_trim = $this_right_trim 
00541       if $this_right_trim > $right_trim;
00542 
00543     $left_trim = $slice_length; # also a bit draconian
00544       }
00545 
00546     }
00547 
00548     return $right_trim  if $end_to_trim eq 'right_end';
00549     return $left_trim   if $end_to_trim eq 'left_end';
00550 }
00551 
00552 =head2 _first_coding_Exon
00553 
00554   Arg        : none
00555   Example    : $self->_first_coding_Exon;
00556   Description: Finds the first exon of our transcript that 
00557                contains coding bases.
00558   Returntype : Bio::EnsEMBL::Exon
00559   Exceptions : none
00560   Caller     : $self->_derive_coords, $self->_bases_to_trim
00561   Status     : Stable
00562 
00563 =cut
00564 
00565 sub _first_coding_Exon {
00566   my $self = shift;
00567 
00568   unless ($self->{_first_coding_exon}){
00569 
00570     my $exons = $self->transcript->get_all_translateable_Exons;
00571 
00572     $self->{_first_coding_exon} =  $exons->[0]  
00573       if $self->transcript->strand == 1;
00574     $self->{_first_coding_exon} =  $exons->[-1] 
00575       if $self->transcript->strand == -1;
00576   }
00577 
00578   return $self->{_first_coding_exon}
00579 }
00580 
00581 
00582 return 1;