Archive Ensembl HomeArchive Ensembl Home
SequenceAdaptor.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
00024 
00025 =head1 SYNOPSIS
00026 
00027   my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );
00028 
00029   my $dna =
00030     ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
00031 
00032 =head1 DESCRIPTION
00033 
00034 An adaptor for the retrieval of DNA sequence from the EnsEMBL database
00035 
00036 =head1 METHODS
00037 
00038 =cut
00039 
00040 package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
00041 
00042 use vars qw(@ISA @EXPORT);
00043 use strict;
00044 use warnings;
00045 
00046 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
00047 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
00048 use Bio::EnsEMBL::Utils::Sequence  qw(reverse_comp);
00049 use Bio::EnsEMBL::Utils::Cache;
00050 use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
00051 
00052 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
00053 
00054 our $SEQ_CHUNK_PWR   = 18; # 2^18 = approx. 250KB
00055 our $SEQ_CACHE_SZ    = 5;
00056 our $SEQ_CACHE_MAX   = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;
00057 
00058 @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
00059 
00060 =head2 new
00061 
00062   Arg [1]    : none
00063   Example    : my $sa = $db_adaptor->get_SequenceAdaptor();
00064   Description: Constructor.  Calls superclass constructor and initialises
00065                internal cache structure.
00066   Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
00067   Exceptions : none
00068   Caller     : DBAdaptor::get_SequenceAdaptor
00069   Status     : Stable
00070 
00071 =cut
00072 
00073 sub new {
00074   my $caller = shift;
00075 
00076   my $class = ref($caller) || $caller;
00077 
00078   my $self = $class->SUPER::new(@_);
00079 
00080   # use an LRU cache to limit the size
00081   my %seq_cache;
00082   tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);
00083 
00084   $self->{'seq_cache'} = \%seq_cache;
00085 
00086 
00087 #
00088 # See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
00089 # in a  hash.
00090 #
00091 
00092   my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"');
00093   
00094   $sth->execute();
00095   my ($seq_region_id, $value);
00096   $sth->bind_columns(\$seq_region_id, \$value);
00097   my %edits;
00098   my $count = 0;
00099   while($sth->fetch()){
00100     $count++;
00101     push @{$edits{$seq_region_id}}, $value;
00102   }
00103   $sth->finish;
00104   if($count){
00105     $self->{_rna_edits_cache} = \%edits;
00106   }
00107   
00108   return $self;
00109 }
00110 
00111 
00112 =head2 fetch_by_Slice_start_end_strand
00113 
00114   Arg  [1]   : Bio::EnsEMBL::Slice slice
00115                The slice from which you want the sequence
00116   Arg  [2]   : (optional) int startBasePair 
00117                The start base pair relative to the start of the slice. Negative
00118                values or values greater than the length of the slice are fine.
00119                default = 1
00120   Arg  [3]   : (optional) int endBasePair
00121                The end base pair relative to the start of the slice. Negative
00122                values or values greater than the length of the slice are fine,
00123                but the end must be greater than or equal to the start
00124                count from 1
00125                default = the length of the slice
00126   Arg  [4]   : (optional) int strand 
00127                1, -1
00128                default = 1
00129   Example    : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, 
00130                                                                   1000, -1);
00131   Description: retrieves from db the sequence for this slice
00132                uses AssemblyMapper to find the assembly
00133   Returntype : string 
00134   Exceptions : endBasePair should be less or equal to length of slice 
00135   Caller     : Bio::EnsEMBL::Slice::seq(), Slice::subseq() 
00136   Status     : Stable
00137 
00138 =cut
00139 
00140 sub fetch_by_Slice_start_end_strand {
00141    my ( $self, $slice, $start, $end, $strand ) = @_;
00142 
00143    if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
00144      throw("Slice argument is required.");
00145    }
00146 
00147    $start = 1 if(!defined($start));
00148 
00149  
00150    if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
00151          
00152        if ( !defined($end) || ($start > $end ) ) {
00153        return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
00154        }
00155 
00156        if ( defined($end) && ($end < 0) ) {
00157        $end += $slice->seq_region_length;
00158        }
00159        
00160        if ($start < 0) {
00161            $start += $slice->seq_region_length;
00162        }
00163 
00164        if($slice->start> $slice->end) {
00165            return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
00166        }
00167   }
00168         
00169   if ( ( !defined($end) ) && (not $slice->is_circular) ) {
00170            $end = $slice->end() - $slice->start() + 1;
00171   }
00172 
00173   if ( $start > $end ) {
00174       throw("Start must be less than or equal to end.");
00175   }
00176 
00177    $strand ||= 1;
00178 
00179    #get a new slice that spans the exact region to retrieve dna from
00180    my $right_expand  = $end - $slice->length(); #negative is fine
00181    my $left_expand   = 1 - $start; #negative is fine
00182 
00183    if($right_expand || $left_expand) {
00184      $slice = $slice->expand($left_expand, $right_expand);
00185    }
00186 
00187    #retrieve normalized 'non-symlinked' slices
00188    #this allows us to support haplotypes and PARs
00189    my $slice_adaptor = $slice->adaptor();
00190    my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};
00191 
00192    if(@symproj == 0) {
00193      throw('Could not retrieve normalized Slices. Database contains ' .
00194            'incorrect assembly_exception information.');
00195    }
00196 
00197    #call this method again with any slices that were 'symlinked' to by this
00198    #slice
00199    if(@symproj != 1 || $symproj[0]->[2] != $slice) {
00200      my $seq;
00201      foreach my $segment (@symproj) {
00202        my $symlink_slice = $segment->[2];
00203        #get sequence from each symlinked area
00204        $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
00205                                                         1,undef,1)};
00206      }
00207      if($strand == -1) {
00208        reverse_comp(\$seq);
00209      }
00210      return \$seq;
00211    }
00212 
00213    # we need to project this slice onto the sequence coordinate system
00214    # even if the slice is in the same coord system, we want to trim out
00215    # flanking gaps (if the slice is past the edges of the seqregion)
00216    my $csa = $self->db->get_CoordSystemAdaptor();
00217    my $seqlevel = $csa->fetch_sequence_level();
00218 
00219    my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
00220 
00221    my $seq = '';
00222    my $total = 0;
00223    my $tmp_seq;
00224 
00225    #fetch sequence from each of the sequence regions projected onto
00226    foreach my $segment (@projection) {
00227      my ($start, $end, $seq_slice) = @$segment;
00228 
00229      #check for gaps between segments and pad them with Ns
00230      my $gap = $start - $total - 1;
00231      if($gap) {
00232        $seq .= 'N' x $gap;
00233      }
00234 
00235      my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
00236 
00237      $tmp_seq = ${$self->_fetch_seq($seq_region_id,
00238                                     $seq_slice->start, $seq_slice->length())};
00239 
00240      #reverse compliment on negatively oriented slices
00241      if($seq_slice->strand == -1) {
00242        reverse_comp(\$tmp_seq);
00243      }
00244 
00245      $seq .= $tmp_seq;
00246 
00247      $total = $end;
00248    }
00249 
00250    #check for any remaining gaps at the end
00251    my $gap = $slice->length - $total;
00252    if($gap) {
00253      $seq .= 'N' x $gap;
00254    }
00255 
00256    #if the sequence is too short it is because we came in with a seqlevel
00257    #slice that was partially off of the seq_region.  Pad the end with Ns
00258    #to make long enough
00259    if(length($seq) != $slice->length()) {
00260      $seq .= 'N' x ($slice->length() - length($seq));
00261    }
00262 
00263    if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
00264      $self->_rna_edit($slice,\$seq);
00265    }
00266 
00267    #if they asked for the negative slice strand revcomp the whole thing
00268    reverse_comp(\$seq) if($strand == -1);
00269 
00270    return \$seq;
00271 }
00272 
00273 
00274 sub _fetch_by_Slice_start_end_strand_circular {
00275   my ( $self, $slice, $start, $end, $strand ) = @_;
00276 
00277   assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
00278   
00279   $strand ||= 1;
00280   if ( !defined($start) ) {
00281     $start ||= 1;
00282   }
00283 
00284   if ( !defined($end) ) {
00285       $end = $slice->end() - $slice->start() + 1;
00286   }
00287 
00288   if ( $start > $end && $slice->is_circular() ) {
00289     my ($seq, $seq1, $seq2);
00290 
00291     my $midpoint = $slice->seq_region_length - $slice->start + 1;
00292     $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1,  $midpoint, 1 )};
00293     $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
00294 
00295     $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
00296 
00297     reverse_comp( \$seq ) if ( $strand == -1 );
00298 
00299     return \$seq;
00300   }
00301 
00302 
00303 
00304   # Get a new slice that spans the exact region to retrieve dna from
00305   my $right_expand = $end - $slice->length();    #negative is fine
00306   my $left_expand  = 1 - $start;                 #negative is fine
00307 
00308   if ( $right_expand || $left_expand ) {
00309     $slice =
00310         $slice->strand > 0
00311       ? $slice->expand( $left_expand,  $right_expand )
00312       : $slice->expand( $right_expand, $left_expand );
00313   }
00314 
00315   # Retrieve normalized 'non-symlinked' slices.  This allows us to
00316   # support haplotypes and PARs.
00317   my $slice_adaptor = $slice->adaptor();
00318   my @symproj =
00319     @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };
00320 
00321   if ( @symproj == 0 ) {
00322     throw(   'Could not retrieve normalized Slices. Database contains '
00323            . 'incorrect assembly_exception information.' );
00324   }
00325 
00326   # Call this method again with any slices that were 'symlinked' to by
00327   # this slice.
00328   if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
00329     my $seq;
00330     foreach my $segment (@symproj) {
00331       my $symlink_slice = $segment->[2];
00332 
00333       # Get sequence from each symlinked area.
00334       $seq .= ${
00335         $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
00336                                                 undef, 1 ) };
00337     }
00338     if ( $strand == -1 ) {
00339       reverse_comp( \$seq );
00340     }
00341 
00342     return \$seq;
00343   }
00344 
00345   # We need to project this slice onto the sequence coordinate system
00346   # even if the slice is in the same coord system, we want to trim out
00347   # flanking gaps (if the slice is past the edges of the seqregion).
00348   my $csa      = $self->db->get_CoordSystemAdaptor();
00349   my $seqlevel = $csa->fetch_sequence_level();
00350 
00351   my @projection =
00352     @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };
00353 
00354   my $seq   = '';
00355   my $total = 0;
00356   my $tmp_seq;
00357 
00358   # Fetch sequence from each of the sequence regions projected onto.
00359   foreach my $segment (@projection) {
00360     my ( $start, $end, $seq_slice ) = @{$segment};
00361 
00362     # Check for gaps between segments and pad them with Ns
00363     my $gap = $start - $total - 1;
00364     if ($gap) {
00365       $seq .= 'N' x $gap;
00366     }
00367 
00368     my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
00369 
00370     $tmp_seq = ${
00371       $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
00372                          $seq_slice->length() ) };
00373 
00374     # Reverse compliment on negatively oriented slices.
00375     if ( $seq_slice->strand == -1 ) {
00376       reverse_comp( \$tmp_seq );
00377     }
00378 
00379     $seq .= $tmp_seq;
00380 
00381     $total = $end;
00382   }
00383 
00384   # Check for any remaining gaps at the end.
00385   my $gap = $slice->length() - $total;
00386 
00387   if ($gap) {
00388     $seq .= 'N' x $gap;
00389   }
00390 
00391   # If the sequence is too short it is because we came in with a
00392   # seqlevel slice that was partially off of the seq_region.  Pad the
00393   # end with Ns to make long enough
00394   if ( length($seq) != $slice->length() ) {
00395     $seq .= 'N' x ( $slice->length() - length($seq) );
00396   }
00397 
00398   if ( defined( $self->{_rna_edits_cache} )
00399        && defined(
00400             $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
00401   {
00402     $self->_rna_edit( $slice, \$seq );
00403   }
00404 
00405   return \$seq;
00406 } ## end sub _fetch_by_Slice_start_end_strand_circular
00407 
00408 
00409 
00410 
00411 
00412 sub _rna_edit {
00413   my $self  = shift;
00414   my $slice = shift;
00415   my $seq   = shift; #reference to string
00416 
00417   my $s_start = $slice->start;   #substr start at 0 , but seq starts at 1 (so no -1 here)
00418   my $s_end = $s_start+length($$seq);
00419 
00420   foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
00421     my ($start, $end, $txt) = split (/\s+/, $edit);
00422 # check that RNA edit is not outside the requested region : happens quite often with LRG regions
00423     next if ($end < $s_start);
00424     next if ($s_end < $start);
00425     substr($$seq,$start-$s_start, ($end-$start)+1, $txt);
00426   }
00427   return;
00428 }
00429 
00430 
00431 sub _fetch_seq {
00432   my $self          = shift;
00433   my $seq_region_id = shift;
00434   my $start         = shift;
00435   my $length           = shift;
00436 
00437   my $cache = $self->{'seq_cache'};
00438 
00439   if($length < $SEQ_CACHE_MAX) {
00440     my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR;
00441     my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR;
00442 
00443     # piece together sequence from cached component parts
00444 
00445     my $entire_seq = undef;
00446     for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
00447       if($cache->{"$seq_region_id:$i"}) {
00448         $entire_seq .= $cache->{"$seq_region_id:$i"};
00449       } else {
00450         # retrieve uncached portions of the sequence
00451 
00452         my $sth =
00453           $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
00454                           . "FROM dna d "
00455                           . "WHERE d.seq_region_id = ?" );
00456 
00457         my $tmp_seq;
00458 
00459         my $min = ($i << $SEQ_CHUNK_PWR) + 1;
00460 
00461         $sth->bind_param( 1, $min,                SQL_INTEGER );
00462         $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER );
00463         $sth->bind_param( 3, $seq_region_id,      SQL_INTEGER );
00464 
00465         $sth->execute();
00466         $sth->bind_columns(\$tmp_seq);
00467         $sth->fetch();
00468         $sth->finish();
00469 
00470         # always give back uppercased sequence so it can be properly softmasked
00471         $entire_seq .= uc($tmp_seq);
00472         $cache->{"$seq_region_id:$i"} = uc($tmp_seq);
00473       }
00474     }
00475 
00476     # return only the requested portion of the entire sequence
00477     my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1;
00478     # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR;
00479     my $seq = substr( $entire_seq, $start - $min, $length );
00480 
00481     return \$seq;
00482   } else {
00483 
00484     # do not do any caching for requests of very large sequences
00485     my $sth =
00486       $self->prepare(   "SELECT SUBSTRING(d.sequence, ?, ?) "
00487                       . "FROM dna d "
00488                       . "WHERE d.seq_region_id = ?" );
00489 
00490     my $tmp_seq;
00491 
00492     $sth->bind_param( 1, $start,         SQL_INTEGER );
00493     $sth->bind_param( 2, $length,        SQL_INTEGER );
00494     $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
00495 
00496     $sth->execute();
00497     $sth->bind_columns(\$tmp_seq);
00498     $sth->fetch();
00499     $sth->finish();
00500 
00501     # always give back uppercased sequence so it can be properly softmasked
00502     $tmp_seq = uc($tmp_seq);
00503 
00504     return \$tmp_seq;
00505   }
00506 }
00507 
00508 
00509 =head2 store
00510 
00511   Arg [1]    : int $seq_region_id the id of the sequence region this dna
00512                will be associated with.
00513   Arg [2]    : string $sequence the dna sequence to be stored 
00514                in the database.  Note that the sequence passed in will be
00515                converted to uppercase.
00516   Example    : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
00517   Description: stores a dna sequence in the databases dna table and returns the
00518                database identifier for the new record.
00519   Returntype : none
00520   Exceptions : throw if the database insert fails
00521   Caller     : sequence loading scripts
00522   Status     : Stable
00523 
00524 =cut
00525 
00526 sub store {
00527   my ($self, $seq_region_id, $sequence) = @_;
00528 
00529   if(!$seq_region_id) {
00530     throw('seq_region_id is required');
00531   }
00532 
00533   $sequence = uc($sequence);
00534 
00535   my $statement = 
00536     $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
00537 
00538   $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
00539   $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
00540   $statement->execute();
00541 
00542   $statement->finish();
00543 
00544   return;
00545 }
00546 
00547 
00548 
00549 
00550 =head2 fetch_by_assembly_location
00551 
00552   Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead.
00553 
00554 =cut
00555 
00556 sub fetch_by_assembly_location {
00557    my ( $self, $chrStart, $chrEnd, 
00558         $strand, $chrName, $assemblyType ) = @_;
00559 
00560    deprecate('Use fetch_by_Slice_start_end_strand() instead');
00561 
00562    my $csa = $self->db->get_CoordSystem();
00563    my $top_cs = @{$csa->fetch_all};
00564 
00565    my $slice_adaptor = $self->db->get_SliceAdaptor();
00566    my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName,
00567                                                $chrStart, $chrEnd,
00568                                                $strand, $top_cs->version);
00569 
00570    return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1);
00571 }
00572 
00573 
00574 =head2 fetch_by_RawContig_start_end_strand
00575 
00576   Description: DEPRECATED use fetch_by_Slice_start_end_strand instead
00577 
00578 =cut
00579 
00580 sub fetch_by_RawContig_start_end_strand {
00581   deprecate('Use fetch_by_Slice_start_end_strand instead.');
00582   fetch_by_Slice_start_end_strand(@_);
00583 }
00584 
00585 
00586 
00587 
00588 1;