Archive Ensembl HomeArchive Ensembl Home
CompressedSequenceAdaptor.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor - Facilitates DB storage and retrieval of compressed sequence
00024 
00025 =head1 SYNOPSIS
00026 
00027   $seq_adptr = $database_adaptor->get_SequenceAdaptor();
00028 
00029   $dna =
00030     ${ $seq_adptr->fetch_by_Slice_start_end_strand( $slice, 1, 1000,
00031       -1 ) };
00032 
00033 =head1 DESCRIPTION
00034 
00035 An adaptor for the retrieval of compressed DNA sequence from the EnsEMBL 
00036 database
00037 
00038 =head1 METHODS
00039 
00040 =cut
00041 
00042 package Bio::EnsEMBL::DBSQL::CompressedSequenceAdaptor;
00043 
00044 use vars qw(@ISA);
00045 use strict;
00046 
00047 use Bio::EnsEMBL::DBSQL::SequenceAdaptor;
00048 
00049 @ISA = qw(Bio::EnsEMBL::DBSQL::SequenceAdaptor);
00050 
00051 
00052 sub _fetch_seq {
00053   my $self          = shift;
00054   my $seq_region_id = shift;
00055   my $start         = shift;
00056   my $len           = shift;
00057 
00058   #calculate the offset and start in the compressed sequence 
00059   my $comp_start  = ($start-1 >> 2) + 1;
00060   my $comp_len    = ($len  >> 2) + 2;
00061 
00062   my ($bvector, $nline);
00063 
00064   my $sth = $self->prepare(
00065                "SELECT SUBSTRING( d.sequence, ?, ?), n_line
00066                 FROM dnac d
00067                 WHERE d.seq_region_id = ?");
00068   $sth->bind_param(1,$comp_start,SQL_INTEGER);
00069   $sth->bind_param(2,$comp_len ,SQL_INTEGER);
00070   $sth->bind_param(3,$seq_region_id,SQL_INTEGER);
00071   $sth->execute();
00072   $sth->bind_columns(\$bvector, \$nline);
00073   $sth->fetch();
00074   $sth->finish();
00075 
00076   #convert sequence from binary string to 0123 string
00077   my $bitlen = length($bvector) << 2;
00078   my $str = '';
00079   for(my $i=0; $i < $bitlen; $i++) {
00080     $str .= vec($bvector, $i, 2);
00081   }
00082 
00083   #convert from 0123 to ACTG
00084   $str =~ tr/0123/ACTG/;
00085 
00086   $str = substr($str, ($start-1)%4, $len);
00087 
00088   #expand the nlines and place them back in the sequence
00089   my @nlines = split(/:/, $nline);
00090   foreach my $nl (@nlines) {
00091     my ($offset,$char,$nlen) = $nl =~ /(\d+)(\D)(\d+)/;
00092     
00093     #skip nlines entirely out of range
00094     next if(($offset+$nlen-1) < $start || $offset > ($start+$len-1));
00095     
00096     #obtain relative offset into requested region
00097     $offset = $offset -  $start + 1;
00098 
00099     #nlines that partially overlap requested region have to be shrunk
00100     if($offset < 1) {
00101       $nlen = $nlen - (1-$offset);
00102       $offset = 1;
00103     }
00104     if($offset + $nlen > $start+$len) {
00105       $nlen = $len - $offset + 1;
00106     }
00107 
00108     substr($str,$offset-1,$nlen) = $char x $nlen;    
00109   }
00110 
00111   return \$str;
00112 }
00113 
00114 
00115 =head2 store
00116 
00117   Arg [1]    : string $seq_region_id the id of the sequence region this dna
00118                will be associated with.
00119   Arg [2]    : string reference  $sequence the dna sequence to be stored in 
00120                the database
00121   Example    : $dbID = $seq_adaptor->store(12,\'ACTGGGTACCAAACAAACACAACA');
00122   Description: stores a dna sequence in the databases dna table and returns the
00123                database identifier for the new record.
00124   Returntype : int
00125   Exceptions : none
00126   Caller     : Bio::EnsEMBL::DBSQL::RawContigAdaptor::store
00127   Status     : Stable
00128 
00129 =cut
00130 
00131 sub store {
00132   my ($self, $seq_region_id, $sequence) = @_;
00133 
00134   if(!$seq_region_id) {
00135     throw('seq_region_id is required');
00136   }
00137 
00138   $sequence = uc($sequence);
00139 
00140   my $bvector = '';
00141 
00142   #convert sequence to 0s,1s,2s and 3s
00143   $sequence =~ tr/ACTG/0123/;
00144 
00145   #nlines cover sequence which is not ACTG such as N
00146   #nline format is a set of colon delimited int, char, int triplets:
00147   #<offset><code><length>
00148   my($nline_char,$nline_len,$nline_off);
00149   my @nlines;
00150 
00151   my $len    = length($sequence);
00152   for(my $i=0; $i < $len; $i++) {
00153     my $char = substr($sequence,$i,1);
00154 
00155     #quickly check if this character was an A,C,T or G (and was converted to
00156     # a 0,1,2,3)
00157     if($char =~ /[0-3]/) {
00158       vec($bvector, $i,2) = $char;
00159       if($nline_char) {
00160         #end of an nline
00161         push @nlines, "$nline_off$nline_char$nline_len";
00162         $nline_char = undef;
00163         $nline_len  = 0;
00164         $nline_off  = 0;
00165       } 
00166     } else {
00167       #this was not an ACTG
00168       if($nline_char) {
00169         if($nline_char eq $char) {
00170           #continuation of an nline
00171           $nline_len++;
00172         } else {
00173           #end of a previous nline and start of a new one
00174           push @nlines, "$nline_off$nline_char$nline_len";
00175           $nline_char = $char;
00176           $nline_len  = 1;
00177           $nline_off  = $i+1;
00178         }
00179       } else {
00180         #start of a new nline
00181         $nline_char = $char;
00182         $nline_len  = 1;
00183         $nline_off  = $i+1;
00184       }
00185       $char = 0; #need to put numeric val into bitvector despite nline
00186     }
00187 
00188     vec($bvector, $i,2) = $char; 
00189   }
00190 
00191   my $nline = join(':', @nlines);
00192   my $statement = $self->prepare(
00193         "INSERT INTO dnac(seq_region_id, sequence, n_line) VALUES(?,?,?)");
00194 
00195   $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
00196   $statement->bind_param(2,$bvector,SQL_BLOB);
00197   $statement->bind_param(3,$nline,SQL_LONGVARCHAR);
00198   $statement->execute();
00199 
00200   $statement->finish();
00201   return;
00202 }
00203 
00204 
00205 1;