Archive Ensembl HomeArchive Ensembl Home
CollectionAdaptor.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::DBFile::CollectionAdaptor
00024 
00025 =head1 SYNOPSIS
00026 
00027 For use with a Bio::EnsEMBL::Collector e.g.
00028 
00029     package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor;
00030 
00031     @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor 
00032               Bio::EnsEMBL::Funcgen::Collector::ResultFeature 
00033               Bio::EnsEMBL::DBFile::CollectionAdaptor);
00034     #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor
00035 
00036 
00037 Fetch wrapper methods access file based data via read_collection_blob:
00038 
00039     sub _fetch_from_file_by_Slice_ResultSet{
00040 
00041         #define filepath/config
00042 
00043         my $packed_scores =  $self->read_collection_blob(
00044                                                        $filepath,
00045                                                        $efg_sr_id,
00046                                                        $conf->{$window_size}{'byte_offset'},
00047                                                        $conf->{$window_size}{'byte_length'},
00048                                                       );
00049 
00050         #Do unpacking and object creation here
00051 
00052     }
00053 
00054 =head1 DESCRIPTION
00055 
00056 Adaptor for direct access to collection (.col) files, which are binary, compressed,
00057 fixed-width format files providing window-based values across the genome. Collection
00058 files integrate an index block which contains seq_region byte offset values.
00059 
00060 NOTE: By default all collection files are generated and packed using little endian encoding. 
00061 Due to the lack of standards for float encoding (with respect to endianness), perl packs
00062 floats using the implicit endianness of the underlying architecture. This means that accessing
00063 float collection files located on a big endian architecture will produce unexpected results.
00064 
00065 # endian issues will disappear with knetfile xsubs
00066 
00067 =head1 SEE ALSO
00068 
00069 Bio::EnsEMBL::DBFile::FileAdaptor
00070 
00071 =cut
00072 
00073 
00074 
00075 package Bio::EnsEMBL::DBFile::CollectionAdaptor;
00076 
00077 use strict;
00078 use warnings;
00079 
00080 use Bio::EnsEMBL::DBFile::FileAdaptor;
00081 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
00082 use vars qw(@ISA);
00083 @ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
00084 
00085 
=head2 initialise_filehandle

  Arg[1]     : string  - filepath
  Example    : $self->initialise_filehandle($filepath);
  Description: Initialises the cached filehandle for use, in this case by
               reading the index (seq_region offsets) from the current
               position of the filehandle and caching it under
               {file_cache}{$filepath}{off_sets}.
               The cached filehandle is set to undef if the index cannot be
               read or is malformed.
  Returntype : hashref - seq_region key => byte offset pairs, i.e. whatever is
               cached in {off_sets} for this file (undef if no index has
               been read successfully)
  Exceptions : warns if a read fails or if the index size is not a whole
               number of key/offset pairs
  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
  Status     : at risk

=cut

sub initialise_filehandle{
  my ($self, $filepath) = @_;
  my $fh = $self->{file_cache}{$filepath}{filehandle};

  ### INDEX FORMAT ###
  #The first block of the index is the index size in bytes (not inc size block),
  #encoded as v (2 bytes).
  #
  #Rest of index is a hash of sr_id(v 2 bytes) key offset(V 4 bytes) value pairs
  #offsets include the length of the complete index block
  #
  #sysread is used as it counts in bytes explicitly.
  #Can't mix sysread with buffered read/seek due to I/O buffering differences
  my $size_bytes = 2;  #v
  my $pair_bytes = 6;  #v(2 bytes) + V(4 bytes)

  my $index_size;
  my $read_bytes = sysread($fh, $index_size, $size_bytes);

  if(! ((defined $read_bytes) && ($read_bytes == $size_bytes))){
    #! defined is error 0 is end of file
    warn "Failed to read index size from $filepath\n$!";

    #Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
    return $self->{file_cache}{$filepath}{off_sets};
  }

  ($index_size) = unpack('v', $index_size);

  if($index_size % $pair_bytes){
    #A partial key/offset pair would give a fractional repeat count in the
    #unpack template below, so treat this like any other failed index read
    warn "Index size ($index_size bytes) in $filepath is not a multiple of ".
      "$pair_bytes bytes, index is corrupt\n";
    undef $self->{file_cache}{$filepath}{filehandle};
    return $self->{file_cache}{$filepath}{off_sets};
  }

  my $index;
  $read_bytes = sysread($fh, $index, $index_size);  #Now read index proper

  if(! ((defined $read_bytes) && ($read_bytes == $index_size))){
    #! defined is error 0 is end of file
    warn "Failed to read index from $filepath\n$!";

    #Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
    return $self->{file_cache}{$filepath}{off_sets};
  }

  #Number of key-value pairs => $index_size /(size of key + size of offset)
  my $num_keys        = $index_size / $pair_bytes;
  my $unpack_template = '(vV)'.$num_keys;
  my %offset_index    = unpack($unpack_template, $index);

  $self->{file_cache}{$filepath}{off_sets} = \%offset_index;

  return $self->{file_cache}{$filepath}{off_sets};
}
00153 
00154 
=head2 read_collection_blob

  Arg[1]     : string - filepath
  Arg[2]     : int    - seq_region_id
  Arg[3]     : int    - seq_region offset. The byte offset required to
                        locate the required start position
  Arg[4]     : int    - byte length to read
  Example    : my $blob_substr = $self->read_collection_blob($filepath,
                                                             $sr_key,
                                                             $sr_offset,
                                                             $byte_length);
  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
               Sets the cached filehandle to undef if the read errors or
               returns fewer bytes than requested (other than a clean EOF).
  Returntype : string - packed binary data. undef if there is no filehandle,
               the seq_region key is not in the index, the seek fails or the
               read errors/is short. Empty string if EOF is encountered.
  Exceptions : warns if seek or read errors
  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
  Status     : at risk

=cut

# We could change this to take a Slice, hence we could check 
# whether an EOF error is because the slice is out of range 
# and undef only if it is in range i.e. the index/file is corrupt
# overkill?
# This is something the Slice API should warn about
# but will still cause undef'd filehandle here
# Index should also contain ends, so we can validate whether the slice is out of range???


sub read_collection_blob{
  my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;

  my $blob_substr;
  my $fh = $self->get_filehandle($filepath, {-binmode => 1});
  return $blob_substr if ! defined $fh;

  #Return from query cache here?
  #cache key = "$filepath:$key:$sr_offset:$byte_length"

  #seq_region keys which are not part of the index are silently skipped
  return $blob_substr
    if ! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key};

  #define total offset
  my $total_offset = $self->{file_cache}{$filepath}{off_sets}{$sr_key} + $sr_offset;

  #0(whence) is SEEK_SET.
  #sysseek returns '0 but true' for position 0, so false always means failure
  my $seeked = sysseek($fh, $total_offset, 0);

  if(! $seeked){
    warn("Failed to seek to byte $total_offset in $filepath");
    #Don't undef fh here as this valid Slice maybe out of range
    #and we don't want to kill a valid fh
    #i.e. Slice start/end is past end of seq_region
    return $blob_substr;
  }

  my $read_bytes = sysread($fh, $blob_substr, $byte_length);

  if(! ((defined $read_bytes) && ($read_bytes == $byte_length))){
    #! defined is error 0 is end of file
    warn "Failed to read from $filepath\n$!";

    #Must test defined first: undef == 0 is true in numeric context, which
    #would otherwise report a genuine read error as End Of File and leave
    #the broken filehandle in the cache
    if((defined $read_bytes) && ($read_bytes == 0)){
      #This maybe because the slice is out of range!
      #The API gives no warning about this

      warn "End Of File encountered\n";
      warn "Total offset:\t".$self->{file_cache}{$filepath}{off_sets}{$sr_key}.
        "  key($sr_key)  + $sr_offset = $total_offset\n";

      #add some checks against the theoretical/true length of the file?
    }
    else{  #Delete fh as it is useless/unsafe to retry
      undef $self->{file_cache}{$filepath}{filehandle};
      #Don't hand back whatever sysread left in the buffer
      undef $blob_substr;
    }
  }

  return $blob_substr;
}
00241 
00242 
00243 1;