Archive Ensembl HomeArchive Ensembl Home
Cache.pm
Go to the documentation of this file.
00001 # This package, originally distributed by CPAN, has been modified from
00002 # its original version in order to be used by the ensembl project.
00003 #
00004 # 8 July 2002 - changed package name
00005 #
00006 
00007 #package Tie::Cache; # old package
00008 package Bio::EnsEMBL::Utils::Cache;
00009 
00010 use strict;
00011 use vars qw(
00012  $VERSION $Debug $STRUCT_SIZE $REF_SIZE
00013  $BEFORE $AFTER $KEY $VALUE $BYTES $DIRTY
00014 );
00015 
00016 $VERSION = .17;
00017 $Debug = 0; # set to 1 for summary, 2 for debug output
00018 $STRUCT_SIZE = 240; # per cached elem bytes overhead, approximate
00019 $REF_SIZE    = 16;
00020 
00021 # NODE ARRAY STRUCT
00022 $KEY    = 0;
00023 $VALUE  = 1;
00024 $BYTES  = 2;
00025 $BEFORE = 3;
00026 $AFTER  = 4;
00027 $DIRTY  = 5;
00028 
00029 =pod
00030 
00031 =head1 NAME
00032 
00033 Tie::Cache - LRU Cache in Memory
00034 
00035 =head1 SYNOPSIS
00036 
00037  use Tie::Cache;
00038  tie %cache, 'Tie::Cache', 100, { Debug => 1 };   
00039  tie %cache2, 'Tie::Cache', { MaxCount => 100, MaxBytes => 50000 };
00040  tie %cache3, 'Tie::Cache', 100, { Debug => 1 , WriteSync => 0};   
00041 
00042  # Options ##################################################################
00043  #
00044  # Debug =>  0 - DEFAULT, no debugging output
00045  #       1 - prints cache statistics upon destroying
00046  #       2 - prints detailed debugging info
00047  #
00048  # MaxCount =>   Maximum entries in cache.
00049  #
00050  # MaxBytes =>   Maximum bytes taken in memory for cache based on approximate 
00051  #               size of total cache structure in memory
00052  #
00053  #               There is approximately 240 bytes used per key/value pair in the cache for 
00054  #               the cache data structures, so a cache of 5000 entries would take
00055  #               at approximately 1.2M plus the size of the data being cached.
00056  #
00057  # MaxSize  =>   Maximum size of each cache entry. Larger entries are not cached.
00058  #                   This helps prevent much of the cache being flushed when 
00059  #                   you set an exceptionally large entry.  Defaults to MaxBytes/10
00060  #
00061  # WriteSync =>  1 - DEFAULT, write() when data is dirtied for 
00062  #                   TRUE CACHE (see below)
00063  #               0 - write() dirty data as late as possible, when leaving 
00064  #                   cache, or when cache is being DESTROY'd
00065  #
00066  ############################################################################
00067 
00068  # cache supports normal tied hash functions
00069  $cache{1} = 2;       # STORE
00070  print "$cache{1}\n"; # FETCH
00071 
00072  # FIRSTKEY, NEXTKEY
00073  while(($k, $v) = each %cache) { print "$k: $v\n"; } 
00074  
00075  delete $cache{1};    # DELETE
00076  %cache = ();         # CLEAR
00077 
00078 =head1 DESCRIPTION
00079 
00080 This module implements a least recently used (LRU) cache in memory
00081 through a tie interface.  Any time data is stored in the tied hash,
00082 that key/value pair has an entry time associated with it, and 
00083 as the cache fills up, those members of the cache that are
00084 the oldest are removed to make room for new entries.
00085 
00086 So, the cache only "remembers" the last written entries, up to the 
00087 size of the cache.  This can be especially useful if you access 
00088 great amounts of data, but only access a minority of the data a 
00089 majority of the time. 
00090 
00091 The implementation is a hash, for quick lookups, 
00092 overlaying a doubly linked list for quick insertion and deletion.
00093 On a WinNT PII 300, writes to the hash were done at a rate 
00094 3100 per second, and reads from the hash at 6300 per second.   
00095 Work has been done to optimize refreshing cache entries that are 
00096 frequently read from, code like $cache{entry}, which moves the 
00097 entry to the end of the linked list internally.
00098 
00099 =cut Documentation continues at the end of the module.
00100 
00101 sub TIEHASH {
00102     my($class, $max_count, $options) = @_;
00103 
00104     if(ref($max_count)) {
00105     $options = $max_count;
00106     $max_count = $options->{MaxCount};
00107     }
00108     
00109     unless($max_count || $options->{MaxBytes}) {
00110     die('you must specify cache size with either MaxBytes or MaxCount');
00111     }
00112 
00113     my $sync = exists($options->{WriteSync}) ? $options->{WriteSync} : 1;
00114 
00115     my $self = bless 
00116       { 
00117        # how many items to cache
00118        max_count=> $max_count, 
00119        
00120        # max bytes to cache
00121        max_bytes => $options->{MaxBytes},
00122        
00123        # max size (in bytes) of an individual cache entry
00124        max_size => $options->{MaxSize} || ($options->{MaxBytes} ? (int($options->{MaxBytes}/10) + 1) : 0),
00125        
00126        # class track, so know if overridden subs should be used
00127        'class'    => $class,
00128        'subclass' => $class ne 'Tie::Cache' ? 1 : 0,
00129        
00130        # current sizes
00131        count=>0,
00132        bytes=>0,
00133        
00134        # inner structures
00135        head=>0, 
00136        tail=>0, 
00137        nodes=>{},
00138        'keys'=>[],
00139        
00140        # statistics
00141        hit => 0,
00142        miss => 0,
00143        
00144        # config
00145        sync => $sync,
00146        dbg => $options->{Debug} || $Debug
00147        
00148        
00149       }, $class;
00150     
00151     if (($self->{max_bytes} && ! $self->{max_size})) {
00152     die("MaxSize must be defined when MaxBytes is");
00153     }
00154 
00155     if($self->{max_bytes} and $self->{max_bytes} < 1000) {
00156     die("cannot set MaxBytes to under 1000, each raw entry takes $STRUCT_SIZE bytes alone");
00157     }
00158 
00159     if($self->{max_size} && $self->{max_size} < 3) {
00160     die("cannot set MaxSize to under 3 bytes, assuming error in config");
00161     }
00162 
00163     $self;
00164 }
00165 
00166 # override to write data leaving cache
00167 sub write { undef; }
00168 # commented this section out for speed
00169 #    my($self, $key, $value) = @_;
00170 #    1;
00171 #}
00172 
00173 # override to get data if not in cache, should return $value
00174 # associated with $key
00175 sub read { undef; }
00176 # commented this section out for speed
00177 #    my($self, $key) = @_;
00178 #    undef;
00179 #}
00180 
00181 sub FETCH {
00182     my($self, $key) = @_;
00183 
00184     my $node = $self->{nodes}{$key};
00185     if($node) {
00186     # refresh node's entry
00187     $self->{hit}++; # if $self->{dbg};
00188 
00189     # we used to call delete then insert, but we streamlined code
00190     if(my $after = $node->[$AFTER]) {
00191         $self->{dbg} > 1 and $self->print("update() node $node to tail of list");
00192         # reconnect the nodes
00193         my $before = $after->[$BEFORE] = $node->[$BEFORE];
00194         if($before) {
00195         $before->[$AFTER] = $after;
00196         } else {
00197         $self->{head} = $after;
00198         }
00199 
00200         # place at the end
00201         $self->{tail}[$AFTER] = $node;
00202         $node->[$BEFORE] = $self->{tail};
00203         $node->[$AFTER] = undef;
00204         $self->{tail} = $node; # always true after this
00205     } else {
00206         # if there is nothing after node, then we are at the end already
00207         # so don't do anything to move the nodes around
00208         die("this node is the tail, so something's wrong") 
00209         unless($self->{tail} eq $node);
00210     }
00211 
00212     $self->print("FETCH [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
00213     $node->[$VALUE];
00214     } else {
00215     # we have a cache miss here
00216     $self->{miss}++; # if $self->{dbg};
00217 
00218     # its fine to always insert a node, even when we have an undef,
00219     # because even if we aren't a sub-class, we should assume use
00220     # that would then set the entry.  This model works well with
00221     # sub-classing and reads() that might want to return undef as
00222     # a valid value.
00223     my $value;
00224     if ($self->{subclass}) {
00225         $self->print("read() for key $key") if $self->{dbg} > 1;
00226         $value = $self->read($key);
00227     }
00228 
00229     if(defined $value) {
00230         my $length;
00231         if($self->{max_size}) {
00232         # check max size of entry, that it not exceed max size
00233         $length = &_get_data_length(\$key, \$value);
00234         if($length > $self->{max_size}) {
00235             $self->print("direct read() [$key, $value]") if ($self->{dbg} > 1);
00236             return $value;
00237         }
00238         }
00239         # if we get here, we should insert the new node
00240         $node = &create_node($self, \$key, \$value, $length);
00241         &insert($self, $node);
00242         $value;
00243     } else {
00244         undef;
00245     }
00246     }
00247 }
00248 
00249 sub STORE {
00250     my($self, $key, $value) = @_;
00251     my $node;
00252 
00253     $self->print("STORE [$key,$value]") if ($self->{dbg} > 1);
00254 
00255     # do not cache undefined values
00256     defined($value) || return(undef);
00257 
00258     # check max size of entry, that it not exceed max size
00259     my $length;
00260     if($self->{max_size}) {
00261     $length = &_get_data_length(\$key, \$value);
00262     if($length > $self->{max_size}) {
00263         if ($self->{subclass}) {
00264         $self->print("direct write() [$key, $value]") if ($self->{dbg} > 1);
00265         $self->write($key, $value);
00266         }
00267         return $value;
00268     }
00269     }
00270 
00271     # do we have node already ?
00272     if($self->{nodes}{$key}) {
00273     $node = &delete($self, $key);
00274 #   $node = &delete($self, $key);
00275 #   $node->[$VALUE] = $value;
00276 #   $node->[$BYTES] = $length || &_get_data_length(\$key, \$value);
00277     }
00278 
00279     # insert new node  
00280     $node = &create_node($self, \$key, \$value, $length);
00281 #    $node ||= &create_node($self, \$key, \$value, $length);
00282     &insert($self, $node);
00283 
00284     # if the data is sync'd call write now, otherwise defer the data
00285     # writing, but mark it dirty so it can be cleanup up at the end
00286     if ($self->{subclass}) {
00287     if($self->{sync}) {
00288         $self->print("sync write() [$key, $value]") if $self->{dbg} > 1;
00289         $self->write($key, $value);
00290     } else {
00291         $node->[$DIRTY] = 1;
00292     }
00293     }
00294 
00295     $value;
00296 }
00297 
00298 sub DELETE {
00299     my($self, $key) = @_;
00300 
00301     $self->print("DELETE $key") if ($self->{dbg} > 1);
00302     my $node = $self->delete($key);
00303     $node ? $node->[$VALUE] : undef;
00304 }
00305 
00306 sub CLEAR {
00307     my($self) = @_;
00308 
00309     $self->print("CLEAR CACHE") if ($self->{dbg} > 1);
00310 
00311     if($self->{subclass}) {
00312     my $flushed = $self->flush();
00313     $self->print("FLUSH COUNT $flushed") if ($self->{dbg} > 1);
00314     }
00315 
00316     my $node;
00317     while($node = $self->{head}) {
00318     $self->delete($self->{head}[$KEY]);
00319     }
00320 
00321     1;
00322 }
00323 
00324 sub EXISTS {
00325     my($self, $key) = @_;
00326     exists $self->{nodes}{$key};
00327 }
00328     
00329 # firstkey / nextkey emulate keys() and each() behavior by
00330 # taking a snapshot of all the nodes at firstkey, and 
00331 # iterating through the keys with nextkey
00332 #
00333 # this method therefore will only supports one each() / keys()
00334 # happening during any given time.
00335 #
00336 sub FIRSTKEY {
00337     my($self) = @_;
00338 
00339     $self->{'keys'} = [];
00340     my $node = $self->{head};
00341     while($node) {
00342     push(@{$self->{'keys'}}, $node->[$KEY]);
00343     $node = $node->[$AFTER];
00344     }
00345 
00346     shift @{$self->{'keys'}};
00347 }
00348 
00349 sub NEXTKEY {
00350     my($self, $lastkey) = @_;
00351     shift @{$self->{'keys'}};
00352 }
00353 
00354 sub DESTROY {
00355     my($self) = @_;
00356 
00357     # if debugging, snapshot cache before clearing
00358     if($self->{dbg}) {
00359     if($self->{hit} || $self->{miss}) {
00360         $self->{hit_ratio} = 
00361         sprintf("%4.3f", $self->{hit} / ($self->{hit} + $self->{miss})); 
00362     }
00363     $self->print($self->pretty_self());
00364     if($self->{dbg} > 1) {
00365         $self->print($self->pretty_chains());
00366     }
00367     }
00368     
00369     $self->print("DESTROYING") if $self->{dbg} > 1;
00370     $self->CLEAR();
00371     
00372     1;
00373 }
00374 
00375 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
00376 ## Helper Routines
00377 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
00378 
00379 # we use scalar_refs for the data for speed
00380 sub create_node {
00381     my($self, $key, $value, $length) = @_;
00382     (defined($$key) && defined($$value)) 
00383       || die("need more localized data than $$key and $$value");
00384     
00385     # max_size always defined when max_bytes is
00386     if (($self->{max_size})) {
00387     $length = defined $length ? $length : &_get_data_length($key, $value)
00388     } else {
00389     $length = 0;
00390     }
00391     
00392     # ORDER SPECIFIC, see top for NODE ARRAY STRUCT
00393     my $node = [ $$key, $$value, $length ];
00394 }
00395 
00396 sub _get_data_length {
00397     my($key, $value) = @_;
00398     my $length = 0;
00399     my %refs;
00400 
00401     my @data = ($$key, $$value);
00402     while(my $elem = shift @data) {
00403     next if $refs{$elem};
00404     $refs{$elem} = 1;
00405     if(ref $elem && $elem =~ /(SCALAR|HASH|ARRAY)/) {
00406         my $type = $1;
00407         $length += $REF_SIZE; # guess, 16 bytes per ref, probably more
00408         if (($type eq 'SCALAR')) {
00409         $length += length($$elem);
00410         } elsif (($type eq 'HASH')) {
00411         while (my($k,$v) = each %$elem) {
00412             for my $kv($k,$v) {
00413             if ((ref $kv)) {
00414                 push(@data, $kv);
00415             } else {
00416                 $length += length($kv);
00417             }
00418             }
00419         }
00420         } elsif (($type eq 'ARRAY')) {
00421         for my $val (@$elem){
00422             if ((ref $val)) {
00423             push(@data, $val);
00424             } else {
00425             $length += length($val);
00426             }
00427         }
00428         }
00429     } else {
00430         $length += length($elem);
00431     }
00432     }
00433 
00434     $length;
00435 }
00436 
00437 sub insert {
00438     my($self, $new_node) = @_;
00439     
00440     $new_node->[$AFTER] = 0;
00441     $new_node->[$BEFORE] = $self->{tail};
00442     $self->print("insert() [$new_node->[$KEY], $new_node->[$VALUE]]") if ($self->{dbg} > 1);
00443     
00444     $self->{nodes}{$new_node->[$KEY]} = $new_node;
00445 
00446     # current sizes
00447     $self->{count}++;
00448     $self->{bytes} += $new_node->[$BYTES] + $STRUCT_SIZE;
00449 
00450     if($self->{tail}) {
00451     $self->{tail}[$AFTER] = $new_node;
00452     } else {
00453     $self->{head} = $new_node;
00454     }
00455     $self->{tail} = $new_node;
00456 
00457     ## if we are too big now, remove head
00458     while(($self->{max_count} && ($self->{count} > $self->{max_count})) ||
00459       ($self->{max_bytes} && ($self->{bytes} > $self->{max_bytes}))) 
00460     {
00461     if($self->{dbg} > 1) {
00462         $self->print("current/max: ".
00463              "bytes ($self->{bytes}/$self->{max_bytes}) ".
00464              "count ($self->{count}/$self->{max_count}) "
00465              );
00466     }
00467     my $old_node = $self->delete($self->{head}[$KEY]);
00468     if ($self->{subclass}) {
00469         if($old_node->[$DIRTY]) {
00470         $self->print("dirty write() [$old_node->[$KEY], $old_node->[$VALUE]]") 
00471           if ($self->{dbg} > 1);
00472         $self->write($old_node->[$KEY], $old_node->[$VALUE]);
00473         }
00474     }
00475 #   if($self->{dbg} > 1) {
00476 #       $self->print("after delete - bytes $self->{bytes}; count $self->{count}");
00477 #   }
00478     }
00479     
00480     1;
00481 }
00482 
00483 sub delete {
00484     my($self, $key) = @_;    
00485     my $node = $self->{nodes}{$key} || return;
00486 #    return unless $node;
00487 
00488     $self->print("delete() [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
00489 
00490     my $before = $node->[$BEFORE];
00491     my $after = $node->[$AFTER];
00492 
00493     #    my($before, $after) = $node->{before,after};
00494     if($before) {
00495     ($before->[$AFTER] = $after);
00496     } else {
00497     $self->{head} = $after;
00498     }
00499 
00500     if($after) {
00501     ($after->[$BEFORE] = $before);
00502     } else {
00503     $self->{tail} = $before;
00504     }
00505 
00506     delete $self->{nodes}{$key};
00507     $self->{bytes} -= ($node->[$BYTES] + $STRUCT_SIZE);
00508     $self->{count}--;
00509     
00510     $node;
00511 }
00512 
00513 sub flush {
00514     my $self = shift;
00515 
00516     $self->print("FLUSH CACHE") if ($self->{dbg} > 1);
00517 
00518     my $node = $self->{head};
00519     my $flush_count = 0;
00520     while($node) {
00521     if($node->[$DIRTY]) {
00522         $self->print("flush dirty write() [$node->[$KEY], $node->[$VALUE]]") 
00523           if ($self->{dbg} > 1);
00524         $self->write($node->[$KEY], $node->[$VALUE]);
00525         $node->[$DIRTY] = 0;
00526         $flush_count++;
00527     }
00528     $node = $node->[$AFTER];
00529     }
00530 
00531     $flush_count;
00532 }
00533 
00534 sub print {
00535     my($self, $msg) = @_;
00536     print "$self: $msg\n";
00537 }
00538 
00539 sub pretty_self {
00540     my($self) = @_;
00541     
00542     my(@prints);
00543     for(sort keys %{$self}) { 
00544     next unless defined $self->{$_};
00545     push(@prints, "$_=>$self->{$_}"); 
00546     }
00547 
00548     "{ " . join(", ", @prints) . " }";
00549 }
00550 
00551 sub pretty_chains {
00552     my($self) = @_;
00553     my($str);
00554     my $k = $self->FIRSTKEY();
00555 
00556     $str .= "[head]->";
00557     my($curr_node) = $self->{head};
00558     while($curr_node) {
00559     $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
00560     $curr_node = $curr_node->[$AFTER];
00561     }
00562     $str .= "[tail]->";
00563 
00564     $curr_node = $self->{tail};
00565     while($curr_node) {
00566     $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
00567     $curr_node = $curr_node->[$BEFORE];
00568     }
00569     $str .= "[head]";
00570 
00571     $str;
00572 }
00573 
00574 1;
00575 
00576 __END__
00577 
00578 =head1 INSTALLATION
00579 
00580 Tie::Cache installs easily using the make or nmake commands as
00581 shown below.  Otherwise, just copy Cache.pm to $PERLLIB/site/Tie
00582 
00583     > perl Makefile.PL
00584     > make
00585         > make test 
00586     > make install
00587 
00588         * use nmake for win32
00589         ** you can also just copy Cache.pm to $perllib/Tie
00590 
00591 =head1 BENCMARKS
00592 
00593 There is another simpler LRU cache implementation in CPAN,
00594 Tie::Cache::LRU, which has the same basic size limiting 
00595 functionality, and for this functionality, the exact same 
00596 interface.
00597 
00598 Through healthy competition, Michael G Schwern got 
00599 Tie::Cache::LRU mostly faster than Tie::Cache on reads & writes:
00600 
00601  Cache Size 5000       Tie::Cache 0.17  Tie::Cache::LRU 0.21
00602  10000 Writes             1.55 CPU sec          1.10 CPU sec
00603  40000 Reads              1.82 CPU sec          1.58 CPU sec
00604  10000 Deletes            0.55 CPU sec          0.59 CPU sec
00605 
00606 Unless you are using TRUE CACHE or MaxBytes functionality,
00607 using Tie::Cache::LRU should be an easy replacement for Tie::Cache.
00608 
00609 =head1 TRUE CACHE
00610 
00611 To use class as a true cache, which acts as the sole interface 
00612 for some data set, subclass the real cache off Tie::Cache, 
00613 with @ISA = qw( 'Tie::Cache' ) notation.  Then override
00614 the read() method for behavior when there is a cache miss,
00615 and the write() method for behavior when the cache's data 
00616 changes.
00617 
00618 When WriteSync is 1 or TRUE (DEFAULT), write() is called immediately
00619 when data in the cache is modified.  If set to 0, data that has 
00620 been modified in the cache gets written out when the entries are deleted or
00621 during the DESTROY phase of the cache object, usually at the end of
00622 a script.
00623 
00624 To have the dirty data write() periodically while WriteSync is set to 0,
00625 there is a flush() cache API call that will flush the dirty writes
00626 in this way.  Just call the flush() API like:
00627 
00628   my $write_flush_count = tied(%cache)->flush();
00629 
00630 The flush() API was added in the .17 release thanks to Rob Bloodgood.
00631 
00632 =head1 TRUE CACHE EXAMPLE
00633 
00634  use Tie::Cache;
00635 
00636  # personalize the Tie::Cache object, by inheriting from it
00637  package My::Cache;
00638  @ISA = qw(Tie::Cache);
00639 
00640  # override the read() and write() member functions
00641  # these tell the cache what to do with a cache miss or flush
00642  sub read { 
00643     my($self, $key) = @_; 
00644     print "cache miss for $key, read() data\n";
00645     rand() * $key; 
00646  }
00647  sub write { 
00648     my($self, $key, $value) = @_;
00649     print "flushing [$key, $value] from cache, write() data\n";
00650  }
00651 
00652  my $cache_size   = $ARGV[0] || 2;
00653  my $num_to_cache = $ARGV[1] || 4;   
00654  my $Debug = $ARGV[2] || 1;
00655 
00656  tie %cache, 'My::Cache', $cache_size, {Debug => $Debug};   
00657 
00658  # load the cache with new data, each through its contents,
00659  # and then reload in reverse order.
00660  for(1..$num_to_cache) { print "read data $_: $cache{$_}\n" }
00661  while(my($k, $v) = each %cache) { print "each data $k: $v\n"; }
00662  for(my $i=$num_to_cache; $i>0; $i--) { print "read data $i: $cache{$i}\n"; }
00663 
00664  # flush writes now, trivial use since will happen in DESTROY() anyway
00665  tied(%cache)->flush(); 
00666 
00667  # clear cache in 2 ways, write will flush out to disk
00668  %cache = ();
00669  undef %cache;
00670 
00671 =head1 NOTES
00672 
00673 Many thanks to all those who helped me make this module a reality, 
00674 including:
00675 
00676     :) Tom Hukins who provided me insight and motivation for
00677        finishing this module.
00678     :) Jamie McCarthy, for trying to make Tie::Cache be all
00679        that it can be.
00680     :) Rob Fugina who knows how to "TRULY CACHE".
00681     :) Rob Bloodgood, for the TRUE CACHE flush() API
00682 
00683 =head1 AUTHOR
00684 
00685 Please send any questions or comments to Joshua Chamas
00686 at chamas@alumni.stanford.org
00687 
00688 =head1 COPYRIGHT
00689 
00690 Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc.  
00691 Sponsored by development on NodeWorks http://www.nodeworks.com
00692 
00693 All rights reserved. This program is free software; 
00694 you can redistribute it and/or modify it under the same 
00695 terms as Perl itself. 
00696 
00697 =cut
00698 
00699 
00700 
00701 
00702 
00703