Archive Ensembl HomeArchive Ensembl Home
Cache.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::IdMapping::Cache - a cache to hold data objects used by the 
00024 IdMapping application
00025 
00026 =head1 SYNOPSIS
00027 
00028 =head1 DESCRIPTION
00029 
00030 =head1 METHODS
00031 
00032 =cut
00033 
00034 
00035 package Bio::EnsEMBL::IdMapping::Cache;
00036 
00037 use strict;
00038 use warnings;
00039 no warnings 'uninitialized';
00040 
00041 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00042 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00043 use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
00044 use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
00045 use Bio::EnsEMBL::IdMapping::TinyGene;
00046 use Bio::EnsEMBL::IdMapping::TinyTranscript;
00047 use Bio::EnsEMBL::IdMapping::TinyTranslation;
00048 use Bio::EnsEMBL::IdMapping::TinyExon;
00049 use Bio::EnsEMBL::DBSQL::DBAdaptor;
00050 use Storable qw(nstore retrieve);
00051 use Digest::MD5 qw(md5_hex);
00052 
00053 # define available cache names here
00054 my @cache_names = qw(
00055     exons_by_id
00056     transcripts_by_id
00057     transcripts_by_exon_id
00058     translations_by_id
00059     genes_by_id
00060     genes_by_transcript_id
00061 );
00062 
00063 
00064 =head2 new
00065 
00066   Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
00067   Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
00068   Example     : my $cache = Bio::EnsEMBL::IdMapping::Cache->new(
00069                   -LOGGER => $logger,
00070                   -CONF   => $conf,
00071                 );
00072   Description : constructor
00073   Return type : Bio::EnsEMBL::IdMapping::Cache object
00074   Exceptions  : thrown on wrong or missing arguments
00075   Caller      : general
00076   Status      : At Risk
00077               : under development
00078 
00079 =cut
00080 
00081 sub new {
00082   my $caller = shift;
00083   my $class = ref($caller) || $caller;
00084 
00085   my ($logger, $conf, $load_instance) =
00086     rearrange(['LOGGER', 'CONF', 'LOAD_INSTANCE'], @_);
00087 
00088   unless ($logger->isa('Bio::EnsEMBL::Utils::Logger')) {
00089     throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
00090   }
00091   
00092   unless ($conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
00093     throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
00094   }
00095   
00096   my $self = {};
00097   bless ($self, $class);
00098 
00099   # initialise
00100   $self->logger($logger);
00101   $self->conf($conf);
00102 
00103   if ($load_instance) {
00104     $self->read_instance_from_file;
00105   }
00106   
00107   return $self;
00108 }
00109 
00110 
00111 =head2 build_cache_by_slice
00112 
00113   Arg[1]      : String $dbtype - db type (source|target)
00114   Arg[2]      : String $slice_name - the name of a slice (format as returned by
00115                 Bio::EnsEMBL::Slice->name)
00116   Example     : my ($num_genes, $filesize) = $cache->build_cache_by_slice(
00117                   'source', 'chromosome:NCBI36:X:1:1000000:-1');
00118   Description : Builds a cache of genes, transcripts, translations and exons
00119                 needed by the IdMapping application and serialises the resulting
00120                 cache object to a file, one slice at a time.
00121   Return type : list of the number of genes processed and the size of the
00122                 serialised cache file
00123   Exceptions  : thrown on invalid slice name
00124   Caller      : general
00125   Status      : At Risk
00126               : under development
00127 
00128 =cut
00129 
00130 sub build_cache_by_slice {
00131   my $self       = shift;
00132   my $dbtype     = shift;
00133   my $slice_name = shift;
00134 
00135   # set cache method (required for loading cache later)
00136   $self->cache_method('BY_SEQ_REGION');
00137 
00138   my $dba = $self->get_DBAdaptor($dbtype);
00139   my $sa  = $dba->get_SliceAdaptor;
00140 
00141   my $slice = $sa->fetch_by_name($slice_name);
00142   unless ($slice) {
00143     throw("Could not retrieve slice $slice_name.");
00144   }
00145 
00146   my $genes = $slice->get_all_Genes( undef, undef, 1 );
00147 
00148   # find common coord_system
00149   my $common_cs_found = $self->find_common_coord_systems;
00150 
00151   # find out whether native coord_system is a common coord_system.
00152   # if so, you don't need to project.
00153   # also don't project if no common coord_system present
00154   my $need_project = 1;
00155 
00156   my $csid = join( ':',
00157                    $slice->coord_system_name,
00158                    $slice->coord_system->version );
00159 
00160   if ( $self->is_common_cs($csid) or !$self->highest_common_cs ) {
00161     $need_project = 0;
00162   }
00163 
00164   # build cache
00165   my $type = "$dbtype.$slice_name";
00166   my $num_genes =
00167     $self->build_cache_from_genes( $type, $genes, $need_project );
00168   undef $genes;
00169 
00170   # write cache to file, then flush cache to reclaim memory
00171   my $size = $self->write_all_to_file($type);
00172 
00173   return $num_genes, $size;
00174 } ## end sub build_cache_by_slice
00175 
00176 
00177 =head2 build_cache_all
00178 
00179   Arg[1]      : String $dbtype - db type (source|target)
00180   Example     : my ($num_genes, $filesize) = $cache->build_cache_all('source');
00181   Description : Builds a cache of genes, transcripts, translations and exons
00182                 needed by the IdMapping application and serialises the
00183                 resulting cache object to a file. All genes across the genome
00184                 are processed in one go. This method should be used when
00185                 build_cache_by_seq_region can't be used due to a large number
00186                 of toplevel seq_regions (e.g. 2x genomes).
00187   Return type : list of the number of genes processed and the size of the
00188                 serialised cache file
00189   Exceptions  : thrown on invalid slice name
00190   Caller      : general
00191   Status      : At Risk
00192               : under development
00193 
00194 =cut
00195 
00196 sub build_cache_all {
00197   my $self = shift;
00198   my $dbtype = shift;
00199   
00200   # set cache method (required for loading cache later)
00201   $self->cache_method('ALL');
00202 
00203   my $dba = $self->get_DBAdaptor($dbtype);
00204   my $ga = $dba->get_GeneAdaptor;
00205   
00206   my $genes = $ga->fetch_all;
00207 
00208   # find common coord_system
00209   my $common_cs_found = $self->find_common_coord_systems;
00210 
00211   # Build cache. Setting $need_project to 'CHECK' will cause
00212   # build_cache_from_genes() to check the coordinate system for each
00213   # gene.
00214   my $type         = "$dbtype.ALL";
00215   my $need_project = 'CHECK';
00216   my $num_genes =
00217     $self->build_cache_from_genes( $type, $genes, $need_project );
00218 
00219   undef $genes;
00220 
00221   # write cache to file, then flush cache to reclaim memory
00222   my $size = $self->write_all_to_file($type);
00223 
00224   return $num_genes, $size;
00225 }
00226 
00227 
00228 =head2 build_cache_from_genes 
00229 
00230   Arg[1]      : String $type - cache type
00231   Arg[2]      : Listref of Bio::EnsEMBL::Genes $genes - genes to build cache
00232                 from
00233   Arg[3]      : Boolean $need_project - indicate if we need to project exons to
00234                 common coordinate system
00235   Example     : $cache->build_cache_from_genes(
00236                   'source.chromosome:NCBI36:X:1:100000:1', \@genes);
00237   Description : Builds the cache by fetching transcripts, translations and exons
00238                 for a list of genes from the database, and creating lightweight
00239                 Bio::EnsEMBL::IdMapping::TinyFeature objects containing only the
00240                 data needed by the IdMapping application. These objects are
00241                 attached to a name cache in this cache object. Exons only need
00242                 to be projected to a commond coordinate system if their native
00243                 coordinate system isn't common to source and target assembly
00244                 itself.
00245   Return type : int - number of genes after filtering
00246   Exceptions  : thrown on wrong or missing arguments
00247   Caller      : internal
00248   Status      : At Risk
00249               : under development
00250 
00251 =cut
00252 
00253 sub build_cache_from_genes {
00254   my $self         = shift;
00255   my $type         = shift;
00256   my $genes        = shift;
00257   my $need_project = shift;
00258 
00259   throw("You must provide a type.") unless $type;
00260   throw("You must provide a listref of genes.")
00261     unless ( ref($genes) eq 'ARRAY' );
00262 
00263   # biotype filter
00264   if ( $self->conf->param('biotypes') ) {
00265     $genes = $self->filter_biotypes($genes);
00266   }
00267   my $num_genes = scalar(@$genes);
00268 
00269   # initialise cache for the given type.
00270   $self->{'cache'}->{$type} = {};
00271 
00272   #my $i = 0;
00273   #my $num_genes = scalar(@$genes);
00274   #my $progress_id = $self->logger->init_progress($num_genes);
00275 
00276  # loop over genes sorted by gene location.
00277  # the sort will hopefully improve assembly mapper cache performance and
00278  # therefore speed up exon sequence retrieval
00279   foreach my $gene ( sort { $a->start <=> $b->start } @$genes ) {
00280     #$self->logger->log_progressbar($progress_id, ++$i, 2);
00281     #$self->logger->log_progress($num_genes, ++$i, 20, 3, 1);
00282 
00283     if ( $need_project eq 'CHECK' ) {
00284       # find out whether native coord_system is a common coord_system.
00285       # if so, you don't need to project.
00286       # also don't project if no common coord_system present
00287       if ( $self->highest_common_cs ) {
00288         my $csid = join( ':',
00289                          $gene->slice->coord_system_name,
00290                          $gene->slice->coord_system->version );
00291         if ( $self->is_common_cs($csid) ) {
00292           $need_project = 0;
00293         }
00294       }
00295       else {
00296         $need_project = 0;
00297       }
00298     }
00299 
00300     # create lightweigt gene
00301     my $lgene =
00302       Bio::EnsEMBL::IdMapping::TinyGene->new_fast( [
00303                           $gene->dbID,          $gene->stable_id,
00304                           $gene->version,       $gene->created_date,
00305                           $gene->modified_date, $gene->start,
00306                           $gene->end,           $gene->strand,
00307                           $gene->slice->seq_region_name, $gene->biotype,
00308                           $gene->status, $gene->analysis->logic_name,
00309                           ( $gene->is_known ? 1 : 0 ), ] );
00310 
00311     # build gene caches
00312     $self->add( 'genes_by_id', $type, $gene->dbID, $lgene );
00313 
00314     # transcripts
00315     foreach my $tr ( @{ $gene->get_all_Transcripts } ) {
00316       my $ltr =
00317         Bio::EnsEMBL::IdMapping::TinyTranscript->new_fast( [
00318                                $tr->dbID,          $tr->stable_id,
00319                                $tr->version,       $tr->created_date,
00320                                $tr->modified_date, $tr->start,
00321                                $tr->end,           $tr->strand,
00322                                $tr->length, md5_hex( $tr->spliced_seq ),
00323                                ( $tr->is_known ? 1 : 0 ) ] );
00324 
00325       $ltr->biotype( $tr->biotype() );
00326       $lgene->add_Transcript($ltr);
00327 
00328       # build transcript caches
00329       $self->add( 'transcripts_by_id',      $type, $tr->dbID, $ltr );
00330       $self->add( 'genes_by_transcript_id', $type, $tr->dbID, $lgene );
00331 
00332       # translation (if there is one)
00333       if ( my $tl = $tr->translation ) {
00334         my $ltl =
00335           Bio::EnsEMBL::IdMapping::TinyTranslation->new_fast( [
00336                          $tl->dbID,          $tl->stable_id,
00337                          $tl->version,       $tl->created_date,
00338                          $tl->modified_date, $tr->dbID,
00339                          $tr->translate->seq, ( $tr->is_known ? 1 : 0 ),
00340                        ] );
00341 
00342         $ltr->add_Translation($ltl);
00343 
00344         $self->add( 'translations_by_id', $type, $tl->dbID, $ltl );
00345 
00346         undef $tl;
00347       }
00348 
00349       # exons
00350       foreach my $exon ( @{ $tr->get_all_Exons } ) {
00351         my $lexon =
00352           Bio::EnsEMBL::IdMapping::TinyExon->new_fast( [
00353                          $exon->dbID,
00354                          $exon->stable_id,
00355                          $exon->version,
00356                          $exon->created_date,
00357                          $exon->modified_date,
00358                          $exon->start,
00359                          $exon->end,
00360                          $exon->strand,
00361                          $exon->slice->seq_region_name,
00362                          $exon->slice->coord_system_name,
00363                          $exon->slice->coord_system->version,
00364                          $exon->slice->subseq( $exon->start, $exon->end,
00365                                                $exon->strand ),
00366                          $exon->phase,
00367                          $need_project, ] );
00368 
00369         # get coordinates in common coordinate system if needed
00370         if ($need_project) {
00371           my @seg = @{
00372             $exon->project( $self->highest_common_cs,
00373                             $self->highest_common_cs_version ) };
00374 
00375           if ( scalar(@seg) == 1 ) {
00376             my $sl = $seg[0]->to_Slice;
00377             $lexon->common_start( $sl->start );
00378             $lexon->common_end( $sl->end );
00379             $lexon->common_strand( $sl->strand );
00380             $lexon->common_sr_name( $sl->seq_region_name );
00381           }
00382         }
00383 
00384         $ltr->add_Exon($lexon);
00385 
00386         $self->add( 'exons_by_id', $type, $exon->dbID, $lexon );
00387         $self->add_list( 'transcripts_by_exon_id',
00388                          $type, $exon->dbID, $ltr );
00389 
00390         undef $exon;
00391       } ## end foreach my $exon ( @{ $tr->get_all_Exons...})
00392 
00393       undef $tr;
00394     } ## end foreach my $tr ( @{ $gene->get_all_Transcripts...})
00395 
00396     undef $gene;
00397   } ## end foreach my $gene ( sort { $a...})
00398 
00399   return $num_genes;
00400 } ## end sub build_cache_from_genes
00401 
00402 
00403 =head2 filter_biotypes
00404 
00405   Arg[1]      : Listref of Bio::EnsEMBL::Genes $genes - the genes to filter
00406   Example     : my @filtered = @{ $cache->filter_biotypes(\@genes) };
00407   Description : Filters a list of genes by biotype. Biotypes are taken from the
00408                 IdMapping configuration parameter 'biotypes'.
00409   Return type : Listref of Bio::EnsEMBL::Genes (or empty list)
00410   Exceptions  : none
00411   Caller      : internal
00412   Status      : At Risk
00413               : under development
00414 
00415 =cut
00416 
00417 sub filter_biotypes {
00418   my $self = shift;
00419   my $genes = shift;
00420 
00421   my $filtered = [];
00422 
00423   foreach my $biotype ($self->conf->param('biotypes')) {
00424     push @$filtered, grep { $_->biotype eq $biotype } @$genes;
00425   }
00426 
00427   return $filtered;
00428 }
00429 
00430 
00431 =head2 add
00432 
00433   Arg[1]      : String $name - a cache name (e.g. 'genes_by_id')
00434   Arg[2]      : String type - a cache type (e.g. "source.$slice_name")
00435   Arg[3]      : String $key - key of this entry (e.g. a gene dbID)
00436   Arg[4]      : Bio::EnsEMBL::IdMappping::TinyFeature $val - value to cache
00437   Example     : $cache->add('genes_by_id',
00438                   'source.chromosome:NCBI36:X:1:1000000:1', '1234', $tiny_gene);
00439   Description : Adds a TinyFeature object to a named cache.
00440   Return type : Bio::EnsEMBL::IdMapping::TinyFeature
00441   Exceptions  : thrown on wrong or missing arguments
00442   Caller      : internal
00443   Status      : At Risk
00444               : under development
00445 
00446 =cut
00447 
00448 sub add {
00449   my $self = shift;
00450   my $name = shift;
00451   my $type = shift;
00452   my $key = shift;
00453   my $val = shift;
00454 
00455   throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
00456   throw("You must provide a cache type.") unless $type;
00457   throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
00458 
00459   $self->{'cache'}->{$type}->{$name}->{$key} = $val;
00460 
00461   return $self->{'cache'}->{$type}->{$name}->{$key};
00462 }
00463 
00464 =head2 add_list
00465 
00466   Arg[1]      : String $name - a cache name (e.g. 'genes_by_id')
00467   Arg[2]      : String type - a cache type (e.g. "source.$slice_name")
00468   Arg[3]      : String $key - key of this entry (e.g. a gene dbID)
00469   Arg[4]      : List of Bio::EnsEMBL::IdMappping::TinyFeature @val - values
00470                 to cache
00471   Example     : $cache->add_list('transcripts_by_exon_id',
00472                   'source.chromosome:NCBI36:X:1:1000000:1', '1234',
00473                   $tiny_transcript1, $tiny_transcript2);
00474   Description : Adds a list of TinyFeature objects to a named cache.
00475   Return type : Listref of Bio::EnsEMBL::IdMapping::TinyFeature objects
00476   Exceptions  : thrown on wrong or missing arguments
00477   Caller      : internal
00478   Status      : At Risk
00479               : under development
00480 
00481 =cut
00482 
00483 sub add_list {
00484   my $self = shift;
00485   my $name = shift;
00486   my $type = shift;
00487   my $key = shift;
00488   my @vals = @_;
00489 
00490   throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
00491   throw("You must provide a cache type.") unless $type;
00492   throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
00493 
00494   push @{ $self->{'cache'}->{$type}->{$name}->{$key} }, @vals;
00495 
00496   return $self->{'cache'}->{$type}->{$name}->{$key};
00497 }
00498 
00499 sub get_by_key {
00500   my $self = shift;
00501   my $name = shift;
00502   my $type = shift;
00503   my $key = shift;
00504 
00505   throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
00506   throw("You must provide a cache type.") unless $type;
00507   throw("You must provide a cache key (e.g. a gene dbID).") unless $key;
00508 
00509   # transparently load cache from file unless already loaded
00510   unless ($self->{'instance'}->{'loaded'}->{"$type"}) {
00511     $self->read_and_merge($type);
00512   }
00513 
00514   return $self->{'cache'}->{$type}->{$name}->{$key};
00515 }
00516 
00517 sub get_by_name {
00518   my $self = shift;
00519   my $name = shift;
00520   my $type = shift;
00521 
00522   throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
00523   throw("You must provide a cache type.") unless $type;
00524   
00525   # transparently load cache from file unless already loaded
00526   unless ($self->{'instance'}->{'loaded'}->{$type}) {
00527     $self->read_and_merge($type);
00528   }
00529 
00530   return $self->{'cache'}->{$type}->{$name} || {};
00531 }
00532 
00533 
00534 sub get_count_by_name {
00535   my $self = shift;
00536   my $name = shift;
00537   my $type = shift;
00538 
00539   throw("You must provide a cache name (e.g. genes_by_id.") unless $name;
00540   throw("You must provide a cache type.") unless $type;
00541   
00542   # transparently load cache from file unless already loaded
00543   unless ($self->{'instance'}->{'loaded'}->{$type}) {
00544     $self->read_and_merge($type);
00545   }
00546 
00547   return scalar(keys %{ $self->get_by_name($name, $type) });
00548 }
00549 
00550 
00551 sub find_common_coord_systems {
00552   my $self = shift;
00553 
00554   # get adaptors for source db
00555   my $s_dba = $self->get_DBAdaptor('source');
00556   my $s_csa = $s_dba->get_CoordSystemAdaptor;
00557   my $s_sa  = $s_dba->get_SliceAdaptor;
00558 
00559   # get adaptors for target db
00560   my $t_dba = $self->get_DBAdaptor('target');
00561   my $t_csa = $t_dba->get_CoordSystemAdaptor;
00562   my $t_sa  = $t_dba->get_SliceAdaptor;
00563 
00564   # find common coord_systems
00565   my @s_coord_systems = @{ $s_csa->fetch_all };
00566   my @t_coord_systems = @{ $t_csa->fetch_all };
00567   my $found_highest   = 0;
00568 
00569 SOURCE:
00570   foreach my $s_cs (@s_coord_systems) {
00571     if ( !$s_cs->is_default() ) { next SOURCE }
00572 
00573   TARGET:
00574     foreach my $t_cs (@t_coord_systems) {
00575       if ( !$t_cs->is_default() ) { next TARGET }
00576 
00577       if ( $s_cs->name eq $t_cs->name ) {
00578 
00579         # test for identical coord_system version
00580         if ( $s_cs->version and ( $s_cs->version ne $t_cs->version ) ) {
00581           next TARGET;
00582         }
00583 
00584         # test for at least 50% identical seq_regions
00585         if ( $self->seq_regions_compatible( $s_cs, $s_sa, $t_sa ) ) {
00586           $self->add_common_cs($s_cs);
00587 
00588           unless ($found_highest) {
00589             $self->highest_common_cs( $s_cs->name );
00590             $self->highest_common_cs_version( $s_cs->version );
00591           }
00592 
00593           $found_highest = 1;
00594 
00595           next SOURCE;
00596         }
00597       }
00598     } ## end foreach my $t_cs (@t_coord_systems)
00599   } ## end foreach my $s_cs (@s_coord_systems)
00600 
00601   return $found_highest;
00602 } ## end sub find_common_coord_systems
00603 
00604 
00605 sub seq_regions_compatible {
00606   my $self = shift;
00607   my $cs = shift;
00608   my $s_sa = shift;
00609   my $t_sa = shift;
00610 
00611   unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
00612     throw('You must provide a CoordSystem');
00613   }
00614 
00615   unless ($s_sa and $t_sa and $s_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')
00616           and $t_sa->isa('Bio::EnsEMBL::DBSQL::SliceAdaptor')) {
00617     throw('You must provide a source and target SliceAdaptor');
00618   }
00619 
00620   my %sr_match;
00621   my $equal = 0;
00622 
00623   my $s_seq_regions = $s_sa->fetch_all($cs->name, $cs->version);
00624   my $t_seq_regions = $t_sa->fetch_all($cs->name, $cs->version);
00625   
00626   # sanity check to prevent divison by zero
00627   my $s_count = scalar(@$s_seq_regions);
00628   my $t_count = scalar(@$t_seq_regions);
00629   return(0) if ($s_count == 0 or $t_count == 0);
00630   
00631   foreach my $s_sr (@$s_seq_regions) {
00632     $sr_match{$s_sr->seq_region_name} = $s_sr->length;
00633   }
00634 
00635   foreach my $t_sr (@$t_seq_regions) {
00636     if (exists($sr_match{$t_sr->seq_region_name})) {
00637       $equal++;
00638 
00639       # return false if we have a region with same name but different length
00640       return(0) unless ($sr_match{$t_sr->seq_region_name} == $t_sr->length);
00641     }
00642   }
00643 
00644   if ($equal/$s_count > 0.5 and $equal/$t_count > 0.5) {
00645     return(1);
00646   } else {
00647     $self->logger->info("Only $equal seq_regions identical for ".$cs->name." ".$cs->version."\n");
00648     return(0);
00649   }
00650   
00651 }
00652 
00653 
00654 sub check_db_connection {
00655   my $self = shift;
00656   my $dbtype = shift;
00657   
00658   my $err = 0;
00659   
00660   eval {
00661     my $dba = $self->get_DBAdaptor($dbtype);
00662     $dba->dbc->connect;
00663   };
00664   
00665   if ($@) {
00666     $self->logger->warning("Can't connect to $dbtype db: $@\n");
00667     $err++;
00668   } else {
00669     $self->logger->debug("Connection to $dbtype db ok.\n");
00670     $self->{'_db_conn_ok'}->{$dbtype} = 1;
00671   }
00672 
00673   return $err;
00674 }
00675 
00676   
00677 sub check_db_read_permissions {
00678   my $self = shift;
00679   my $dbtype = shift;
00680 
00681   # skip this check if db connection failed (this prevents re-throwing
00682   # exceptions).
00683   return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
00684   
00685   my $err = 0;
00686   my %privs = %{ $self->get_db_privs($dbtype) };
00687   
00688   unless ($privs{'SELECT'} or $privs{'ALL PRIVILEGES'}) {
00689     $self->logger->warning("User doesn't have read permission on $dbtype db.\n");
00690     $err++;
00691   } else {
00692     $self->logger->debug("Read permission on $dbtype db ok.\n");
00693   }
00694 
00695   return $err;
00696 }
00697 
00698   
00699 sub check_db_write_permissions {
00700   my $self = shift;
00701   my $dbtype = shift;
00702   
00703   # skip this check if db connection failed (this prevents re-throwing
00704   # exceptions).
00705   return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
00706   
00707   my $err = 0;
00708 
00709   unless ($self->do_upload) {
00710     $self->logger->debug("No uploads, so write permission on $dbtype db not required.\n");
00711     return $err;
00712   }
00713 
00714   my %privs = %{ $self->get_db_privs($dbtype) };
00715 
00716   unless ($privs{'INSERT'} or $privs{'ALL PRIVILEGES'}) {
00717     $self->logger->warning("User doesn't have write permission on $dbtype db.\n");
00718     $err++;
00719   } else {
00720     $self->logger->debug("Write permission on $dbtype db ok.\n");
00721   }
00722 
00723   return $err;
00724 }
00725 
00726 
00727 sub do_upload {
00728   my $self = shift;
00729 
00730   if ($self->conf->param('dry_run') or
00731     ! ($self->conf->param('upload_events') or
00732        $self->conf->param('upload_stable_ids') or
00733        $self->conf->param('upload_archive'))) {
00734     return 0;
00735   } else {
00736     return 1;
00737   }
00738 }   
00739 
00740 
00741 sub get_db_privs {
00742   my ( $self, $dbtype ) = @_;
00743 
00744   my %privs = ();
00745   my $rs;
00746 
00747   # get privileges from mysql db
00748   eval {
00749     my $dbc = $self->get_DBAdaptor($dbtype)->dbc();
00750     my $sql = qq(SHOW GRANTS FOR ) . $dbc->username();
00751     my $sth = $dbc->prepare($sql);
00752     $sth->execute();
00753     $rs = $sth->fetchall_arrayref();
00754     #$sth->finish();
00755   };
00756 
00757   if ($@) {
00758     $self->logger->warning(
00759       "Error obtaining privileges from $dbtype db: $@\n");
00760     return {};
00761   }
00762 
00763   # parse the output
00764   foreach my $r ( map { $_->[0] } @{$rs} ) {
00765     $r =~ s/GRANT (.*) ON .*/$1/i;
00766     foreach my $p ( split( ',', $r ) ) {
00767       # trim leading and trailing whitespace
00768       $p =~ s/^\s+//;
00769       $p =~ s/\s+$//;
00770       $privs{ uc($p) } = 1;
00771     }
00772   }
00773 
00774   return \%privs;
00775 } ## end sub get_db_privs
00776 
00777 
00778 sub check_empty_tables {
00779   my $self = shift;
00780   my $dbtype = shift;
00781   
00782   # skip this check if db connection failed (this prevents re-throwing
00783   # exceptions).
00784   return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
00785   
00786   my $err = 0;
00787   my $c = 0;
00788 
00789   if ($self->conf->param('no_check_empty_tables') or !$self->do_upload) {
00790     $self->logger->debug("Won't check for empty stable ID and archive tables in $dbtype db.\n");
00791     return $err;
00792   }
00793 
00794   eval {
00795     my @tables =
00796       qw(
00797       gene_stable_id
00798       transcript_stable_id
00799       translation_stable_id
00800       exon_stable_id
00801       stable_id_event
00802       mapping_session
00803       gene_archive
00804       peptide_archive
00805     );
00806 
00807     my $dba = $self->get_DBAdaptor($dbtype);
00808     foreach my $table (@tables) {
00809       if ( $table =~ /^([^_]+)_stable_id/ ) {
00810         $table = $1;
00811         if ( $c =
00812              $self->fetch_value_from_db(
00813                $dba,
00814                "SELECT COUNT(*) FROM $table WHERE stable_id IS NOT NULL"
00815              ) )
00816         {
00817           $self->logger->warning(
00818                         "$table table in $dbtype db has $c stable IDs.\n");
00819           $err++;
00820         }
00821       }
00822       else {
00823         if ( $c =
00824              $self->fetch_value_from_db(
00825                                      $dba, "SELECT COUNT(*) FROM $table"
00826              ) )
00827         {
00828           $self->logger->warning(
00829                         "$table table in $dbtype db has $c entries.\n");
00830           $err++;
00831         }
00832       }
00833     } ## end foreach my $table (@tables)
00834   };
00835 
00836   if ($@) {
00837     $self->logger->warning(
00838 "Error retrieving stable ID and archive table row counts from $dbtype db: $@\n"
00839     );
00840     $err++;
00841   }
00842   elsif ( !$err ) {
00843     $self->logger->debug(
00844          "All stable ID and archive tables in $dbtype db are empty.\n");
00845   }
00846   return $err;
00847 }
00848 
00849 
00850 sub check_sequence {
00851   my ( $self, $dbtype ) = @_;
00852 
00853   # skip this check if db connection failed (this prevents re-throwing
00854   # exceptions).
00855   return 1 unless ( $self->{'_db_conn_ok'}->{$dbtype} );
00856 
00857   my $err = 0;
00858   my $c   = 0;
00859 
00860   eval {
00861     my $dba = $self->get_DBAdaptor($dbtype);
00862     unless ( $c =
00863              $self->fetch_value_from_db(
00864                                $dba->dnadb(), "SELECT COUNT(*) FROM dna"
00865              ) )
00866     {
00867       $err++;
00868     }
00869   };
00870 
00871   if ($@) {
00872     $self->logger->warning(   "Error retrieving dna table row count "
00873                             . "from $dbtype database: $@\n" );
00874     $err++;
00875   } elsif ($err) {
00876     $self->logger->warning("No sequence found in $dbtype database.\n");
00877   } else {
00878     $self->logger->debug(
00879                 ucfirst($dbtype) . " db has sequence ($c entries).\n" );
00880   }
00881 
00882   return $err;
00883 } ## end sub check_sequence
00884 
00885 
00886 sub check_meta_entries {
00887   my $self = shift;
00888   my $dbtype = shift;
00889   
00890   # skip this check if db connection failed (this prevents re-throwing
00891   # exceptions).
00892   return 1 unless ($self->{'_db_conn_ok'}->{$dbtype});
00893   
00894   my $err = 0;
00895   my $assembly_default;
00896   my $schema_version;
00897   
00898   eval {
00899     my $dba = $self->get_DBAdaptor($dbtype);
00900     $assembly_default = $self->fetch_value_from_db($dba,
00901       qq(SELECT meta_value FROM meta WHERE meta_key = 'assembly.default'));
00902     $schema_version = $self->fetch_value_from_db($dba,
00903       qq(SELECT meta_value FROM meta WHERE meta_key = 'schema_version'));
00904   };
00905   
00906   if ($@) {
00907     $self->logger->warning("Error retrieving dna table row count from $dbtype db: $@\n");
00908     return ++$err;
00909   }
00910   
00911   unless ($assembly_default) {
00912     $self->logger->warning("No meta.assembly.default value found in $dbtype db.\n");
00913     $err++;
00914   } else {
00915     $self->logger->debug("meta.assembly.default value found ($assembly_default).\n");
00916   }
00917 
00918   unless ($schema_version) {
00919     $self->logger->warning("No meta.schema_version value found in $dbtype db.\n");
00920     $err++;
00921   } else {
00922     $self->logger->debug("meta.schema_version value found ($schema_version).\n");
00923   }
00924 
00925   return $err;
00926 }
00927 
00928 
00929 sub fetch_value_from_db {
00930   my ( $self, $dba, $sql ) = @_;
00931 
00932   assert_ref( $dba, 'Bio::EnsEMBL::DBSQL::DBAdaptor' );
00933 
00934   if ( !defined($sql) ) {
00935     throw("Need an SQL statement to execute.\n");
00936   }
00937 
00938   my $sth = $dba->dbc->prepare($sql);
00939   $sth->execute();
00940 
00941   my ($c) = $sth->fetchrow_array;
00942   return $c;
00943 }
00944 
00945 sub get_DBAdaptor {
00946   my ( $self, $prefix ) = @_;
00947 
00948   unless ( $self->{'_dba'}->{$prefix} ) {
00949     # connect to database
00950     my $dba =
00951       new Bio::EnsEMBL::DBSQL::DBAdaptor(
00952                        -host   => $self->conf->param("${prefix}host"),
00953                        -port   => $self->conf->param("${prefix}port"),
00954                        -user   => $self->conf->param("${prefix}user"),
00955                        -pass   => $self->conf->param("${prefix}pass"),
00956                        -dbname => $self->conf->param("${prefix}dbname"),
00957                        -group  => $prefix, );
00958 
00959     if ( !defined( $self->conf->param("${prefix}host_dna") ) ) {
00960       # explicitely set the dnadb to itself - by default the Registry
00961       # assumes a group 'core' for this now
00962       $dba->dnadb($dba);
00963     } else {
00964       my $dna_dba =
00965         new Bio::EnsEMBL::DBSQL::DBAdaptor(
00966                    -host   => $self->conf->param("${prefix}host_dna"),
00967                    -port   => $self->conf->param("${prefix}port_dna"),
00968                    -user   => $self->conf->param("${prefix}user_dna"),
00969                    -pass   => $self->conf->param("${prefix}pass_dna"),
00970                    -dbname => $self->conf->param("${prefix}dbname_dna"),
00971                    -group  => $prefix, );
00972       $dba->dnadb($dna_dba);
00973     }
00974 
00975     $self->{'_dba'}->{$prefix} = $dba;
00976   } ## end unless ( $self->{'_dba'}->...)
00977 
00978   return $self->{'_dba'}->{$prefix};
00979 } ## end sub get_DBAdaptor
00980 
00981 
00982 sub cache_file_exists {
00983   my $self = shift;
00984   my $type = shift;
00985 
00986   throw("You must provide a cache type.") unless $type;
00987 
00988   my $cache_file = $self->cache_file($type);
00989 
00990   if (-e $cache_file) {
00991     $self->logger->info("Cache file found for $type.\n", 2);
00992     $self->logger->debug("Will read from $cache_file.\n", 2);
00993     return 1;
00994   } else {
00995     $self->logger->info("No cache file found for $type.\n", 2);
00996     $self->logger->info("Will build cache from db.\n", 2);
00997     return 0;
00998   }
00999 }
01000 
01001 
01002 sub cache_file {
01003   my $self = shift;
01004   my $type = shift;
01005 
01006   throw("You must provide a cache type.") unless $type;
01007 
01008   return $self->dump_path."/$type.object_cache.ser";
01009 }
01010 
01011 
01012 sub instance_file {
01013   my $self = shift;
01014 
01015   return $self->dump_path."/cache_instance.ser";
01016 }
01017 
01018 
01019 sub dump_path {
01020   my $self = shift;
01021 
01022   $self->{'dump_path'} ||= path_append($self->conf->param('basedir'), 'cache');
01023 
01024   return $self->{'dump_path'};
01025 }
01026 
01027 
01028 sub write_all_to_file {
01029   my $self = shift;
01030   my $type = shift;
01031 
01032   throw("You must provide a cache type.") unless $type;
01033 
01034   my $size = 0;
01035   $size += $self->write_to_file($type);
01036   $size += $self->write_instance_to_file;
01037 
01038   return parse_bytes($size);
01039 }
01040 
01041 
01042 sub write_to_file {
01043   my $self = shift;
01044   my $type = shift;
01045 
01046   throw("You must provide a cache type.") unless $type;
01047 
01048   unless ($self->{'cache'}->{$type}) {
01049     $self->logger->warning("No features found in $type. Won't write cache file.\n");
01050     return;
01051   }
01052 
01053   my $cache_file = $self->cache_file($type);
01054 
01055   eval { nstore($self->{'cache'}->{$type}, $cache_file) };
01056   if ($@) {
01057     throw("Unable to store $cache_file: $@\n");
01058   }
01059 
01060   my $size = -s $cache_file;
01061   return $size;
01062 }
01063 
01064 
01065 sub write_instance_to_file {
01066   my $self = shift;
01067 
01068   my $instance_file = $self->instance_file;
01069 
01070   eval { nstore($self->{'instance'}, $instance_file) };
01071   if ($@) {
01072     throw("Unable to store $instance_file: $@\n");
01073   }
01074 
01075   my $size = -s $instance_file;
01076   return $size;
01077 }
01078 
01079 
01080 sub read_from_file {
01081   my $self = shift;
01082   my $type = shift;
01083 
01084   throw("You must provide a cache type.") unless $type;
01085 
01086   my $cache_file = $self->cache_file($type);
01087 
01088   if (-s $cache_file) {
01089     
01090     #$self->logger->info("Reading cache from file...\n", 0, 'stamped');
01091     #$self->logger->info("Cache file $cache_file.\n", 1);
01092     eval { $self->{'cache'}->{$type} = retrieve($cache_file); };
01093     if ($@) {
01094       throw("Unable to retrieve cache: $@");
01095     }
01096     #$self->logger->info("Done.\n", 0, 'stamped');
01097 
01098   } else {
01099     $self->logger->warning("Cache file $cache_file not found or empty.\n");
01100   }
01101 
01102 
01103   return $self->{'cache'}->{$type};
01104 }
01105 
01106 
01107 sub read_and_merge {
01108   my $self = shift;
01109   my $dbtype = shift;
01110 
01111   unless ($dbtype eq 'source' or $dbtype eq 'target') {
01112     throw("Db type must be 'source' or 'target'.");
01113   }
01114 
01115   # read cache from single or multiple files, depending on caching strategy
01116   my $cache_method = $self->cache_method;
01117   if ($cache_method eq 'ALL') {
01118     $self->read_from_file("$dbtype.ALL");
01119   } elsif ($cache_method eq 'BY_SEQ_REGION') {
01120     foreach my $slice_name (@{ $self->slice_names($dbtype) }) {
01121       $self->read_from_file("$dbtype.$slice_name");
01122     }
01123   } else {
01124     throw("Unknown cache method: $cache_method.");
01125   }
01126 
01127   $self->merge($dbtype);
01128 
01129   # flag as being loaded
01130   $self->{'instance'}->{'loaded'}->{$dbtype} = 1;
01131 }
01132 
01133 
01134 sub merge {
01135   my $self = shift;
01136   my $dbtype = shift;
01137 
01138   unless ($dbtype eq 'source' or $dbtype eq 'target') {
01139     throw("Db type must be 'source' or 'target'.");
01140   }
01141 
01142   foreach my $type (keys %{ $self->{'cache'} || {} }) {
01143     next unless ($type =~ /^$dbtype/);
01144 
01145     foreach my $name (keys %{ $self->{'cache'}->{$type} || {} }) {
01146     
01147       foreach my $key (keys %{ $self->{'cache'}->{$type}->{$name} || {} }) {
01148         if (defined $self->{'cache'}->{$dbtype}->{$name}->{$key}) {
01149           # warning("Duplicate key in cache: $name|$dbtype|$key. Skipping.\n");
01150         } else {
01151           $self->{'cache'}->{$dbtype}->{$name}->{$key} =
01152             $self->{'cache'}->{$type}->{$name}->{$key};
01153         }
01154 
01155         delete $self->{'cache'}->{$type}->{$name}->{$key};
01156       }
01157       
01158       delete $self->{'cache'}->{$type}->{$name};
01159     }
01160     
01161     delete $self->{'cache'}->{$type};
01162 
01163   }
01164 }
01165 
01166 
01167 sub read_instance_from_file {
01168   my $self = shift;
01169 
01170   my $instance_file = $self->instance_file;
01171 
01172   unless (-s $instance_file) {
01173     throw("No valid cache instance file found at $instance_file.");
01174   }
01175 
01176   eval { $self->{'instance'} = retrieve($instance_file); };
01177   if ($@) {
01178     throw("Unable to retrieve cache instance: $@");
01179   }
01180 
01181   return $self->{'instance'};
01182 }
01183 
01184 
01185 sub slice_names {
01186   my $self   = shift;
01187   my $dbtype = shift;
01188 
01189   throw("You must provide a db type (source|target).") unless $dbtype;
01190 
01191   my $dba = $self->get_DBAdaptor($dbtype);
01192   my $sa  = $dba->get_SliceAdaptor;
01193 
01194   my @slice_names = ();
01195 
01196   if ( $self->conf->param('chromosomes') ) {
01197     # Fetch the specified chromosomes.
01198     foreach my $chr ( $self->conf->param('chromosomes') ) {
01199       my $slice = $sa->fetch_by_region( 'chromosome', $chr );
01200       push @slice_names, $slice->name;
01201     }
01202 
01203   }
01204   elsif ( $self->conf->param('region') ) {
01205     # Fetch the slices on the specified regions.  Don't use
01206     # SliceAdaptor->fetch_by_name() since this will fail if assembly
01207     # versions are different for source and target db.
01208     my ( $cs, $version, $name, $start, $end, $strand ) =
01209       split( /:/, $self->conf->param('region') );
01210 
01211     my $slice = $sa->fetch_by_region( $cs, $name, $start, $end );
01212 
01213     push @slice_names, $slice->name;
01214 
01215   }
01216   else {
01217     # Fetch all slices that have genes on them.
01218     my $ga = $dba->get_GeneAdaptor;
01219 
01220     foreach my $srid ( @{ $ga->list_seq_region_ids } ) {
01221       my $slice = $sa->fetch_by_seq_region_id($srid);
01222 
01223       if ( !$slice->is_reference() ) {
01224         my $slices =
01225           $slice->adaptor()
01226           ->fetch_by_region_unique( $slice->coord_system_name(),
01227                                     $slice->seq_region_name() );
01228 
01229         push( @slice_names, map { $_->name() } @{$slices} );
01230       }
01231       else {
01232         push @slice_names, $slice->name();
01233       }
01234     }
01235   }
01236 
01237   return \@slice_names;
01238 } ## end sub slice_names
01239 
01240 
01241 =head2 logger
01242 
01243   Arg[1]      : 
01244   Example     : 
01245   Description : 
01246   Return type : 
01247   Exceptions  : 
01248   Caller      : 
01249   Status      :
01250 
01251 =cut
01252 
01253 sub logger {
01254   my $self = shift;
01255   $self->{'logger'} = shift if (@_);
01256   return $self->{'logger'};
01257 }
01258 
01259 
01260 =head2 conf
01261 
01262   Arg[1]      : 
01263   Example     : 
01264   Description : 
01265   Return type : 
01266   Exceptions  : 
01267   Caller      : 
01268   Status      :
01269 
01270 =cut
01271 
01272 sub conf {
01273   my $self = shift;
01274   $self->{'conf'} = shift if (@_);
01275   return $self->{'conf'};
01276 }
01277 
01278 
01279 sub cache_method {
01280   my $self = shift;
01281   $self->{'instance'}->{'cache_method'} = shift if (@_);
01282   return $self->{'instance'}->{'cache_method'};
01283 }
01284 
01285 
01286 sub highest_common_cs {
01287   my $self = shift;
01288   $self->{'instance'}->{'hccs'} = shift if (@_);
01289   return $self->{'instance'}->{'hccs'};
01290 }
01291 
01292 
01293 sub highest_common_cs_version {
01294   my $self = shift;
01295   $self->{'instance'}->{'hccsv'} = shift if (@_);
01296   return $self->{'instance'}->{'hccsv'};
01297 }
01298 
01299 
01300 sub add_common_cs {
01301   my $self = shift;
01302   my $cs = shift;
01303 
01304   unless ($cs and $cs->isa('Bio::EnsEMBL::CoordSystem')) {
01305     throw('You must provide a CoordSystem');
01306   }
01307 
01308   my $csid = join(':', $cs->name, $cs->version);
01309 
01310   $self->{'instance'}->{'ccs'}->{$csid} = 1;
01311 }
01312 
01313 
01314 sub is_common_cs {
01315   my $self = shift;
01316   my $csid = shift;
01317 
01318   return $self->{'instance'}->{'ccs'}->{$csid};
01319 }
01320 
01321 
01322 1;
01323