Archive Ensembl Home
ScoreBuilder.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::IdMapping::ScoreBuilder - score builder base class
00024 
00025 =head1 SYNOPSIS
00026 
00027 This class is not instantiated. Please see subclasses for usage examples
00028 (e.g.  GeneScoreBuilder).
00029 
00030 =head1 DESCRIPTION
00031 
00032 This is the base class for the score builders used in the stable Id
00033 mapping application. It contains methods which are used by more than one
00034 ScoreBuilder.
00035 
00036 =head1 METHODS
00037 
00038   create_shrinked_matrix
00039   internal_id_rescore
00040   log_matrix_stats
00041 
00042 =cut
00043 
00044 package Bio::EnsEMBL::IdMapping::ScoreBuilder;
00045 
00046 use strict;
00047 use warnings;
00048 no warnings 'uninitialized';
00049 
00050 use Bio::EnsEMBL::IdMapping::BaseObject;
00051 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
00052 
00053 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00054 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
00055 use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
00056 
00057 
=head2 create_shrinked_matrix

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - the
                scoring matrix to reduce
  Arg[2]      : Bio::EnsEMBL::IdMapping::MappingList $mappings - mappings whose
                sources and targets are to be left out of the result
  Arg[3]      : String $cache_file - base name of the cache file for the
                returned matrix (the extension '.ser' is appended here)
  Example     : my $new_scores = $score_builder->create_shrinked_matrix(
                  $gene_scores, $mappings, "gene_matrix1");
  Description : Builds a new scoring matrix below the 'matrix' subdirectory of
                the configured 'basedir'. If that matrix reports itself as
                loaded right after construction, it is returned unchanged.
                Otherwise it receives every entry of $matrix whose source does
                not appear as a source in $mappings and whose target does not
                appear as a target in $mappings. Source, target and entry
                counts before and after, plus the number of mappings, are
                logged at info level in both cases.
  Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
  Exceptions  : thrown on wrong or missing arguments
  Caller      : InternalIdMapper plugin
  Status      : At Risk
              : under development

=cut

# TODO: shrinked = shrunken?
sub create_shrinked_matrix {
  my ($self, $matrix, $mappings, $cache_file) = @_;

  # validate the arguments in order: matrix, mappings, cache file name
  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.')
    unless $matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');

  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.')
    unless $mappings
      and $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList');

  if (!$cache_file) {
    throw('Need a cache file name.');
  }

  my $matrix_dir = path_append($self->conf->param('basedir'), 'matrix');
  $cache_file = $cache_file . '.ser';

  # -AUTO_LOAD is requested; presumably this reads a previously saved matrix
  # from the cache file if there is one (confirm in ScoredMappingMatrix)
  my $reduced = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
    -DUMP_PATH   => $matrix_dir,
    -CACHE_FILE  => $cache_file,
    -AUTO_LOAD   => 1,
  );

  if ($reduced->loaded) {

    # a loaded matrix is used as is, nothing gets recomputed
    $self->logger->info("Read existing scoring matrix from $cache_file.\n");

  }
  else {

    # remember every source and every target that is already mapped
    my (%mapped_source, %mapped_target);

    for my $mapped (@{ $mappings->get_all_Entries }) {
      $mapped_source{$mapped->source} = 1;
      $mapped_target{$mapped->target} = 1;
    }

    # keep only entries whose source and target are both still unmapped
    for my $candidate (@{ $matrix->get_all_Entries }) {
      next if $mapped_source{$candidate->source};
      next if $mapped_target{$candidate->target};
      $reduced->add_Entry($candidate);
    }

  }

  # log before --> after counts for sources, targets and entries
  for my $stat (
    [ 'Sources ', 'get_source_count' ],
    [ 'Targets ', 'get_target_count' ],
    [ 'Entries ', 'get_entry_count'  ],
  ) {
    my ($label, $counter) = @{$stat};
    $self->logger->info(
      $label . $matrix->$counter() . ' --> ' . $reduced->$counter() . "\n");
  }

  $self->logger->info('New mappings: ' . $mappings->get_entry_count . "\n\n");

  return $reduced;
}
00143 
00144 
=head2 internal_id_rescore

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
                matrix
  Example     : $score_builder->internal_id_rescore($gene_scores);
  Description : Rescore ambiguous mappings based on internal Ids. This is the
                last disambiguation step and is only useful if objects with the
                same internal Id were used in source and target dbs (e.g. in
                patch builds or if objects were copied from source to target).

                A source is treated as ambiguous when its two best-scoring
                mappings share the same score. If one of the best-scoring
                mappings of such a source goes to the target with the same
                internal Id, then all *other* best-scoring mappings of that
                source are penalised: their score is multiplied by 0.9.

                The matrix is modified in place through set_score(). The
                number of penalised entries is logged at debug level.
  Return type : none
  Exceptions  : thrown on wrong or missing argument
  Caller      : InternalIdMapper plugins
  Status      : At Risk
              : under development

=cut

sub internal_id_rescore {
  my $self   = shift;
  my $matrix = shift;

  unless ($matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
  {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # number of entries that were penalised
  my $i = 0;

  foreach my $source ( @{ $matrix->get_all_sources } ) {

    # Order the mappings by score, best first, so that $entries[0] really
    # holds the best score. The sort key has to be the score itself, not
    # the Entry object.
    my @entries =
      sort { $b->score <=> $a->score }
      @{ $matrix->get_Entries_for_source($source) };

    # nothing to do unless there are at least two mappings to choose from
    if ( scalar(@entries) < 2 ) { next }

    # Remember the best score before anything is rescored, so the
    # comparisons below do not depend on whether set_score() also changes
    # the Entry objects held in @entries.
    my $best_score = $entries[0]->score;

    # only penalise if mappings are ambiguous
    if ( $best_score != $entries[1]->score ) { next }

    # only penalise if one source id == target id where score == best
    # score
    my $ambiguous = 0;

    foreach my $e (@entries) {
      if ( $e->target == $source and $e->score == $best_score ) {
        $ambiguous = 1;
        last;
      }
    }

    if ( !$ambiguous ) { next }

    # now penalise those where source id != target id and score == best
    # score
    foreach my $e (@entries) {
      if ( $e->target != $source and $e->score == $best_score ) {
        # PENALTY: Reduce score for ambiguous mappings.
        $matrix->set_score( $source, $e->target(), 0.9 * $best_score );
        $i++;
      }
    }

  } ## end foreach my $source ( @{ $matrix...})

  $self->logger->debug("Scored entries with internal ID mismatch: $i\n",
                       1 );
} ## end sub internal_id_rescore
00218 
00219 
=head2 log_matrix_stats

  Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix $matrix - a scoring
                matrix
  Example     : $score_builder->log_matrix_stats($gene_scores);
  Description : Logs scoring matrix statistics at info level: number of
                entries, sources and targets, followed by the average, minimum
                and maximum score. Each value is written as one line with a
                left-aligned 40 character label.
  Return type : none
  Exceptions  : thrown on wrong or missing argument
  Caller      : general
  Status      : At Risk
              : under development

=cut

sub log_matrix_stats {
  my ($self, $matrix) = @_;

  throw('You must provide a ScoredMappingMatrix.')
    unless $matrix
      and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix');

  # counts are printed without decimals, scores with five decimals
  my $count_fmt = "%-40s%10.0f\n";
  my $score_fmt = "%-40s%10.5f\n";

  # one log line per statistic that comes straight from a matrix getter
  for my $row (
    [ $count_fmt, "Scoring matrix entries:", 'get_entry_count'   ],
    [ $count_fmt, "Scoring matrix sources:", 'get_source_count'  ],
    [ $count_fmt, "Scoring matrix targets:", 'get_target_count'  ],
    [ $score_fmt, "Average score:",          'get_average_score' ],
  ) {
    my ($fmt, $label, $getter) = @{$row};
    $self->logger->info(sprintf($fmt, $label, $matrix->$getter()), 1);
  }

  # get_min_max_scores hands back both extremes in one array reference
  my ($lowest, $highest) = @{ $matrix->get_min_max_scores };
  $self->logger->info(sprintf($score_fmt, "Min. score:", $lowest), 1);
  $self->logger->info(sprintf($score_fmt, "Max. score:", $highest), 1);
}
00263 
00264 
00265 1;
00266