Archive Ensembl HomeArchive Ensembl Home
InternalIdMapper.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
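=head1 NAME

Bio::EnsEMBL::IdMapping::InternalIdMapper - map internal IDs of genes,
transcripts, exons and translations from source to target

=head1 SYNOPSIS

  # $mapper is a Bio::EnsEMBL::IdMapping::InternalIdMapper; the score
  # matrices and score builders are produced elsewhere
  my $gene_mappings = $mapper->map_genes($gene_scores, $transcript_scores,
    $gsb);
  my $transcript_mappings = $mapper->map_transcripts($transcript_scores,
    $gene_mappings, $tsb);
  my $exon_mappings = $mapper->map_exons($exon_scores, $transcript_mappings,
    $esb);
  my $translation_mappings = $mapper->map_translations($transcript_mappings);

=head1 DESCRIPTION

For genes, transcripts and exons the mapping is computed by running a chain
of plugin methods (configurable via the plugin_internal_id_mappers_gene,
plugin_internal_id_mappers_transcript and plugin_internal_id_mappers_exon
configuration parameters); each plugin method receives the scores and
mappings returned by the previous one. Translations are mapped directly from
the transcript mappings. Every map_* method first looks for a non-empty cache
file and reads the mappings from it instead of recalculating them.

=head1 METHODS

  map_genes
  map_transcripts
  map_exons
  map_translations
  delegate_to_plugin
  has_plugin
  get_plugin
  add_plugin
  log_ambiguous
  write_ambiguous

=cut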
00030 
00031 
00032 package Bio::EnsEMBL::IdMapping::InternalIdMapper;
00033 
00034 use strict;
00035 use warnings;
00036 no warnings 'uninitialized';
00037 
00038 use Bio::EnsEMBL::IdMapping::BaseObject;
00039 our @ISA = qw(Bio::EnsEMBL::IdMapping::BaseObject);
00040 
00041 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00042 use Bio::EnsEMBL::Utils::ScriptUtils qw(inject path_append);
00043 use Bio::EnsEMBL::IdMapping::Entry;
00044 use Bio::EnsEMBL::IdMapping::MappingList;
00045 use Bio::EnsEMBL::IdMapping::SyntenyFramework;
00046 
00047 
00048 # scores are considered the same if (2.0 * (s1-s2))/(s1 + s2) < this
00049 use constant SIMILAR_SCORE_RATIO => 0.01;
00050 
00051     
#
# Create (or load from cache) the internal ID mappings for genes.
#
# Arguments (positional):
#   $gene_scores       - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix (genes)
#   $transcript_scores - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#                        (transcripts)
#   $gsb               - Bio::EnsEMBL::IdMapping::GeneScoreBuilder
#
# Returns the Bio::EnsEMBL::IdMapping::MappingList holding the gene mappings.
# Throws if any argument is missing or not of the expected class.
#
sub map_genes {
  my ($self, $gene_scores, $transcript_scores, $gsb) = @_;

  # refuse to run with missing or mistyped arguments
  throw('Need a gene Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.')
    unless ($gene_scores and
      $gene_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'));

  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.')
    unless ($transcript_scores and
      $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'));

  throw('Need a Bio::EnsEMBL::IdMapping::GeneScoreBuilder.')
    unless ($gsb and
      $gsb->isa('Bio::EnsEMBL::IdMapping::GeneScoreBuilder'));

  $self->logger->info("== Internal ID mapping for genes...\n\n", 0, 'stamped');

  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'gene_mappings.ser',
  );

  my $mapping_cache = $mappings->cache_file;

  # a non-empty cache file short-circuits the whole calculation
  if (-s $mapping_cache) {
    $self->logger->info("Reading gene mappings from file...\n", 0, 'stamped');
    $self->logger->debug("Cache file $mapping_cache.\n", 1);
    $mappings->read_from_file;
    $self->logger->info("Done.\n\n", 0, 'stamped');

    return $mappings;
  }

  $self->logger->info("No gene mappings found. Will calculate them now.\n");

  # plugin methods come from the configuration; fall back to the built-in
  # chain when none are configured
  my @plugins = $self->conf->param('plugin_internal_id_mappers_gene');
  unless (defined $plugins[0]) {
    @plugins = qw(
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::init_basic
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::best_transcript
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::biotype
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::synteny
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblGeneGeneric::internal_id
    );
  }

  my $step_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'gene_mappings0.ser',
  );
  my @chain_results;

  # run the scoring chain: every plugin method gets the scores and mappings
  # returned by the previous one, plus its position in the chain
  for my $step (0 .. $#plugins) {
    ($gene_scores, $step_mappings) = $self->delegate_to_plugin(
      $plugins[$step], $step, $gsb, $step_mappings, $gene_scores,
      $transcript_scores);

    push @chain_results, $step_mappings;
  }

  # whatever is left in the score matrix could not be resolved
  my $source_count = $gene_scores->get_source_count;
  my $target_count = $gene_scores->get_target_count;
  $self->logger->info($source_count." source genes are ambiguous with ".
    $target_count." target genes.\n\n");

  $self->log_ambiguous($gene_scores, 'gene');

  # merge the per-plugin mappings and persist them
  $mappings->add_all(@chain_results);
  $mappings->write_to_file;

  $mappings->log('gene', $self->conf->param('basedir'))
    if ($self->logger->loglevel eq 'debug');

  $self->logger->info("Done.\n\n", 0, 'stamped');

  return $mappings;
}
00148 
00149 
#
# Create (or load from cache) the internal ID mappings for transcripts.
#
# Arguments (positional):
#   $transcript_scores - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#                        (transcripts)
#   $gene_mappings     - Bio::EnsEMBL::IdMapping::MappingList (genes)
#   $tsb               - Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder
#
# Returns the Bio::EnsEMBL::IdMapping::MappingList holding the transcript
# mappings. Throws if any argument is missing or not of the expected class.
#
sub map_transcripts {
  my ($self, $transcript_scores, $gene_mappings, $tsb) = @_;

  # refuse to run with missing or mistyped arguments
  throw('Need a transcript Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.')
    unless ($transcript_scores and
      $transcript_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'));

  throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.')
    unless ($gene_mappings and
      $gene_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList'));

  throw('Need a Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder.')
    unless ($tsb and
      $tsb->isa('Bio::EnsEMBL::IdMapping::TranscriptScoreBuilder'));

  $self->logger->info("== Internal ID mapping for transcripts...\n\n", 0, 'stamped');

  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'transcript_mappings.ser',
  );

  my $mapping_cache = $mappings->cache_file;

  # a non-empty cache file short-circuits the whole calculation
  if (-s $mapping_cache) {
    $self->logger->info("Reading transcript mappings from file...\n", 0,
      'stamped');
    $self->logger->debug("Cache file $mapping_cache.\n", 1);
    $mappings->read_from_file;
    $self->logger->info("Done.\n\n", 0, 'stamped');

    return $mappings;
  }

  $self->logger->info("No transcript mappings found. Will calculate them now.\n");

  # plugin methods come from the configuration; fall back to the built-in
  # chain when none are configured
  my @plugins = $self->conf->param('plugin_internal_id_mappers_transcript');
  unless (defined $plugins[0]) {
    @plugins = qw(
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::init_basic
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::non_exact_translation
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::biotype
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::mapped_gene
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::single_gene
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric::internal_id
    );
  }

  my $step_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'transcript_mappings0.ser',
  );
  my @chain_results;

  # run the scoring chain: every plugin method gets the scores and mappings
  # returned by the previous one, plus its position in the chain
  for my $step (0 .. $#plugins) {
    ($transcript_scores, $step_mappings) = $self->delegate_to_plugin(
      $plugins[$step], $step, $tsb, $step_mappings, $transcript_scores,
      $gene_mappings);

    push @chain_results, $step_mappings;
  }

  # whatever is left in the score matrix could not be resolved
  my $source_count = $transcript_scores->get_source_count;
  my $target_count = $transcript_scores->get_target_count;
  $self->logger->info($source_count.
    " source transcripts are ambiguous with ".
    $target_count." target transcripts.\n\n");

  $self->log_ambiguous($transcript_scores, 'transcript');

  # merge the per-plugin mappings and persist them
  $mappings->add_all(@chain_results);
  $mappings->write_to_file;

  $mappings->log('transcript', $self->conf->param('basedir'))
    if ($self->logger->loglevel eq 'debug');

  $self->logger->info("Done.\n\n", 0, 'stamped');

  return $mappings;
}
00249 
00250 
#
# Create (or load from cache) the internal ID mappings for exons.
#
# Arguments (positional):
#   $exon_scores         - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix (exons)
#   $transcript_mappings - Bio::EnsEMBL::IdMapping::MappingList (transcripts)
#   $esb                 - Bio::EnsEMBL::IdMapping::ExonScoreBuilder
#
# Returns the Bio::EnsEMBL::IdMapping::MappingList holding the exon mappings.
# Throws if any argument is missing or not of the expected class.
#
# NOTE(review): $transcript_mappings is validated here but is not passed on to
# the plugin methods - confirm that this is intended.
#
sub map_exons {
  my ($self, $exon_scores, $transcript_mappings, $esb) = @_;

  # refuse to run with missing or mistyped arguments
  throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.')
    unless ($exon_scores and
      $exon_scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix'));

  throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.')
    unless ($transcript_mappings and
      $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList'));

  throw('Need a Bio::EnsEMBL::IdMapping::ExonScoreBuilder.')
    unless ($esb and
      $esb->isa('Bio::EnsEMBL::IdMapping::ExonScoreBuilder'));

  $self->logger->info("== Internal ID mapping for exons...\n\n", 0, 'stamped');

  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'exon_mappings.ser',
  );

  my $mapping_cache = $mappings->cache_file;

  # a non-empty cache file short-circuits the whole calculation
  if (-s $mapping_cache) {
    $self->logger->info("Reading exon mappings from file...\n", 0,
      'stamped');
    $self->logger->debug("Cache file $mapping_cache.\n", 1);
    $mappings->read_from_file;
    $self->logger->info("Done.\n\n", 0, 'stamped');

    return $mappings;
  }

  $self->logger->info("No exon mappings found. Will calculate them now.\n");

  # plugin methods come from the configuration; fall back to the built-in
  # chain when none are configured
  my @plugins = $self->conf->param('plugin_internal_id_mappers_exon');
  unless (defined $plugins[0]) {
    @plugins = qw(
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::init_basic
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::mapped_transcript
      Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblExonGeneric::internal_id
    );
  }

  my $step_mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'exon_mappings0.ser',
  );
  my @chain_results;

  # run the scoring chain: every plugin method gets the scores and mappings
  # returned by the previous one, plus its position in the chain
  for my $step (0 .. $#plugins) {
    ($exon_scores, $step_mappings) = $self->delegate_to_plugin(
      $plugins[$step], $step, $esb, $step_mappings, $exon_scores);

    push @chain_results, $step_mappings;
  }

  # whatever is left in the score matrix could not be resolved
  my $source_count = $exon_scores->get_source_count;
  my $target_count = $exon_scores->get_target_count;
  $self->logger->info($source_count.
    " source exons are ambiguous with ".
    $target_count." target exons.\n\n");

  $self->log_ambiguous($exon_scores, 'exon');

  # merge the per-plugin mappings and persist them
  $mappings->add_all(@chain_results);
  $mappings->write_to_file;

  $mappings->log('exon', $self->conf->param('basedir'))
    if ($self->logger->loglevel eq 'debug');

  $self->logger->info("Done.\n\n", 0, 'stamped');

  return $mappings;
}
00347 
00348 
#
# Create (or load from cache) the internal ID mappings for translations.
#
# Unlike genes, transcripts and exons this does not go through the plugin
# chain: the logic is too simple to warrant it, and because translations are
# tied to transcripts there are no translation scores or score builder.
#
# Argument:
#   $transcript_mappings - Bio::EnsEMBL::IdMapping::MappingList (transcripts)
#
# Returns the Bio::EnsEMBL::IdMapping::MappingList holding the translation
# mappings. Throws if the argument is missing or not a MappingList.
#
sub map_translations {
  my ($self, $transcript_mappings) = @_;

  throw('Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.')
    unless ($transcript_mappings and
      $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList'));

  $self->logger->info("== Internal ID mapping for translations...\n\n", 0, 'stamped');

  my $dump_path = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH   => $dump_path,
    -CACHE_FILE  => 'translation_mappings.ser',
  );

  my $mapping_cache = $mappings->cache_file;

  # a non-empty cache file short-circuits the whole calculation
  if (-s $mapping_cache) {
    $self->logger->info("Reading translation mappings from file...\n", 0,
      'stamped');
    $self->logger->debug("Cache file $mapping_cache.\n", 1);
    $mappings->read_from_file;
    $self->logger->info("Done.\n\n", 0, 'stamped');

    return $mappings;
  }

  $self->logger->info("No translation mappings found. Will calculate them now.\n");

  $self->logger->info("Translation mapping...\n", 0, 'stamped');

  # number of transcript mappings where source or target has no translation
  my $skipped = 0;

  foreach my $tr_entry (@{ $transcript_mappings->get_all_Entries }) {

    # look up both transcripts in the cache and fetch their translations
    my $source_tl = $self->cache->get_by_key('transcripts_by_id',
      'source', $tr_entry->source)->translation;
    my $target_tl = $self->cache->get_by_key('transcripts_by_id',
      'target', $tr_entry->target)->translation;

    unless ($source_tl and $target_tl) {
      $skipped++;
      next;
    }

    # the translation mapping inherits the score of the transcript mapping
    $mappings->add_Entry(
      Bio::EnsEMBL::IdMapping::Entry->new_fast([
        $source_tl->id, $target_tl->id, $tr_entry->score
      ])
    );
  }

  $self->logger->debug("Skipped transcripts without translation: $skipped\n", 1);
  $self->logger->info("New mappings: ".$mappings->get_entry_count."\n\n");

  $mappings->write_to_file;

  $mappings->log('translation', $self->conf->param('basedir'))
    if ($self->logger->loglevel eq 'debug');

  $self->logger->info("Done.\n\n", 0, 'stamped');

  return $mappings;
}
00433 
00434 
#
# Run one plugin method of the mapping chain.
#
# Arguments (positional):
#   $plugin        - fully qualified method name, 'Some::Module::method'
#   $num           - sequence number of this method in the chain (passed on
#                    to the plugin method)
#   $score_builder - Bio::EnsEMBL::IdMapping::ScoreBuilder
#   $mappings      - Bio::EnsEMBL::IdMapping::MappingList
#   $scores        - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   any further arguments are handed to the plugin method unchanged
#
# Returns whatever the plugin method returns (callers in this module expect
# the scores and mappings to feed into the next plugin in the chain).
#
# Throws if an argument has the wrong class or if $plugin cannot be split
# into a module and a method name.
#
sub delegate_to_plugin {
  my $self = shift;
  my $plugin = shift;
  my $num = shift;
  my $score_builder = shift;
  my $mappings = shift;
  my $scores = shift;

  # argument checks
  unless ($score_builder and
          $score_builder->isa('Bio::EnsEMBL::IdMapping::ScoreBuilder')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoreBuilder.');
  }

  unless ($mappings and
          $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
    throw('Need a Bio::EnsEMBL::IdMapping::MappingList.');
  }
  
  unless ($scores and
          $scores->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # split plugin name into module and method
  #
  # the captures are taken from the match's return list rather than from
  # $1/$2: after a failed match those variables still hold the values of an
  # earlier successful match, which would let a malformed plugin name slip
  # through the check below
  my ($module, $method) = ($plugin =~ /(.*)::(\w+)$/);

  unless ($module and $method) {
    throw("Unable to determine module and method name from $plugin.\n");
  }

  # instantiate the plugin unless we already have an instance
  my $plugin_instance;
  if ($self->has_plugin($module)) {
    
    # re-use an existing plugin instance
    $plugin_instance = $self->get_plugin($module);
  
  } else {
    
    # inject and instantiate the plugin module
    inject($module);
    $plugin_instance = $module->new(
        -LOGGER       => $self->logger,
        -CONF         => $self->conf,
        -CACHE        => $self->cache
    );
    $self->add_plugin($plugin_instance);

  }

  # run the method on the plugin
  #
  # pass in a sequence number (number of method run, used for generating
  # checkpoint files), the scores used for determining the mapping, and all
  # other arguments passed to this method (these will vary for different object
  # types)
  #
  # return the scores and mappings to feed into the next plugin in the chain
  return $plugin_instance->$method($num, $score_builder, $mappings, $scores, @_);
}
00498 
00499 
#
# Tell whether a plugin instance is registered under the given module name.
#
# Argument: $module - module (class) name to look up
# Returns 1 if a defined entry exists for $module, 0 otherwise.
#
sub has_plugin {
  my ($self, $module) = @_;

  return defined($self->{'_plugins'}->{$module}) ? 1 : 0;
}
00506 
00507 
#
# Fetch the plugin instance registered under the given module name.
#
# Argument: $module - module (class) name to look up
# Returns the stored instance, or undef if nothing is stored for $module.
#
sub get_plugin {
  my ($self, $module) = @_;

  my $registry = $self->{'_plugins'};
  return $registry->{$module};
}
00514 
00515 
#
# Register a plugin instance so it can be re-used later.
#
# Argument: $plugin_instance - the object to store; it is keyed by its class
#                              name as reported by ref()
# Returns the stored instance (the value of the assignment).
#
sub add_plugin {
  my ($self, $plugin_instance) = @_;

  my $class = ref($plugin_instance);
  $self->{'_plugins'}->{$class} = $plugin_instance;
}
00522 
00523 
#
# Write all entries still left in a score matrix to a log file, grouped first
# by source ID and then by target ID.
#
# Arguments (positional):
#   $matrix - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#   $type   - object type label used in the file name and the log text
#             (callers in this module pass 'gene', 'transcript' or 'exon')
#
# The file is <basedir>/debug/ambiguous_<type>.txt, where basedir is the
# configuration parameter of that name; the directory is created if needed
# and an existing file is overwritten.
#
# Throws if $matrix has the wrong class, or if the directory cannot be
# created, or the file cannot be opened or closed.
#
sub log_ambiguous {
  my ($self, $matrix, $type) = @_;

  unless ($matrix and
          $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  # create dump directory if it doesn't exist
  my $debug_path = $self->conf->param('basedir').'/debug';
  unless (-d $debug_path) {
    # list form of system() bypasses the shell, so a path containing spaces
    # or shell metacharacters is passed to mkdir verbatim
    system('mkdir', '-p', $debug_path) == 0 or
      throw("Unable to create directory $debug_path.\n");
  }

  my $logfile = "$debug_path/ambiguous_${type}.txt";

  open(my $fh, '>', $logfile) or
    throw("Unable to open $logfile for writing: $!");

  # log from the source perspective first, then from the target perspective
  foreach my $db_type (qw(source target)) {
    $self->_log_ambiguous_by($matrix, $type, $db_type, $fh);
  }

  # buffered write errors only surface when the handle is closed
  close($fh) or
    throw("Unable to close $logfile: $!");
}


#
# Helper for log_ambiguous(): walk the matrix entries sorted numerically by
# their $db_type ('source' or 'target') ID and hand each run of entries
# sharing that ID to write_ambiguous(), split into low scorers (score < 0.5)
# and high scorers.
#
sub _log_ambiguous_by {
  my ($self, $matrix, $type, $db_type, $fh) = @_;

  my @low_scoring = ();
  my @high_scoring = ();
  my $last_id;

  foreach my $entry (sort { $a->$db_type <=> $b->$db_type }
                        @{ $matrix->get_all_Entries }) {

    my $id = $entry->$db_type;

    # a change of ID ends the current group; write_ambiguous() empties both
    # lists, so they are ready for the next group
    if (defined($last_id) and $last_id != $id) {
      $self->write_ambiguous($type, $db_type, $fh, \@low_scoring,
        \@high_scoring);
    }
    $last_id = $id;

    if ($entry->score < 0.5) {
      push @low_scoring, $entry;
    } else {
      push @high_scoring, $entry;
    }
  }

  # write the last group
  $self->write_ambiguous($type, $db_type, $fh, \@low_scoring, \@high_scoring);
}
00597 
00598 
#
# Write one group of ambiguous entries to the log and empty both lists.
#
# Arguments (positional):
#   $type    - object type label used in the log text
#   $db_type - perspective of the group, 'source' or 'target'; also the name
#              of the entry method that yields the group's ID
#   $fh      - filehandle to print to
#   $low     - arrayref of low scoring entries (emptied by this method)
#   $high    - arrayref of high scoring entries (emptied by this method)
#
# Groups with at most one entry are dropped without any output.
#
sub write_ambiguous {
  my ($self, $type, $db_type, $fh, $low, $high) = @_;

  # a single mapping is not ambiguous from this perspective (it will show up
  # when logging from the other perspective), so just discard it
  if (@{$low} + @{$high} <= 1) {
    @{$low} = ();
    @{$high} = ();
    return;
  }

  # all entries of the group share the same ID, so any of them will do
  my $first_entry = @{$low} ? $low->[0] : $high->[0];
  my $first_id = $first_entry->$db_type;

  my $other_db_type = ($db_type eq 'source') ? 'target' : 'source';

  print {$fh} "$db_type $type $first_id scores ambiguously:\n";

  # high scorers: one per line, with score
  if (@{$high}) {
    print {$fh} "  high scoring ${other_db_type}s\n";

    while (my $hit = shift @{$high}) {
      print {$fh} "    ", $hit->$other_db_type, " ", $hit->score, "\n";
    }
  }

  # low scorers: IDs only, comma separated, with a line break before every
  # tenth ID
  if (@{$low}) {
    print {$fh} "  low scoring ${other_db_type}s\n    ";

    my $count = 0;

    while (my $hit = shift @{$low}) {
      $count++;
      print {$fh} "\n    " if ($count % 10 == 0);
      print {$fh} $hit->$other_db_type, ", ";
    }
  }

  print {$fh} "\n";
}
00649 
00650 
00651 1;
00652