Archive Ensembl HomeArchive Ensembl Home
EnsemblTranscriptGeneric.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
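=head1 NAME

Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric -
transcript mapping steps for internal ID mapping

=head1 SYNOPSIS

=head1 DESCRIPTION

Subclass of Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper that
provides the individual transcript mapping steps (init_basic,
non_exact_translation, mapped_gene, biotype, internal_id and single_gene).
Each step derives mappings from a transcript score matrix and returns a list
holding the shrinked score matrix and the mappings that were found.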
00026 
00027 =head1 METHODS
00028 
00029 =cut
00030 
00031 
00032 package Bio::EnsEMBL::IdMapping::InternalIdMapper::EnsemblTranscriptGeneric;
00033 
00034 use strict;
00035 use warnings;
00036 no warnings 'uninitialized';
00037 
00038 use Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper;
00039 our @ISA = qw(Bio::EnsEMBL::IdMapping::InternalIdMapper::BaseMapper);
00040 
00041 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00042 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
00043 
00044   
#
# Basic mapping step.
#
# Runs basic_mapping() on the transcript score matrix and then asks $tsb for
# a shrinked matrix built from the resulting mappings.
#
# Arguments: $num               - step number; used as suffix of the mapping
#                                 name, its incremented value as suffix of
#                                 the matrix name
#            $tsb               - object providing create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to map from
# Returns:   list of (shrinked score matrix, mappings)
#
sub init_basic {
  my ($self, $num, $tsb, undef, $transcript_scores) = @_;

  $self->logger->info("Basic transcript mapping...\n", 0, 'stamped');

  my $mapping_name = "transcript_mappings$num";
  my $mappings     = $self->basic_mapping($transcript_scores, $mapping_name);

  # the matrix name carries the incremented step number; keep Perl's
  # auto-increment here, it is not the same as "+ 1" for non-numeric values
  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores  = $tsb->create_shrinked_matrix(
    $transcript_scores, $mappings, $matrix_name
  );

  return ($new_scores, $mappings);
}
00065 
00066 
#
# Mapping step for transcripts that match exactly but differ in their
# translation.
#
# Unless the score matrix reports itself as loaded, it is first rescored with
# different_translation_rescore() and written to file. Afterwards the step
# maps with basic_mapping() and requests a shrinked matrix from $tsb.
#
# Arguments: $num               - step number; suffix of the mapping name,
#                                 incremented for the matrix name
#            $tsb               - object providing
#                                 different_translation_rescore() and
#                                 create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to rescore and map from
# Returns:   list of (shrinked score matrix, mappings)
#
sub non_exact_translation {
  my ($self, $num, $tsb, undef, $transcript_scores) = @_;

  $self->logger->info("Exact Transcript non-exact Translation...\n", 0, 'stamped');

  # a loaded matrix is used as is
  if (not $transcript_scores->loaded) {
    $tsb->different_translation_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  my $mappings     = $self->basic_mapping($transcript_scores, $mapping_name);

  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores  = $tsb->create_shrinked_matrix(
    $transcript_scores, $mappings, $matrix_name
  );

  return ($new_scores, $mappings);
}
00092 
00093 
#
# Mapping step that takes the gene mappings into account: scores of
# transcripts which do not belong to mapped genes are reduced.
#
# Unless the score matrix reports itself as loaded, it is first rescored with
# non_mapped_gene_rescore() and written to file. Afterwards the step maps
# with basic_mapping() and requests a shrinked matrix from $tsb.
#
# Arguments: $num               - step number; suffix of the mapping name,
#                                 incremented for the matrix name
#            $tsb               - object providing non_mapped_gene_rescore()
#                                 and create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to rescore and map from
#            $gene_mappings     - gene mappings handed to the rescoring call
# Returns:   list of (shrinked score matrix, mappings)
#
sub mapped_gene {
  my ($self, $num, $tsb, undef, $transcript_scores, $gene_mappings) = @_;

  $self->logger->info("Transcripts in mapped genes...\n", 0, 'stamped');

  # a loaded matrix is used as is
  if (not $transcript_scores->loaded) {
    $tsb->non_mapped_gene_rescore($transcript_scores, $gene_mappings);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  my $mappings     = $self->basic_mapping($transcript_scores, $mapping_name);

  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores  = $tsb->create_shrinked_matrix(
    $transcript_scores, $mappings, $matrix_name
  );

  return ($new_scores, $mappings);
}
00121 
#
# Mapping step that disambiguates by biotype: scores between transcripts with
# different biotypes are penalised.
#
# Unless the score matrix reports itself as loaded, it is first rescored with
# biotype_transcript_rescore() and written to file. Afterwards the step maps
# with basic_mapping() and requests a shrinked matrix from $tsb.
#
# Arguments: $num               - step number; suffix of the mapping name,
#                                 incremented for the matrix name
#            $tsb               - object providing
#                                 biotype_transcript_rescore() and
#                                 create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to rescore and map from
# Returns:   list of (shrinked score matrix, mappings)
#
sub biotype {
  my ($self, $num, $tsb, undef, $transcript_scores) = @_;

  $self->logger->info("Retry with biotype disambiguation...\n", 0, 'stamped');

  # a loaded matrix is used as is
  if (not $transcript_scores->loaded) {
    $tsb->biotype_transcript_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  my $found        = $self->basic_mapping($transcript_scores, $mapping_name);

  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $shrinked    = $tsb->create_shrinked_matrix(
    $transcript_scores, $found, $matrix_name
  );

  return ($shrinked, $found);
}
00149 
#
# Mapping step that disambiguates by internal ID: scores between transcripts
# with different internal IDs are selectively penalised.
#
# Unless the score matrix reports itself as loaded, it is first rescored with
# internal_id_rescore() and written to file. Afterwards the step maps with
# basic_mapping() and requests a shrinked matrix from $tsb.
#
# Arguments: $num               - step number; suffix of the mapping name,
#                                 incremented for the matrix name
#            $tsb               - object providing internal_id_rescore() and
#                                 create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to rescore and map from
# Returns:   list of (shrinked score matrix, mappings)
#
sub internal_id {
  my ($self, $num, $tsb, undef, $transcript_scores) = @_;

  $self->logger->info("Retry with internalID disambiguation...\n", 0, 'stamped');

  # a loaded matrix is used as is
  if (not $transcript_scores->loaded) {
    $tsb->internal_id_rescore($transcript_scores);
    $transcript_scores->write_to_file;
  }

  my $mapping_name = "transcript_mappings$num";
  my $mappings     = $self->basic_mapping($transcript_scores, $mapping_name);

  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores  = $tsb->create_shrinked_matrix(
    $transcript_scores, $mappings, $matrix_name
  );

  return ($new_scores, $mappings);
}
00176 
00177 
#
# Mapping step that handles ambiguities between transcripts in single genes.
#
# No rescoring takes place here; unless the score matrix reports itself as
# loaded it is only written to file. The mapping itself is done by
# same_gene_transcript_mapping() instead of basic_mapping(), followed by the
# request for a shrinked matrix from $tsb.
#
# Arguments: $num               - step number; suffix of the mapping name,
#                                 incremented for the matrix name
#            $tsb               - object providing create_shrinked_matrix()
#            (fourth argument)  - accepted for a uniform signature, not read
#            $transcript_scores - score matrix to map from
# Returns:   list of (shrinked score matrix, mappings)
#
sub single_gene {
  my ($self, $num, $tsb, undef, $transcript_scores) = @_;

  $self->logger->info("Transcripts in single genes...\n", 0, 'stamped');

  # a loaded matrix does not need to be written again
  $transcript_scores->write_to_file if (not $transcript_scores->loaded);

  my $mapping_name = "transcript_mappings$num";
  my $mappings =
    $self->same_gene_transcript_mapping($transcript_scores, $mapping_name);

  $num++;
  my $matrix_name = "transcript_matrix$num";
  my $new_scores  = $tsb->create_shrinked_matrix(
    $transcript_scores, $mappings, $matrix_name
  );

  return ($new_scores, $mappings);
}
00202 
00203 
#
# Modified basic mapper that also maps transcripts which are ambiguous within
# one gene.
#
# Entries of the score matrix are visited from the highest score down. An
# unambiguous entry is always added to the mapping list. An ambiguous entry
# is added only if the entry and all its remaining competitors belong to a
# single source gene and a single target gene. In both cases the source and
# the target of the visited entry are closed for later entries.
#
# Arguments: $matrix       - Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
#            $mapping_name - name used for serialising the mapping
#                            ("<name>.ser")
# Returns:   the Bio::EnsEMBL::IdMapping::MappingList; a previously stored
#            list is returned right away if it could be loaded
# Throws:    if $matrix is missing or not a ScoredMappingMatrix, or if no
#            $mapping_name is given
#
sub same_gene_transcript_mapping {
  my ($self, $matrix, $mapping_name) = @_;

  # argument checks
  if (not $matrix or
      not $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
    throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
  }

  if (not $mapping_name) {
    throw('Need a name for serialising the mapping.');
  }

  # New MappingList object; AUTO_LOAD makes it pick up a serialised copy of
  # the mappings if one exists
  my $dump_dir = path_append($self->conf->param('basedir'), 'mapping');

  my $mappings = Bio::EnsEMBL::IdMapping::MappingList->new(
    -DUMP_PATH  => $dump_dir,
    -CACHE_FILE => "${mapping_name}.ser",
    -AUTO_LOAD  => 1,
  );

  # checkpoint test: hand back mappings stored by an earlier run
  if ($mappings->loaded) {
    $self->logger->info("Read existing mappings from ${mapping_name}.ser.\n");
    return $mappings;
  }

  # sources and targets which have been dealt with already
  my %source_seen;
  my %target_seen;

  # best score first; source and target decide the order between equal scores
  my @queue = sort {
    $b->score <=> $a->score
      or $a->source <=> $b->source
      or $a->target <=> $b->target
  } @{ $matrix->get_all_Entries };

  while (my $entry = shift @queue) {

    # either the source or the target of this entry was handled before
    next if ($source_seen{$entry->source} or $target_seen{$entry->target});

    my $other_sources = [];
    my $other_targets = [];

    # an unambiguous entry is the best mapping and always accepted
    my $accept = 1;

    if ($self->ambiguous_mapping($entry, $matrix, $other_sources, $other_targets)) {

      $other_sources = $self->filter_sources($other_sources, \%source_seen);
      $other_targets = $self->filter_targets($other_targets, \%target_seen);

      # collect the genes of all transcripts involved on the source side ...
      my %source_genes;
      foreach my $source_id ($entry->source, @{ $other_sources }) {
        my $gene = $self->cache->get_by_key('genes_by_transcript_id',
          'source', $source_id);
        $source_genes{$gene} = 1;
      }

      # ... and on the target side
      my %target_genes;
      foreach my $target_id ($entry->target, @{ $other_targets }) {
        my $gene = $self->cache->get_by_key('genes_by_transcript_id',
          'target', $target_id);
        $target_genes{$gene} = 1;
      }

      # only accept if exactly one source and one target gene are involved
      $accept = (scalar(keys %source_genes) == 1 and
                 scalar(keys %target_genes) == 1);
    }

    $mappings->add_Entry($entry) if ($accept);

    # close source and target, whether or not the entry was added
    $source_seen{$entry->source} = 1;
    $target_seen{$entry->target} = 1;
  }

  # create checkpoint
  $mappings->write_to_file;

  return $mappings;
}
00302 
00303 
00304 1;
00305