Archive Ensembl HomeArchive Ensembl Home
ExonScoreBuilder.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 =head1 SYNOPSIS
00024 
00025 =head1 DESCRIPTION
00026 
00027 Combines ExonScoreBuilder, ExonDirectMapper and ExonerateRunner from
00028 Java application.
00029 
00030 =head1 METHODS
00031 
00032 =cut
00033 
00034 package Bio::EnsEMBL::IdMapping::ExonScoreBuilder;
00035 
00036 use strict;
00037 use warnings;
00038 no warnings 'uninitialized';
00039 
00040 use Bio::EnsEMBL::IdMapping::ScoreBuilder;
00041 our @ISA = qw(Bio::EnsEMBL::IdMapping::ScoreBuilder);
00042 
00043 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00044 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00045 use Bio::EnsEMBL::Utils::ScriptUtils qw(parse_bytes path_append);
00046 use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
00047 
00048 
00049 #
00050 # exon scoring is done in two steps:
00051 # 1. map exons by overlap (if a common coord_system exists)
00052 # 2. map remaining and poorly scoring exons using exonerate
00053 #
00054 sub score_exons {
00055   my $self = shift;
00056 
00057   $self->logger->info( "-- Scoring exons...\n\n", 0, 'stamped' );
00058 
00059   # score using overlaps, then exonerate
00060   my $matrix = $self->overlap_score;
00061 
00062   my $exonerate_matrix = $self->exonerate_score($matrix);
00063 
00064   # log stats before matrix merging
00065   $self->logger->info("\nOverlap scoring matrix:\n");
00066   $self->log_matrix_stats($matrix);
00067   $self->logger->info("\nExonerate scoring matrix:\n");
00068   $self->log_matrix_stats($exonerate_matrix);
00069 
00070   # merge matrices
00071   $self->logger->info( "\nMerging scoring matrices...\n", 0,
00072                        'stamped' );
00073   $matrix->merge($exonerate_matrix);
00074 
00075   $self->logger->info( "Done.\n\n", 0, 'stamped' );
00076 
00077   # debug logging
00078   if ( $self->logger->loglevel eq 'debug' ) {
00079     $matrix->log( 'exon', $self->conf->param('basedir') );
00080   }
00081 
00082   # log stats of combined matrix
00083   $self->logger->info("Combined scoring matrix:\n");
00084   $self->log_matrix_stats($matrix);
00085 
00086   $self->logger->info( "\nDone with exon scoring.\n\n", 0, 'stamped' );
00087 
00088   return $matrix;
00089 } ## end sub score_exons
00090 
00091 
00092 #
00093 # direct mapping by overlap (if common coord_system exists)
00094 #
00095 sub overlap_score {
00096   my $self = shift;
00097 
00098   my $dump_path =
00099     path_append( $self->conf->param('basedir'), 'matrix' );
00100 
00101   my $matrix =
00102     Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
00103                                -DUMP_PATH  => $dump_path,
00104                                -CACHE_FILE => 'exon_overlap_matrix.ser',
00105     );
00106 
00107   my $overlap_cache = $matrix->cache_file;
00108 
00109   if ( -s $overlap_cache ) {
00110 
00111     # read from file
00112     $self->logger->info(
00113                    "Reading exon overlap scoring matrix from file...\n",
00114                    0, 'stamped' );
00115     $self->logger->debug( "Cache file $overlap_cache.\n", 1 );
00116     $matrix->read_from_file;
00117     $self->logger->info( "Done.\n", 0, 'stamped' );
00118 
00119   }
00120   else {
00121 
00122     # build scoring matrix
00123     $self->logger->info(
00124          "No exon overlap scoring matrix found. Will build new one.\n");
00125 
00126     if ( $self->cache->highest_common_cs ) {
00127       $self->logger->info( "Overlap scoring...\n", 0, 'stamped' );
00128       $matrix = $self->build_overlap_scores($matrix);
00129       $self->logger->info( "Done.\n", 0, 'stamped' );
00130     }
00131 
00132     # write scoring matrix to file
00133     $matrix->write_to_file;
00134 
00135   }
00136 
00137   return $matrix;
00138 } ## end sub overlap_score
00139 
00140 
00141 #
00142 # map the remaining exons using exonerate
00143 #
00144 sub exonerate_score {
00145   my $self   = shift;
00146   my $matrix = shift;
00147 
00148   unless ( $matrix and
00149           $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
00150   {
00151     throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
00152   }
00153 
00154   my $dump_path =
00155     path_append( $self->conf->param('basedir'), 'matrix' );
00156 
00157   my $exonerate_matrix =
00158     Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
00159                              -DUMP_PATH  => $dump_path,
00160                              -CACHE_FILE => 'exon_exonerate_matrix.ser',
00161     );
00162 
00163   my $exonerate_cache = $exonerate_matrix->cache_file;
00164 
00165   if ( -s $exonerate_cache ) {
00166 
00167     # read from file
00168     $self->logger->info( "Reading exonerate matrix from file...\n",
00169                          0, 'stamped' );
00170     $self->logger->debug( "Cache file $exonerate_cache.\n", 1 );
00171     $exonerate_matrix->read_from_file;
00172     $self->logger->info( "Done.\n", 0, 'stamped' );
00173 
00174   }
00175   else {
00176 
00177     # build scoring matrix
00178     $self->logger->info(
00179                     "No exonerate matrix found. Will build new one.\n");
00180 
00181     # dump exons to fasta files
00182     my $dump_count = $self->dump_filtered_exons($matrix);
00183 
00184     if ($dump_count) {
00185       # run exonerate
00186       $self->run_exonerate;
00187 
00188       # parse results
00189       $self->parse_exonerate_results($exonerate_matrix);
00190 
00191     }
00192     else {
00193 
00194       $self->logger->info( "No source and/or target exons dumped, " .
00195                            "so don't need to run exonerate.\n" );
00196 
00197     }
00198 
00199     # write scoring matrix to file
00200     $exonerate_matrix->write_to_file;
00201 
00202   } ## end else [ if ( -s $exonerate_cache)]
00203 
00204   return $exonerate_matrix;
00205 } ## end sub exonerate_score
00206 
00207 #
00208 # Algorithm:
00209 # Get a lists of exon containers for source and target.  Walk along both
00210 # lists, set a flag when you first encounter an exon (i.e. it starts).
00211 # Record all alternative exons until you encounter the exon again (exon
00212 # ends), then score against all alternative exons you've recorded.
00213 #
00214 sub build_overlap_scores {
00215   my ( $self, $matrix ) = @_;
00216 
00217   unless ($matrix
00218       and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
00219   {
00220     throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
00221   }
00222 
00223   # get sorted list of exon containers
00224   $self->logger->info( "Reading sorted exons from cache...\n",
00225                        1, 'stamped' );
00226 
00227   my @source_exons =
00228     $self->sort_exons( [
00229         values %{ $self->cache->get_by_name( 'exons_by_id', 'source' ) }
00230       ] );
00231 
00232   my @target_exons =
00233     $self->sort_exons( [
00234         values %{ $self->cache->get_by_name( 'exons_by_id', 'target' ) }
00235       ] );
00236 
00237   $self->logger->info( "Done.\n", 1, 'stamped' );
00238 
00239   # get first source and target exon container
00240   my $source_ec = shift(@source_exons);
00241   my $target_ec = shift(@target_exons);
00242 
00243   my %source_overlap = ();
00244   my %target_overlap = ();
00245 
00246   $self->logger->info( "Scoring...\n", 1, 'stamped' );
00247 
00248   while ( $source_ec or $target_ec ) {
00249     my $add_source = 0;
00250     my $add_target = 0;
00251 
00252     # compare exon containers
00253     if ( $source_ec && $target_ec ) {
00254       my $cmp =
00255         $self->compare_exon_containers( $source_ec, $target_ec );
00256       if ( $cmp <= 0 ) { $add_source = 1 }
00257       if ( $cmp >= 0 ) { $add_target = 1 }
00258     }
00259     elsif ($source_ec) {
00260       $add_source = 1;
00261     }
00262     elsif ($target_ec) {
00263       $add_target = 1;
00264     }
00265     else {
00266       die('The world is a strange place');
00267     }
00268 
00269     if ($add_source) {
00270       if ( $source_overlap{ $source_ec->[0] } ) {
00271         # remove exon from list of overlapping source exons to score
00272         # target against
00273         delete $source_overlap{ $source_ec->[0] };
00274       }
00275       else {
00276         # add exon to list of overlapping source exons to score target
00277         # against
00278         $source_overlap{ $source_ec->[0] } = $source_ec->[0];
00279 
00280         # score source exon against all target exons in current overlap
00281         # list
00282         foreach my $target_exon ( values %target_overlap ) {
00283           if ( defined( $matrix->get_score(
00284                                    $source_ec->[0]->id, $target_exon->id
00285                         ) ) )
00286           {
00287             next;
00288           }
00289 
00290           $self->calc_overlap_score( $source_ec->[0], $target_exon,
00291                                      $matrix );
00292         }
00293       } ## end else [ if ( $source_overlap{ ...})]
00294 
00295       # get next source exon container
00296       $source_ec = shift(@source_exons);
00297     } ## end if ($add_source)
00298 
00299     if ($add_target) {
00300       if ( $target_overlap{ $target_ec->[0] } ) {
00301         # remove exon from list of overlapping target exons to score
00302         # source against
00303         delete $target_overlap{ $target_ec->[0] };
00304       }
00305       else {
00306         # add exon to list of overlapping target exons to score source
00307         # against
00308         $target_overlap{ $target_ec->[0] } = $target_ec->[0];
00309 
00310         # score target exon against all source exons in current overlap
00311         # list
00312         foreach my $source_exon ( values %source_overlap ) {
00313           if ( defined( $matrix->get_score(
00314                                    $source_exon->id, $target_ec->[0]->id
00315                         ) ) )
00316           {
00317             next;
00318           }
00319 
00320           $self->calc_overlap_score( $source_exon, $target_ec->[0],
00321                                      $matrix );
00322         }
00323       }
00324 
00325       # get next target exon container
00326       $target_ec = shift(@target_exons);
00327     } ## end if ($add_target)
00328   } ## end while ( $source_ec or $target_ec)
00329 
00330   $self->logger->info( "Done.\n", 1, 'stamped' );
00331 
00332   return $matrix;
00333 } ## end sub build_overlap_scores
00334 
00335 
00336 #
00337 # Return a list of exon containers, sorted by seq_region_name, then location
00338 # (where location is either start-1 or end, so each exon is in the list twice).
00339 # An exon container is a listrefs of a TinyExon object and its location. This
00340 # implements the ExonSortContainer in the java application.
00341 #
00342 sub sort_exons {
00343   my $self  = shift;
00344   my $exons = shift;
00345 
00346   return sort {
00347     ( $a->[0]->common_sr_name cmp $b->[0]->common_sr_name ) ||
00348       ( $a->[1] <=> $b->[1] )
00349     } ( map { [ $_, $_->common_start - 1 ] } @$exons ),
00350     ( map { [ $_, $_->common_end ] } @$exons );
00351 }
00352 
00353 sub compare_exon_containers {
00354   my $self = shift;
00355   my $e1   = shift;
00356   my $e2   = shift;
00357 
00358   return ( ( $e1->[0]->common_sr_name cmp $e2->[0]->common_sr_name ) ||
00359            ( $e1->[1] <=> $e2->[1] ) );
00360 }
00361 
00362 #
00363 # Calculates overlap score between two exons. Its done by dividing
00364 # overlap region by exons sizes. 1.0 is full overlap on both exons.
00365 # Score of at least 0.5 are added to the exon scoring matrix.
00366 #
00367 sub calc_overlap_score {
00368   my $self        = shift;
00369   my $source_exon = shift;
00370   my $target_exon = shift;
00371   my $matrix      = shift;
00372 
00373   my ( $start, $end );
00374 
00375   # don't score if exons on different strand
00376   return unless ( $source_exon->strand == $target_exon->strand );
00377 
00378   # determine overlap start
00379   if ( $source_exon->start > $target_exon->start ) {
00380     $start = $source_exon->start;
00381   }
00382   else {
00383     $start = $target_exon->start;
00384   }
00385 
00386   # determine overlap end
00387   if ( $source_exon->end < $target_exon->end ) {
00388     $end = $source_exon->end;
00389   }
00390   else {
00391     $end = $target_exon->end;
00392   }
00393 
00394   #
00395   # Calculate score, which is defined as average overlap / exon length
00396   # ratio.
00397   #
00398 
00399   my $overlap       = $end - $start + 1;
00400   my $source_length = $source_exon->end - $source_exon->start + 1;
00401   my $target_length = $target_exon->end - $target_exon->start + 1;
00402 
00403   my $score = ( $overlap/$source_length + $overlap/$target_length )/2;
00404 
00405   # PENALTY:
00406   # penalise by 10% if phase if different
00407   if ( $source_exon->phase != $target_exon->phase ) {
00408     $score *= 0.9;
00409   }
00410 
00411   # add score to scoring matrix if it's at least 0.5
00412   if ( $score >= 0.5 ) {
00413     $matrix->add_score( $source_exon->id, $target_exon->id, $score );
00414   }
00415 
00416 } ## end sub calc_overlap_score
00417 
00418 
00419 sub run_exonerate {
00420   my $self = shift;
00421 
00422   my $source_file = $self->exon_fasta_file('source');
00423   my $target_file = $self->exon_fasta_file('target');
00424   my $source_size = -s $source_file;
00425   my $target_size = -s $target_file;
00426 
00427   # check if fasta files exist and are not empty
00428   unless ($source_size and $target_size) {
00429     throw("Can't find exon fasta files.");
00430   }
00431 
00432   # create an empty lsf log directory
00433   my $logpath = path_append($self->logger->logpath, 'exonerate');
00434   system("rm -rf $logpath") == 0 or
00435     $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
00436   system("mkdir -p $logpath") == 0 or
00437     $self->logger->error("Can't create lsf log dir $logpath: $!\n");
00438 
00439   # delete exonerate output from previous runs
00440   my $dump_path = $self->cache->dump_path;
00441 
00442   opendir(DUMPDIR, $dump_path) or
00443     $self->logger->error("Can't open $dump_path for reading: $!");
00444 
00445   while (defined(my $file = readdir(DUMPDIR))) {
00446     next unless /exonerate_map\.\d+/;
00447 
00448     unlink("$dump_path/$file") or
00449       $self->logger->error("Can't delete $dump_path/$file: $!");
00450   }
00451   
00452   closedir(DUMPDIR);
00453 
00454   # determine number of jobs to split task into
00455   my $bytes_per_job = $self->conf->param('exonerate_bytes_per_job')
00456     || 250000;
00457   my $num_jobs = $self->conf->param('exonerate_jobs');
00458   $num_jobs ||= int( $source_size/$bytes_per_job + 1 );
00459 
00460   my $percent =
00461     int( ( $self->conf->param('exonerate_threshold') || 0.5 )*100 );
00462   my $lsf_name       = 'idmapping_exonerate_' . time;
00463   my $exonerate_path = $self->conf->param('exonerate_path');
00464   my $exonerate_extra_params =
00465     $self->conf->param('exonerate_extra_params');
00466 
00467   #
00468   # run exonerate jobs using lsf
00469   #
00470   my $exonerate_job =
00471     qq{$exonerate_path } .
00472     qq{--query $source_file --target $target_file } .
00473     q{--querychunkid $LSB_JOBINDEX } .
00474     qq{--querychunktotal $num_jobs } .
00475     q{--model ungapped -M 1000 -D 100 } .
00476     q{--showalignment FALSE --subopt no } . qq{--percent $percent } .
00477     $self->conf->param('exonerate_extra_params') . " " .
00478     q{--ryo 'myinfo: %qi %ti %et %ql %tl\n' } .
00479     qq{| grep '^myinfo:' > $dump_path/exonerate_map.\$LSB_JOBINDEX} .
00480     "\n";
00481 
00482   $self->logger->info("Submitting $num_jobs exonerate jobs to lsf.\n");
00483   $self->logger->debug("$exonerate_job\n\n");
00484 
00485   my $bsub_cmd = sprintf(
00486                "|bsub -J%s[1-%d]%%%d -o %s/exonerate.%%I.out %s",
00487                $lsf_name,
00488                $num_jobs,
00489                $self->conf()->param('exonerate_concurrent_jobs') || 200,
00490                $logpath,
00491                $self->conf()->param('lsf_opt_exonerate') );
00492 
00493   local *BSUB;
00494   open( BSUB, $bsub_cmd )
00495     or $self->logger->error("Could not open open pipe to bsub: $!\n");
00496 
00497   print BSUB $exonerate_job;
00498   $self->logger->error("Error submitting exonerate jobs: $!\n")
00499     unless ($? == 0); 
00500   close BSUB;
00501 
00502   # submit dependent job to monitor finishing of exonerate jobs
00503   $self->logger->info("Waiting for exonerate jobs to finish...\n", 0, 'stamped');
00504 
00505   my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
00506     qq{-o $logpath/exonerate_depend.out /bin/true};
00507 
00508   system($dependent_job) == 0 or
00509     $self->logger->error("Error submitting dependent job: $!\n");
00510 
00511   $self->logger->info("All exonerate jobs finished.\n", 0, 'stamped');
00512 
00513   #
00514   # check results
00515   #
00516   my @missing;
00517   my @error;
00518   
00519   for (my $i = 1; $i <= $num_jobs; $i++) {
00520   
00521     # check that output file exists
00522     my $outfile = "$dump_path/exonerate_map.$i";
00523     push @missing, $outfile unless (-f "$outfile");
00524 
00525     # check no errors occurred
00526     my $errfile = "$logpath/exonerate.$i.err";
00527     push @error, $errfile if (-s "$errfile");
00528   }
00529 
00530   if (@missing) {
00531     $self->logger->info("Couldn't find all exonerate output files. These are missing:\n");
00532     foreach (@missing) {
00533       $self->logger->info("$_\n", 1);
00534     }
00535 
00536     exit(1);
00537   }
00538 
00539   if (@error) {
00540     $self->logger->info("One or more exonerate jobs failed. Check these error files:\n");
00541     foreach (@error) {
00542       $self->logger->info("$_\n", 1);
00543     }
00544 
00545     exit(1);
00546   }
00547 
00548 }
00549 
00550 
00551 sub exon_fasta_file {
00552   my $self = shift;
00553   my $type = shift;
00554 
00555   throw("You must provide a type.") unless $type;
00556 
00557   return $self->cache->dump_path."/$type.exons.fasta";
00558 }
00559 
00560 
00561 sub dump_filtered_exons {
00562   my $self = shift;
00563   my $matrix = shift;
00564 
00565   unless ($matrix and
00566           $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
00567     throw('You must provide a ScoredMappingMatrix.');
00568   }
00569 
00570   # write exons to fasta files
00571   my $source_count = $self->write_filtered_exons('source', $matrix);
00572   my $target_count = $self->write_filtered_exons('target', $matrix);
00573 
00574   # return true if both source and target exons were written; otherwise we
00575   # don't need to run exonerate
00576   return (($source_count > 0) and ($target_count > 0));
00577 }
00578 
00579 
00580 sub write_filtered_exons {
00581   my $self = shift;
00582   my $type = shift;
00583   my $matrix = shift;
00584 
00585   throw("You must provide a type.") unless $type;
00586   unless ($matrix and
00587           $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
00588     throw('You must provide a ScoredMappingMatrix.');
00589   }
00590 
00591   $self->logger->info("\nDumping $type exons to fasta file...\n", 0, 'stamped');
00592 
00593   # don't dump exons shorter than this
00594   my $min_exon_length = $self->conf->param('min_exon_length') || 15;
00595 
00596   # counters
00597   my $total_exons = 0;
00598   my $dumped_exons = 0;
00599 
00600   # filehandle for fasta files
00601   my $fh;
00602   my $file = $self->exon_fasta_file($type);
00603   open($fh, '>', $file) or throw("Unable to open $file for writing: $!");
00604 
00605   # loop over exons, dump sequence to fasta file if longer than threshold and
00606   # score < 1
00607   EXON:
00608   foreach my $eid (sort { $b <=> $a }
00609                    keys %{ $self->cache->get_by_name('exons_by_id', $type) }) {
00610 
00611     my $exon = $self->cache->get_by_key('exons_by_id', $type, $eid);
00612 
00613     $total_exons++;
00614 
00615     # skip if exon shorter than threshold
00616     next EXON if ($exon->length < $min_exon_length);
00617 
00618     # skip if overlap score with any other exon is 1
00619     if ( $type eq 'source' ) {
00620       foreach my $target ( @{ $matrix->get_targets_for_source($eid) } )
00621       {
00622         if ( $matrix->get_score( $eid, $target ) > 0.9999 ) {
00623           next EXON;
00624         }
00625       }
00626     } else {
00627       foreach my $source ( @{ $matrix->get_sources_for_target($eid) } )
00628       {
00629         if ( $matrix->get_score( $source, $eid ) > 0.9999 ) {
00630           next EXON;
00631         }
00632       }
00633     }
00634 
00635     # write exon to fasta file
00636     print $fh '>', $eid, "\n", $exon->seq, "\n";
00637 
00638     $dumped_exons++;
00639 
00640   }
00641 
00642   close($fh);
00643 
00644   # log
00645   my $fmt = "%-30s%10s\n";
00646   my $size = -s $file;
00647   $self->logger->info(sprintf($fmt, 'Total exons:', $total_exons), 1);
00648   $self->logger->info(sprintf($fmt, 'Dumped exons:', $dumped_exons), 1);
00649   $self->logger->info(sprintf($fmt, 'Dump file size:', parse_bytes($size)), 1);
00650   $self->logger->info("Done.\n\n", 0, 'stamped');
00651 
00652   return $dumped_exons;
00653 }
00654 
00655 sub parse_exonerate_results {
00656   my ( $self, $exonerate_matrix ) = @_;
00657 
00658   unless ( defined($exonerate_matrix)
00659            &&
00660            $exonerate_matrix->isa(
00661                          'Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')
00662     )
00663   {
00664     throw('You must provide a ScoredMappingMatrix.');
00665   }
00666 
00667   $self->logger->info( "Parsing exonerate results...\n", 0, 'stamped' );
00668 
00669   # loop over all result files
00670   my $dump_path = $self->cache->dump_path;
00671   my $num_files = 0;
00672   my $num_lines = 0;
00673 
00674   opendir( DUMPDIR, $dump_path ) or
00675     $self->logger->error("Can't open $dump_path for reading: $!");
00676 
00677   my $penalised = 0;
00678   my $killed    = 0;
00679 
00680   while ( defined( my $file = readdir(DUMPDIR) ) ) {
00681     unless ( $file =~ /exonerate_map\.\d+/ ) { next }
00682 
00683     $num_files++;
00684 
00685     open( F, '<', "$dump_path/$file" );
00686 
00687     my $threshold = $self->conf->param('exonerate_threshold') || 0.5;
00688 
00689     while (<F>) {
00690       $num_lines++;
00691       chomp;
00692 
00693   # line format:
00694   # myinfo: source_id target_id match_length source_length target_length
00695       my ( undef, $source_id, $target_id, $match_length, $source_length,
00696            $target_length )
00697         = split;
00698 
00699       my $score = 0;
00700 
00701       if ( $source_length == 0 or $target_length == 0 ) {
00702         $self->logger->warning(
00703                "Alignment length is 0 for $source_id or $target_id.\n");
00704       }
00705       else {
00706         $score = 2*$match_length/( $source_length + $target_length );
00707 
00708       }
00709 
00710       if ( $score > $threshold ) {
00711         my $source_sr =
00712           $self->cache()
00713           ->get_by_key( 'exons_by_id', 'source', $source_id )
00714           ->seq_region_name();
00715         my $target_sr =
00716           $self->cache()
00717           ->get_by_key( 'exons_by_id', 'target', $target_id )
00718           ->seq_region_name();
00719 
00720         if ( $source_sr ne $target_sr ) {
00721           # PENALTY: The target and source are not on the same
00722           # seq_region.
00723           $score *= 0.70;
00724 
00725           # With a penalty of 0.7, exonerate scores need to be above
00726           # approximately 0.714 to make the 0.5 threshold.
00727 
00728           ++$penalised;
00729         }
00730 
00731         if ( $score > $threshold ) {
00732           $exonerate_matrix->add_score( $source_id, $target_id,
00733                                         $score );
00734         }
00735         else {
00736           ++$killed;
00737         }
00738       } ## end if ( $score > $threshold)
00739 
00740     } ## end while (<F>)
00741 
00742     close(F);
00743   } ## end while ( defined( my $file...))
00744 
00745   closedir(DUMPDIR);
00746 
00747   $self->logger->info(
00748         "Done parsing $num_lines lines from $num_files result files.\n",
00749         0, 'stamped' );
00750   $self->logger->info( "Penalised $penalised exon alignments " .
00751                          "for not being on the same seq_region " .
00752                          "($killed killed).\n",
00753                        0,
00754                        'stamped' );
00755 
00756   return $exonerate_matrix;
00757 } ## end sub parse_exonerate_results
00758 
00759 
00760 sub non_mapped_transcript_rescore {
00761   my $self                = shift;
00762   my $matrix              = shift;
00763   my $transcript_mappings = shift;
00764 
00765   # argument checks
00766   unless ($matrix
00767       and $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix') )
00768   {
00769     throw(
00770        'Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix of exons.');
00771   }
00772 
00773   unless ( $transcript_mappings
00774      and
00775      $transcript_mappings->isa('Bio::EnsEMBL::IdMapping::MappingList') )
00776   {
00777     throw(
00778          'Need a Bio::EnsEMBL::IdMapping::MappingList of transcripts.');
00779   }
00780 
00781   # create of lookup hash of mapped source transcripts to target
00782   # transcripts
00783   my %transcript_lookup =
00784     map { $_->source => $_->target }
00785     @{ $transcript_mappings->get_all_Entries };
00786 
00787   my $i = 0;
00788 
00789   foreach my $entry ( @{ $matrix->get_all_Entries } ) {
00790 
00791     my @source_transcripts = @{
00792       $self->cache->get_by_key( 'transcripts_by_exon_id', 'source',
00793                                 $entry->source ) };
00794     my @target_transcripts = @{
00795       $self->cache->get_by_key( 'transcripts_by_exon_id', 'target',
00796                                 $entry->target ) };
00797 
00798     my $found_mapped = 0;
00799 
00800   TR:
00801     foreach my $source_tr (@source_transcripts) {
00802       foreach my $target_tr (@target_transcripts) {
00803         my $mapped_target = $transcript_lookup{ $source_tr->id };
00804 
00805         if ( $mapped_target and ( $mapped_target == $target_tr->id ) ) {
00806           $found_mapped = 1;
00807           last TR;
00808         }
00809       }
00810     }
00811 
00812     unless ($found_mapped) {
00813       # PENALTY: The exon appears to belong to a transcript that has not
00814       # been mapped.
00815       $matrix->set_score( $entry->source(), $entry->target(),
00816                           0.9*$entry->score() );
00817       $i++;
00818     }
00819   } ## end foreach my $entry ( @{ $matrix...})
00820 
00821   $self->logger->debug( "Scored exons in non-mapped transcripts: $i\n",
00822                         1 );
00823 } ## end sub non_mapped_transcript_rescore
00824 
00825 1;