Archive Ensembl HomeArchive Ensembl Home
SyntenyFramework.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::IdMapping::SyntenyFramework - framework representing syntenic
00024 regions across the genome
00025 
00026 =head1 SYNOPSIS
00027 
00028   # build the SyntenyFramework from unambiguous gene mappings
00029   my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
00030     -DUMP_PATH  => $dump_path,
00031     -CACHE_FILE => 'synteny_framework.ser',
00032     -LOGGER     => $self->logger,
00033     -CONF       => $self->conf,
00034     -CACHE      => $self->cache,
00035   );
00036   $sf->build_synteny($gene_mappings);
00037 
00038   # use it to rescore the genes
00039   $gene_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
00040 
00041 =head1 DESCRIPTION
00042 
00043 The SyntenyFramework is a set of SyntenyRegions. These are pairs of
00044 locations very analoguous to the information in the assembly table (the
00045 locations dont have to be the same length though). They are built from
00046 genes that map uniquely between source and target.
00047 
00048 Once built, the SyntenyFramework is used to score source and target gene
00049 pairs to determine whether they are similar. This process is slow (it
00050 involves testing all gene pairs against all SyntenyRegions), this module
00051 therefor has built-in support to run the process in parallel via LSF.
00052 
00053 =head1 METHODS
00054 
00055   new
00056   build_synteny
00057   _by_overlap
00058   add_SyntenyRegion
00059   get_all_SyntenyRegions
00060   rescore_gene_matrix_lsf
00061   rescore_gene_matrix
00062   logger
00063   conf
00064   cache
00065 
00066 =cut
00067 
00068 package Bio::EnsEMBL::IdMapping::SyntenyFramework;
00069 
00070 use strict;
00071 use warnings;
00072 no warnings 'uninitialized';
00073 
00074 use Bio::EnsEMBL::IdMapping::Serialisable;
00075 our @ISA = qw(Bio::EnsEMBL::IdMapping::Serialisable);
00076 
00077 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00078 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00079 use Bio::EnsEMBL::Utils::ScriptUtils qw(path_append);
00080 use Bio::EnsEMBL::IdMapping::SyntenyRegion;
00081 use Bio::EnsEMBL::IdMapping::ScoredMappingMatrix;
00082 
00083 use FindBin qw($Bin);
00084 FindBin->again;
00085 
00086 
00087 =head2 new
00088 
00089   Arg [LOGGER]: Bio::EnsEMBL::Utils::Logger $logger - a logger object
00090   Arg [CONF]  : Bio::EnsEMBL::Utils::ConfParser $conf - a configuration object
00091   Arg [CACHE] : Bio::EnsEMBL::IdMapping::Cache $cache - a cache object
00092   Arg [DUMP_PATH] : String - path for object serialisation
00093   Arg [CACHE_FILE] : String - filename of serialised object
00094   Example     : my $sf = Bio::EnsEMBL::IdMapping::SyntenyFramework->new(
00095                   -DUMP_PATH    => $dump_path,
00096                   -CACHE_FILE   => 'synteny_framework.ser',
00097                   -LOGGER       => $self->logger,
00098                   -CONF         => $self->conf,
00099                   -CACHE        => $self->cache,
00100                 );
00101   Description : Constructor.
00102   Return type : Bio::EnsEMBL::IdMapping::SyntenyFramework
00103   Exceptions  : thrown on wrong or missing arguments
00104   Caller      : InternalIdMapper plugins
00105   Status      : At Risk
00106               : under development
00107 
00108 =cut
00109 
00110 sub new {
00111   my $caller = shift;
00112   my $class = ref($caller) || $caller;
00113   my $self = $class->SUPER::new(@_);
00114 
00115   my ($logger, $conf, $cache) = rearrange(['LOGGER', 'CONF', 'CACHE'], @_);
00116 
00117   unless ($logger and ref($logger) and
00118           $logger->isa('Bio::EnsEMBL::Utils::Logger')) {
00119     throw("You must provide a Bio::EnsEMBL::Utils::Logger for logging.");
00120   }
00121   
00122   unless ($conf and ref($conf) and
00123           $conf->isa('Bio::EnsEMBL::Utils::ConfParser')) {
00124     throw("You must provide configuration as a Bio::EnsEMBL::Utils::ConfParser object.");
00125   }
00126   
00127   unless ($cache and ref($cache) and
00128           $cache->isa('Bio::EnsEMBL::IdMapping::Cache')) {
00129     throw("You must provide configuration as a Bio::EnsEMBL::IdMapping::Cache object.");
00130   }
00131   
00132   # initialise
00133   $self->logger($logger);
00134   $self->conf($conf);
00135   $self->cache($cache);
00136   $self->{'cache'} = [];
00137 
00138   return $self;
00139 }
00140 
00141 
00142 =head2 build_synteny
00143 
00144   Arg[1]      : Bio::EnsEMBL::IdMapping::MappingList $mappings - gene mappings
00145                 to build the SyntenyFramework from
00146   Example     : $synteny_framework->build_synteny($gene_mappings);
00147   Description : Builds the SyntenyFramework from unambiguous gene mappings.
00148                 SyntenyRegions are allowed to overlap. At most two overlapping
00149                 SyntenyRegions are merged (otherwise we'd get too large
00150                 SyntenyRegions with little information content).
00151   Return type : none
00152   Exceptions  : thrown on wrong or missing argument
00153   Caller      : InternalIdMapper plugins
00154   Status      : At Risk
00155               : under development
00156 
00157 =cut
00158 
00159 sub build_synteny {
00160   my $self = shift;
00161   my $mappings = shift;
00162   
00163   unless ($mappings and
00164           $mappings->isa('Bio::EnsEMBL::IdMapping::MappingList')) {
00165     throw('Need a gene Bio::EnsEMBL::IdMapping::MappingList.');
00166   }
00167 
00168   # create a synteny region for each mapping
00169   my @synteny_regions = ();
00170 
00171   foreach my $entry (@{ $mappings->get_all_Entries }) {
00172     
00173     my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
00174       $entry->source);
00175     my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
00176       $entry->target);
00177 
00178     my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([
00179       $source_gene->start,
00180       $source_gene->end,
00181       $source_gene->strand,
00182       $source_gene->seq_region_name,
00183       $target_gene->start,
00184       $target_gene->end,
00185       $target_gene->strand,
00186       $target_gene->seq_region_name,
00187       $entry->score,
00188     ]);
00189 
00190     push @synteny_regions, $sr;
00191   }
00192 
00193   unless (@synteny_regions) {
00194     $self->logger->warning("No synteny regions could be identified.\n");
00195     return;
00196   }
00197 
00198   # sort synteny regions
00199   #my @sorted = sort _by_overlap @synteny_regions;
00200   my @sorted = reverse sort {
00201     $a->source_seq_region_name cmp $b->source_seq_region_name ||
00202     $a->source_start <=> $b->source_start ||
00203     $a->source_end <=> $b->source_end } @synteny_regions;
00204 
00205   $self->logger->info("SyntenyRegions before merging: ".scalar(@sorted)."\n");
00206   
00207   # now create merged regions from overlapping syntenies, but only merge a
00208   # maximum of 2 regions (otherwise you end up with large synteny blocks which
00209   # won't contain much information in this context)
00210   my $last_merged = 0;
00211   my $last_sr = shift(@sorted);
00212 
00213   while (my $sr = shift(@sorted)) {
00214     #$self->logger->debug("this ".$sr->to_string."\n");
00215   
00216     my $merged_sr = $last_sr->merge($sr);
00217 
00218     if (! $merged_sr) {
00219       unless ($last_merged) {
00220         $self->add_SyntenyRegion($last_sr->stretch(2));
00221         #$self->logger->debug("nnn  ".$last_sr->to_string."\n");
00222       }
00223       $last_merged = 0;
00224     } else {
00225       $self->add_SyntenyRegion($merged_sr->stretch(2));
00226       #$self->logger->debug("mmm  ".$merged_sr->to_string."\n");
00227       $last_merged = 1;
00228     }
00229     
00230     $last_sr = $sr;
00231   }
00232 
00233   # deal with last synteny region in @sorted
00234   unless ($last_merged) {
00235     $self->add_SyntenyRegion($last_sr->stretch(2));
00236     $last_merged = 0;
00237   }
00238 
00239   #foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
00240   #  $self->logger->debug("SRs ".$sr->to_string."\n");
00241   #}
00242   
00243   $self->logger->info("SyntenyRegions after merging: ".scalar(@{ $self->get_all_SyntenyRegions })."\n");
00244 
00245 }
00246 
00247 
00248 #
00249 # sort SyntenyRegions by overlap
00250 #
00251 sub _by_overlap {
00252   # first sort by seq_region
00253   my $retval = ($b->source_seq_region_name cmp $a->source_seq_region_name);
00254   return $retval if ($retval);
00255 
00256   # then sort by overlap:
00257   # return -1 if $a is downstream, 1 if it's upstream, 0 if they overlap
00258   if ($a->source_end < $b->source_start) { return 1; }
00259   if ($a->source_start < $b->source_end) { return -1; }
00260   return 0;
00261 }
00262 
00263 
00264 =head2 add_SyntenyRegion
00265 
00266   Arg[1]      : Bio::EnsEMBL::IdMaping::SyntenyRegion - SyntenyRegion to add
00267   Example     : $synteny_framework->add_SyntenyRegion($synteny_region);
00268   Description : Adds a SyntenyRegion to the framework. For speed reasons (and
00269                 since this is an internal method), no argument check is done.
00270   Return type : none
00271   Exceptions  : none
00272   Caller      : internal
00273   Status      : At Risk
00274               : under development
00275 
00276 =cut
00277 
00278 sub add_SyntenyRegion {
00279   push @{ $_[0]->{'cache'} }, $_[1];
00280 }
00281 
00282 
00283 =head2 get_all_SyntenyRegions
00284 
00285   Example     : foreach my $sr (@{ $sf->get_all_SyntenyRegions }) {
00286                   # do something with the SyntenyRegion
00287                 }
00288   Description : Get a list of all SyntenyRegions in the framework.
00289   Return type : Arrayref of Bio::EnsEMBL::IdMapping::SyntenyRegion
00290   Exceptions  : none
00291   Caller      : general
00292   Status      : At Risk
00293               : under development
00294 
00295 =cut
00296 
00297 sub get_all_SyntenyRegions {
00298   return $_[0]->{'cache'};
00299 }
00300 
00301 
00302 =head2 rescore_gene_matrix_lsf
00303 
00304   Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
00305                 scores to rescore
00306   Example     : my $new_scores = $sf->rescore_gene_matrix_lsf($gene_scores);
00307   Description : This method runs rescore_gene_matrix() (via the
00308                 synteny_resocre.pl script) in parallel with lsf, then combines
00309                 the results to return a single rescored scoring matrix.
00310                 Parallelisation is done by chunking the scoring matrix into
00311                 several pieces (determined by the --synteny_rescore_jobs
00312                 configuration option).
00313   Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
00314   Exceptions  : thrown on wrong or missing argument
00315                 thrown on filesystem I/O error
00316                 thrown on failure of one or mor lsf jobs
00317   Caller      : InternalIdMapper plugins
00318   Status      : At Risk
00319               : under development
00320 
00321 =cut
00322 
00323 sub rescore_gene_matrix_lsf {
00324   my $self = shift;
00325   my $matrix = shift;
00326 
00327   unless ($matrix and
00328           $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
00329     throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
00330   }
00331 
00332   # serialise SyntenyFramework to disk
00333   $self->logger->debug("Serialising SyntenyFramework...\n", 0, 'stamped');
00334   $self->write_to_file;
00335   $self->logger->debug("Done.\n", 0, 'stamped');
00336 
00337   # split the ScoredMappingMatrix into chunks and write to disk
00338   my $matrix_size = $matrix->size;
00339   $self->logger->debug("Scores before rescoring: $matrix_size.\n");
00340 
00341   my $num_jobs = $self->conf->param('synteny_rescore_jobs') || 20;
00342   $num_jobs++;
00343 
00344   my $dump_path = path_append($self->conf->param('basedir'),
00345     'matrix/synteny_rescore');
00346 
00347   $self->logger->debug("Creating sub-matrices...\n", 0, 'stamped');
00348   foreach my $i (1..$num_jobs) {
00349     my $start = (int($matrix_size/($num_jobs-1)) * ($i - 1)) + 1;
00350     my $end = int($matrix_size/($num_jobs-1)) * $i;
00351     $self->logger->debug("$start-$end\n", 1);
00352     my $sub_matrix = $matrix->sub_matrix($start, $end);
00353     
00354     $sub_matrix->cache_file_name("gene_matrix_synteny$i.ser");
00355     $sub_matrix->dump_path($dump_path);
00356     
00357     $sub_matrix->write_to_file;
00358   }
00359   $self->logger->debug("Done.\n", 0, 'stamped');
00360 
00361   # create an empty lsf log directory
00362   my $logpath = path_append($self->logger->logpath, 'synteny_rescore');
00363   system("rm -rf $logpath") == 0 or
00364     $self->logger->error("Unable to delete lsf log dir $logpath: $!\n");
00365   system("mkdir -p $logpath") == 0 or
00366     $self->logger->error("Can't create lsf log dir $logpath: $!\n");
00367 
00368   # build lsf command
00369   my $lsf_name = 'idmapping_synteny_rescore_'.time;
00370 
00371   my $options = $self->conf->create_commandline_options(
00372       logauto       => 1,
00373       logautobase   => "synteny_rescore",
00374       logpath       => $logpath,
00375       interactive   => 0,
00376       is_component  => 1,
00377   );
00378 
00379   my $cmd = qq{$Bin/synteny_rescore.pl $options --index \$LSB_JOBINDEX};
00380 
00381   my $bsub_cmd =
00382     sprintf( "|bsub -J%s[1-%d] "
00383                             . "-o %s/synteny_rescore.%%I.out "
00384                             . "-e %s/synteny_rescore.%%I.err %s",
00385              $lsf_name, $num_jobs, $logpath, $logpath,
00386              $self->conf()->param('lsf_opt_synteny_rescore') );
00387 
00388   # run lsf job array
00389   $self->logger->info("Submitting $num_jobs jobs to lsf.\n");
00390   $self->logger->debug("$cmd\n\n");
00391 
00392   local *BSUB;
00393   open( BSUB, $bsub_cmd )
00394     or $self->logger->error("Could not open open pipe to bsub: $!\n");
00395 
00396   print BSUB $cmd;
00397   $self->logger->error("Error submitting synteny rescoring jobs: $!\n")
00398     unless ($? == 0); 
00399   close BSUB;
00400 
00401   # submit dependent job to monitor finishing of jobs
00402   $self->logger->info("Waiting for jobs to finish...\n", 0, 'stamped');
00403 
00404   my $dependent_job = qq{bsub -K -w "ended($lsf_name)" -q small } .
00405     qq{-o $logpath/synteny_rescore_depend.out /bin/true};
00406 
00407   system($dependent_job) == 0 or
00408     $self->logger->error("Error submitting dependent job: $!\n");
00409 
00410   $self->logger->info("All jobs finished.\n", 0, 'stamped');
00411 
00412   # check for lsf errors
00413   sleep(5);
00414   my $err;
00415   foreach my $i (1..$num_jobs) {
00416     $err++ unless (-e "$logpath/synteny_rescore.$i.success");
00417   }
00418 
00419   if ($err) {
00420     $self->logger->error("At least one of your jobs failed.\nPlease check the logfiles at $logpath for errors.\n");
00421   }
00422 
00423   # merge and return matrix
00424   $self->logger->debug("Merging rescored matrices...\n");
00425   $matrix->flush;
00426 
00427   foreach my $i (1..$num_jobs) {
00428     # read partial matrix created by lsf job from file
00429     my $sub_matrix = Bio::EnsEMBL::IdMapping::ScoredMappingMatrix->new(
00430       -DUMP_PATH   => $dump_path,
00431       -CACHE_FILE  => "gene_matrix_synteny$i.ser",
00432     );
00433     $sub_matrix->read_from_file;
00434 
00435     # merge with main matrix
00436     $matrix->merge($sub_matrix);
00437   }
00438 
00439   $self->logger->debug("Done.\n");
00440   $self->logger->debug("Scores after rescoring: ".$matrix->size.".\n");
00441   
00442   return $matrix;
00443 }
00444 
00445 
00446 # 
00447 #
00448 =head2 rescore_gene_matrix
00449 
00450   Arg[1]      : Bio::EnsEMBL::IdMapping::ScoredmappingMatrix $matrix - gene
00451                 scores to rescore
00452   Example     : my $new_scores = $sf->rescore_gene_matrix($gene_scores);
00453   Description : Rescores a gene matrix. Retains 70% of old score and builds
00454                 other 30% from the synteny match.
00455   Return type : Bio::EnsEMBL::IdMapping::ScoredMappingMatrix
00456   Exceptions  : thrown on wrong or missing argument
00457   Caller      : InternalIdMapper plugins
00458   Status      : At Risk
00459               : under development
00460 
00461 =cut
00462 
00463 sub rescore_gene_matrix {
00464   my $self = shift;
00465   my $matrix = shift;
00466 
00467   unless ($matrix and
00468           $matrix->isa('Bio::EnsEMBL::IdMapping::ScoredMappingMatrix')) {
00469     throw('Need a Bio::EnsEMBL::IdMapping::ScoredMappingMatrix.');
00470   }
00471 
00472   my $retain_factor = 0.7;
00473 
00474   foreach my $entry (@{ $matrix->get_all_Entries }) {
00475     my $source_gene = $self->cache->get_by_key('genes_by_id', 'source',
00476       $entry->source);
00477 
00478     my $target_gene = $self->cache->get_by_key('genes_by_id', 'target',
00479       $entry->target);
00480 
00481     my $highest_score = 0;
00482 
00483     foreach my $sr (@{ $self->get_all_SyntenyRegions }) {
00484       my $score = $sr->score_location_relationship($source_gene, $target_gene);
00485       $highest_score = $score if ($score > $highest_score);
00486     }
00487 
00488     #$self->logger->debug("highscore ".$entry->to_string." ".
00489     #  sprintf("%.6f\n", $highest_score));
00490 
00491     $matrix->set_score($entry->source, $entry->target,
00492       ($entry->score * 0.7 + $highest_score * 0.3));
00493   }
00494 
00495   return $matrix;
00496 }
00497 
00498 
00499 =head2 logger
00500 
00501   Arg[1]      : (optional) Bio::EnsEMBL::Utils::Logger - the logger to set
00502   Example     : $object->logger->info("Starting ID mapping.\n");
00503   Description : Getter/setter for logger object
00504   Return type : Bio::EnsEMBL::Utils::Logger
00505   Exceptions  : none
00506   Caller      : constructor
00507   Status      : At Risk
00508               : under development
00509 
00510 =cut
00511 
00512 sub logger {
00513   my $self = shift;
00514   $self->{'_logger'} = shift if (@_);
00515   return $self->{'_logger'};
00516 }
00517 
00518 
00519 =head2 conf
00520 
00521   Arg[1]      : (optional) Bio::EnsEMBL::Utils::ConfParser - the configuration
00522                 to set
00523   Example     : my $basedir = $object->conf->param('basedir');
00524   Description : Getter/setter for configuration object
00525   Return type : Bio::EnsEMBL::Utils::ConfParser
00526   Exceptions  : none
00527   Caller      : constructor
00528   Status      : At Risk
00529               : under development
00530 
00531 =cut
00532 
00533 sub conf {
00534   my $self = shift;
00535   $self->{'_conf'} = shift if (@_);
00536   return $self->{'_conf'};
00537 }
00538 
00539 
00540 =head2 cache
00541 
00542   Arg[1]      : (optional) Bio::EnsEMBL::IdMapping::Cache - the cache to set
00543   Example     : $object->cache->read_from_file('source');
00544   Description : Getter/setter for cache object
00545   Return type : Bio::EnsEMBL::IdMapping::Cache
00546   Exceptions  : none
00547   Caller      : constructor
00548   Status      : At Risk
00549               : under development
00550 
00551 =cut
00552 
00553 sub cache {
00554   my $self = shift;
00555   $self->{'_cache'} = shift if (@_);
00556   return $self->{'_cache'};
00557 }
00558 
00559 
00560 1;
00561