Archive Ensembl HomeArchive Ensembl Home
AssemblyProjector.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 =head1 NAME
00022 
00023 Bio::EnsEMBL::Utils::AssemblyProjector -
00024 utility class to post-process projections from one assembly to another
00025 
00026 =head1 SYNOPSIS
00027 
00028   # connect to an old database
00029   my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor(
00030     -host   => 'ensembldb.ensembl.org',
00031     -port   => 3306,
00032     -user   => ensro,
00033     -dbname => 'mus_musculus_core_46_36g',
00034     -group  => 'core_old',
00035   );
00036 
00037   # connect to the new database containing the mapping between old and
00038   # new assembly
00039   my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor(
00040     -host   => 'ensembldb.ensembl.org',
00041     -port   => 3306,
00042     -user   => ensro,
00043     -dbname => 'mus_musculus_core_47_37',
00044     -group  => 'core_new',
00045   );
00046 
00047   my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new(
00048     -OLD_ASSEMBLY    => 'NCBIM36',
00049     -NEW_ASSEMBLY    => 'NCBIM37',
00050     -ADAPTOR         => $dba_new,
00051     -EXTERNAL_SOURCE => 1,
00052     -MERGE_FRAGMENTS => 1,
00053     -CHECK_LENGTH    => 0,
00054   );
00055 
00056   # fetch a slice on the old assembly
00057   my $slice_adaptor = $dba_old->get_SliceAdaptor;
00058   my $slice =
00059     $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef,
00060     undef, 'NCBIM36' );
00061 
00062   my $new_slice = $assembly_projector->old_to_new($slice);
00063 
00064   print $new_slice->name, " (", $assembly_projector->last_status, ")\n";
00065 
00066 =head1 DESCRIPTION
00067 
00068 This class implements some utility functions for converting coordinates
00069 between assemblies. A mapping between the two assemblies has to present
00070 the database for this to work, see the 'Related Modules' section below
00071 on how to generate the mapping.
00072 
00073 In addition to the "raw" projecting of features and slices, the methods
00074 in this module also apply some sensible rules to the results of the
00075 projection (like discarding unwanted results or merging fragmented
00076 projections). These are the rules (depending on configuration):
00077 
00078 Discard the projected feature/slice if:
00079 
00080   1. it doesn't project at all (no segments returned)
00081   2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
00082      than one segment)
00083   3. [if CHECK_LENGTH is set] the projection doesn't have the same
00084      length as the original feature/slice
00085   4. all segments are on same chromosome and strand
00086 
00087 If a projection fails any of these rules, undef is returned instead of
00088 a projected feature/slice. You can use the last_status() method to find
00089 out about the results of the rules tests.
00090 
00091 Also note that when projecting features, only a shallow projection is
00092 performed, i.e. other features attached to your features (e.g. the
00093 transcripts of a gene) are not projected automatically, so it will be
00094 the responsability of the user code project all levels of features
00095 involved.
00096 
00097 =head1 METHODS
00098 
00099   new
00100   project
00101   old_to_new
00102   new_to_old
00103   adaptor
00104   external_source
00105   old_assembly
00106   new_assembly
00107   merge_fragments
00108   check_length
00109 
00110 =head1 RELATED MODULES
00111 
00112 The process of creating a whole genome alignment between two assemblies
00113 (which is the basis for the use of the methods in this class) is done by
00114 a series of scripts. Please see
00115 
00116   ensembl/misc-scripts/assembly/README
00117 
00118 for a high-level description of this process, and POD in the individual
00119 scripts for the details.
00120 
00121 =cut
00122 
00123 package Bio::EnsEMBL::Utils::AssemblyProjector;
00124 
00125 use strict;
00126 use warnings;
00127 no warnings qw(uninitialized);
00128 
00129 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00130 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00131 use Bio::EnsEMBL::Slice;
00132 use Scalar::Util qw(weaken);
00133 
00134 =head2 new
00135 
00136   Arg [ADAPTOR]         : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor
00137                           for a database containing the assembly mapping
00138   Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if
00139                           source is from a different database
00140   Arg [OLD_ASSEMBLY]    : name of the old assembly
00141   Arg [OLD_ASSEMBLY]    : name of the new assembly
00142   Arg [OBJECT_TYPE]     : (optional) object type ('slice' or 'feature')
00143   Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged
00144                           to return a single object spanning all segments
00145                           (default: true)
00146   Arg [CHECK_LENGTH]    : (optional) Boolean - determines if projected objects
00147                           have to have same length as original (default: false)
00148   Example     : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new(
00149                   -DBADAPTOR    => $dba,
00150                   -OLD_ASSEMBLY => NCBIM36,
00151                   -NEW_ASSEMBLY => NCBIM37,
00152                 );
00153   Description : Constructor.
00154   Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object
00155   Exceptions  : thrown on missing arguments
00156                 thrown on invalid OBJECT_TYPE
00157   Caller      : general
00158   Status      : At Risk
00159               : under development
00160 
00161 =cut
00162 
00163 sub new {
00164   my $caller = shift;
00165   my $class = ref($caller) || $caller;
00166 
00167   my ($adaptor, $external_source, $old_assembly, $new_assembly,
00168       $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE 
00169         OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_);
00170 
00171   unless ($adaptor and ref($adaptor) and
00172           $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
00173     throw("You must provide a DBAdaptor to a database containing the assembly mapping.");
00174   }
00175 
00176   unless ($old_assembly and $new_assembly) {
00177     throw("You must provide an old and new assembly name.");
00178   }
00179 
00180   my $self = {};
00181   bless ($self, $class);
00182 
00183   # initialise
00184   $self->adaptor($adaptor);
00185   $self->{'old_assembly'} = $old_assembly;
00186   $self->{'new_assembly'} = $new_assembly;
00187   
00188   # by default, merge fragments
00189   $self->{'merge_fragments'} = $merge_fragments || 1;
00190 
00191   # by default, do not check length
00192   $self->{'check_length'} = $check_length || 0;
00193 
00194   # by default, features and slices are expected in same database as the 
00195   # assembly mapping
00196   $self->{'external_source'} = $external_source || 0;
00197 
00198   return $self;
00199 }
00200 
00201 
00202 =head2 project
00203 
00204   Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
00205                 the object to project
00206   Arg[2]      : String $to_assembly - assembly to project to
00207   Example     : my $new_slice = $assembly_projector->project($old_slice, 
00208                   'NCBIM37');
00209   Description : Projects a Slice or Feature to the specified assembly.
00210 
00211                 Several tests are performed on the result to discard unwanted
00212                 results. All projection segments have to be on the same
00213                 seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be
00214                 bridged by creating a single object from first_segment_start to
00215                 last_segment_end. If -CHECK_LENGTH is set, the projected object
00216                 will have to have the same length as the original. You can use
00217                 the last_status() method to find out what the result of some of
00218                 these rule tests were. Please see the comments in the code for
00219                 more details about these rules.
00220 
00221                 The return value of this method will always be a single object,
00222                 or undef if the projection fails any of the rules.
00223                 
00224                 Note that when projecting features, only a "shallow" projection
00225                 is performed, i.e. attached features aren't projected
00226                 automatically! (e.g. if you project a gene, its transcripts will
00227                 have to be projected manually before storing the new gene)
00228   Return type : same a Arg 1, or undef if projection fails any of the rules
00229   Exceptions  : thrown on invalid arguments
00230   Caller      : general, $self->old_to_new, $self->new_to_old
00231   Status      : At Risk
00232               : under development
00233 
00234 =cut
00235 
00236 sub project {
00237   my ($self, $object, $to_assembly) = @_;
00238 
00239   throw("Need an assembly version to project to.") unless ($to_assembly);
00240   throw("Need an object to project.") unless ($object and ref($object));
00241 
00242   my ($slice, $object_type);
00243 
00244   if ($object->isa('Bio::EnsEMBL::Feature')) {
00245     $object_type = 'feature';
00246   } elsif ($object->isa('Bio::EnsEMBL::Slice')) {
00247     $object_type = 'slice';
00248   } else {
00249     throw("Need a Feature or Slice to project.");
00250   }
00251 
00252   # if the feature or slice is sourced from another db, we have to "transfer"
00253   # it to the db that contains the assembly mapping. the transfer is very 
00254   # shallow but that should do for our purposes
00255   if ($self->external_source) {
00256     my $slice_adaptor = $self->adaptor->get_SliceAdaptor;
00257 
00258     if ($object_type eq 'feature') {
00259     
00260       # createa a new slice from the target db
00261       my $f_slice = $object->slice;
00262       my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name);
00263       
00264       # now change the feature so that it appears it's from the target db
00265       $object->slice($target_slice);
00266     
00267     } else {
00268     
00269       # createa a new slice from the target db
00270       $object = $slice_adaptor->fetch_by_name($object->name);
00271       
00272     }
00273   }
00274 
00275   if ($object_type eq 'feature') {
00276     $slice = $object->feature_Slice;
00277   } else {
00278     $slice = $object;
00279   }
00280 
00281   # warn if trying to project to assembly version the object already is on
00282   if ($slice->coord_system->version eq $to_assembly) {
00283     warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.").");
00284   }
00285 
00286   # now project the slice
00287   my $cs_name = $slice->coord_system_name;
00288   my @segments = @{ $slice->project($cs_name, $to_assembly) };
00289 
00290   # we need to reverse the projection segment list if the orignial 
00291   if ($slice->strand == -1) {
00292     @segments = reverse(@segments);
00293   }
00294 
00295   # apply rules to projection results
00296   # 
00297   # discard the projected feature/slice if
00298   #   1. it doesn't project at all (no segments returned)
00299   #   2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
00300   #      than one segment)
00301   #   3. [if CHECK_LENGTH is set] the projection doesn't have the same length
00302   #      as the original feature/slice
00303   #   4. all segments are on same chromosome and strand
00304 
00305   # keep track of the status of applied rules
00306   my @status = ();
00307 
00308   # test (1)
00309   return undef unless (@segments);
00310   #warn "DEBUG: passed test 1\n";
00311 
00312   # test (2)
00313   return undef if (!($self->merge_fragments) and scalar(@segments) > 1);
00314   push @status, 'fragmented' if (scalar(@segments) > 1);
00315   #warn "DEBUG: passed test 2\n";
00316 
00317   # test (3)
00318   my $first_slice = $segments[0]->to_Slice;
00319   my $last_slice = $segments[-1]->to_Slice;
00320   my $length_mismatch = (($last_slice->end - $first_slice->start + 1) !=
00321     $object->length);
00322   return undef if ($self->check_length and $length_mismatch);
00323   push @status, 'length_mismatch' if ($length_mismatch);
00324   #warn "DEBUG: passed test 3\n";
00325 
00326   # test (4)
00327   my %sr_names = ();
00328   my %strands = ();
00329   foreach my $seg (@segments) {
00330     my $sl = $seg->to_Slice;
00331     $sr_names{$sl->seq_region_name}++;
00332     $strands{$sl->strand}++;
00333   }
00334   return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1);
00335   #warn "DEBUG: passed test 4\n";
00336 
00337   # remember rule status
00338   $self->last_status(join('|', @status));
00339 
00340   # everything looks fine, so adjust the coords of your feature/slice
00341   my $new_slice = $first_slice;
00342   $new_slice->{'end'} = $last_slice->end;
00343 
00344   if ($object_type eq 'slice') {
00345     return $new_slice;
00346   } else {
00347   
00348     $object->start($new_slice->start);
00349     $object->end($new_slice->end);
00350     $object->strand($new_slice->strand);
00351     $object->slice($new_slice->seq_region_Slice);
00352 
00353     # undef dbID and adaptor so you can store the feature in the target db
00354     $object->dbID(undef);
00355     $object->adaptor(undef);
00356 
00357     return $object;
00358   }
00359   
00360 }
00361 
00362 
00363 =head2 old_to_new
00364 
00365   Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
00366                 the object to project
00367   Example     : my $new_slice = $assembly_projector->old_to_new($old_slice);
00368   Description : Projects a Slice or Feature from old to new assembly.
00369                 This method is just a convenience wrapper for $self->project.
00370   Return type : same a Arg 1, or undef
00371   Exceptions  : none
00372   Caller      : general
00373   Status      : At Risk
00374               : under development
00375 
00376 =cut
00377 
00378 sub old_to_new {
00379   my ($self, $object) = @_;
00380   return $self->project($object, $self->new_assembly);
00381 }
00382 
00383 
00384 =head2 new_to_old
00385 
00386   Arg[1]      : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
00387                 the object to project
00388   Example     : my $old_slice = $assembly_projector->new_to_old($new_slice, 1);
00389   Description : Projects a Slice or Feature from new to old assembly.
00390                 This method is just a convenience wrapper for $self->project.
00391   Return type : same a Arg 1, or undef
00392   Exceptions  : none
00393   Caller      : general
00394   Status      : At Risk
00395               : under development
00396 
00397 =cut
00398 
00399 sub new_to_old {
00400   my ($self, $object) = @_;
00401   return $self->project($object, $self->old_assembly);
00402 }
00403 
00404 
00405 #
00406 # accessors
00407 #
00408 sub adaptor {
00409   my $self = shift;
00410   weaken($self->{'adaptor'} = shift) if (@_);
00411   return $self->{'adaptor'};
00412 }
00413 
00414 
00415 sub external_source {
00416   my $self = shift;
00417   $self->{'external_source'} = shift if (@_);
00418   return $self->{'external_source'};
00419 }
00420 
00421 
00422 sub old_assembly {
00423   my $self = shift;
00424   $self->{'old_assembly'} = shift if (@_);
00425   return $self->{'old_assembly'};
00426 }
00427 
00428 
00429 sub new_assembly {
00430   my $self = shift;
00431   $self->{'new_assembly'} = shift if (@_);
00432   return $self->{'new_assembly'};
00433 }
00434 
00435 
00436 sub merge_fragments {
00437   my $self = shift;
00438   $self->{'merge_fragments'} = shift if (@_);
00439   return $self->{'merge_fragments'};
00440 }
00441 
00442 
00443 sub check_length {
00444   my $self = shift;
00445   $self->{'check_length'} = shift if (@_);
00446   return $self->{'check_length'};
00447 }
00448 
00449 
00450 sub last_status {
00451   my $self = shift;
00452   $self->{'last_status'} = shift if (@_);
00453   return $self->{'last_status'};
00454 }
00455 
00456 
00457 1;
00458 
00459