Archive Ensembl HomeArchive Ensembl Home
Slice.pm
Go to the documentation of this file.
00001 #
00002 # Ensembl module for Bio::EnsEMBL::Compara::AlignSlice::Slice
00003 #
00004 # Original author: Javier Herrero <jherrero@ebi.ac.uk>
00005 #
00006 # Copyright EnsEMBL Team
00007 #
00008 # You may distribute this module under the same terms as perl itself
00009 
00010 # pod documentation - main docs before the code
00011 
00012 =head1 NAME
00013 
00014 Bio::EnsEMBL::Compara::AlignSlice::Slice - These objects contain all the information needed
00015 for mapping features through genomic alignments
00016 
00017 =head1 INHERITING
00018 
00019 This module inherits methods and attributes from Bio::EnsEMBL::Slice module.
00020 
00021 =head1 SYNOPSIS
00022   
00023 
00024 SET VALUES
00025 
00026 GET VALUES
00027 
00028 =head1 OBJECT ATTRIBUTES
00029 
00030 =over
00031 
00032 =item attribute
00033 
00034 Description
00035 
00036 =back
00037 
00038 =head1 AUTHORS
00039 
00040 Javier Herrero (jherrero@ebi.ac.uk)
00041 
00042 =head1 COPYRIGHT
00043 
00044 Copyright (c) 1999-2012. EnsEMBL Team
00045 
00046 You may distribute this module under the same terms as perl itself
00047 
00048 =head1 CONTACT
00049 
00050 This modules is part of the EnsEMBL project (http://www.ensembl.org)
00051 
00052 Questions can be posted to the ensembl-dev mailing list:
00053 dev@ensembl.org
00054 
00055 =head1 APPENDIX
00056 
00057 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
00058 
00059 =cut
00060 
00061 
00062 # Let the code begin...
00063 
00064 
00065 package Bio::EnsEMBL::Compara::AlignSlice::Slice;
00066 
00067 use strict;
00068 use Bio::EnsEMBL::Slice;
00069 use Bio::EnsEMBL::CoordSystem;
00070 use Bio::EnsEMBL::Compara::AlignSlice::Translation;
00071 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00072 use Bio::EnsEMBL::Utils::Exception qw(throw warning info verbose);
00073 use Scalar::Util qw(weaken);
00074 
00075 
00076 our @ISA = qw(Bio::EnsEMBL::Slice);
00077 
00078 ## Creates a new coordinate system for creating gap Slices.
00079 my $gap_coord_system = new Bio::EnsEMBL::CoordSystem(
00080         -NAME => 'gap',
00081         -VERSION => "none",
00082         -TOP_LEVEL => 0,
00083         -SEQUENCE_LEVEL => 1,
00084         -RANK => 1,
00085     );
00086 
00087 
00088 =head2 new (CONSTRUCTOR)
00089 
00090   Arg[1]     : 
00091   Example    : 
00092   Description: 
00093   Returntype : 
00094   Exceptions : 
00095   Caller     : Bio::EnsEMBL::Compara::AlignSlice->_create_underlying_Slices()
00096 
00097 =cut
00098 
00099 sub new {
00100   my ($class, @args) = @_;
00101 
00102   my $self = {};
00103   bless $self,$class;
00104 
00105   my ($length, $requesting_slice, $align_slice, $method_link_species_set, $genome_db, $expanded) =
00106       rearrange([qw(
00107           LENGTH REQUESTING_SLICE ALIGN_SLICE METHOD_LINK_SPECIES_SET GENOME_DB EXPANDED
00108       )], @args);
00109 
00110   my $version = "";
00111   if ($requesting_slice and ref($requesting_slice) and
00112       $requesting_slice->isa("Bio::EnsEMBL::Slice")) {
00113     $self->{'requesting_slice'} = $requesting_slice;
00114     my $name = $requesting_slice->name;
00115     $name =~ s/\:/_/g;
00116     $version .= $name;
00117   }
00118   weaken($self->{_align_slice} = $align_slice) if ($align_slice);
00119   if ($method_link_species_set and ref($method_link_species_set) and
00120       $method_link_species_set->isa("Bio::EnsEMBL::Compara::MethodLinkSpeciesSet")) {
00121     $self->{_method_link_species_set} = $method_link_species_set;
00122     $version .= "+".$method_link_species_set->method_link_type;
00123     my $species_set = $method_link_species_set->species_set();
00124     if ($species_set) {
00125       $species_set = [sort {$a->name cmp $b->name} @{$species_set}];
00126       $version .= "(\"".join("\"+\"", map {$_->name} @$species_set)."\")";
00127     }
00128     if ($expanded) {
00129       $version .= "+expanded";
00130     } else {
00131       $version .= "+condensed";
00132     }
00133   }
00134   my $coord_system = new Bio::EnsEMBL::CoordSystem(
00135           -NAME => 'align_slice',
00136           -VERSION => $version,
00137           -TOP_LEVEL => 0,
00138           -SEQUENCE_LEVEL => 1,
00139           -RANK => 1,
00140       );
00141 
00142   $self->{start} = 1;
00143   $self->{end} = $length;
00144   $self->{strand} = 1;
00145   $self->{adaptor} = undef;
00146   $self->{coord_system} = $coord_system;
00147   $self->genome_db($genome_db) if (defined($genome_db));
00148   $self->{seq_region_name} = (eval{$genome_db->name} or "FakeAlignSlice");
00149   $self->{display_Slice_name} = $self->{seq_region_name};
00150   $self->{display_Slice_name} =~ s/ /_/g;
00151   $self->{seq_region_length} = $length;
00152   $self->{slice_mapper_pairs} = [];
00153 #   $self->{located_slices} = [];
00154 
00155   if (!$self->genome_db) {
00156     throw("You must specify a Bio::EnsEMBL::Compara::GenomeDB when\n".
00157         "creating a Bio::EnsEMBL::Compara::AlignSlice::Slice object");
00158   }
00159 
00160   return $self;
00161 }
00162 
00163 
00164 =head2 genome_db
00165 
00166   Arg[1]     : Bio::EnsEMBL::Compara::GenomeDB $genome_db
00167   Example    : $slice->genome_db($human_gdb);
00168   Description: getter/setter for the attribute genome_db
00169   Returntype : Bio::EnsEMBL::Compara::GenomeDB object
00170   Exceptions : This attribute should never be unset. Several methods
00171                rely on this attribute.
00172   Caller     : $object->methodname
00173 
00174 =cut
00175 
00176 sub genome_db {
00177   my ($self, $genome_db) = @_;
00178 
00179   if (defined($genome_db)) {
00180     throw("[$genome_db] must bu a Bio::EnsEMBL::Compara::GenomeDB object")
00181       unless ($genome_db and ref($genome_db) and $genome_db->isa("Bio::EnsEMBL::Compara::GenomeDB"));
00182     $self->{genome_db} = $genome_db;
00183   }
00184 
00185   return $self->{genome_db};
00186 }
00187 
00188 
00189 =head2 display_Slice_name
00190 
00191   Arg[1]     : string $name
00192   Example    : $slice->display_Slice_name("Homo_sapiens");
00193   Description: getter/setter for the attribute display_Slice_name
00194   Returntype : string
00195   Caller     : $object->methodname
00196 
00197 =cut
00198 
00199 sub display_Slice_name {
00200   my ($self, $display_slice_name) = @_;
00201 
00202   if (defined($display_slice_name)) {
00203     $self->{display_Slice_name} = $display_slice_name;
00204   }
00205 
00206   return ucfirst $self->{display_Slice_name};
00207 }
00208 
00209 
00210 =head2 add_Slice_Mapper_pair
00211 
00212   Arg[1]     : Bio::EnsEMBL::Slice $slice
00213   Arg[2]     : Bio::EnsEMBL::Mapper $mapper
00214   Arg[3]     : integer $start
00215   Arg[4]     : integer $end
00216   Arg[5]     : integer $strand
00217   Example    : $slice->add_Slice_Mapper_pair($slice, $mapper, 124, 542, -1);
00218   Description: Attaches a pair of Slice and the corresponding Mapper
00219                to this Bio::EnsEMBL::Compara::AlginSlice::Slice
00220                object. The Mapper contains the information for
00221                mapping coordinates from (to) the Slice to (from) the
00222                Bio::EnsEMBL::Compara::AlignSlice. Start, end and strand
00223                locates the $slice in the AlignSlice::Slice.
00224                $start and $end refer to the Coordinate System. If you are using
00225                a sub_Slice, this method will be using the coordinates of the
00226                original Bio::EnsEMBL::Compara::AlignSlice::Slice object!
00227   Returntype : 
00228   Exceptions : throws if $slice is not a Bio::EnsEMBL::Slice object
00229   Exceptions : throws if $mapper is not a Bio::EnsEMBL::Mapper object
00230 
00231 =cut
00232 
00233 sub add_Slice_Mapper_pair {
00234   my ($self, $slice ,$mapper, $start, $end, $strand) = @_;
00235 
00236   if (!$slice or !ref($slice) or !$slice->isa("Bio::EnsEMBL::Slice")) {
00237     throw("[$slice] must be a Bio::EnsEMBL::Slice object");
00238   }
00239   if (!$mapper or !ref($mapper) or !$mapper->isa("Bio::EnsEMBL::Mapper")) {
00240     throw("[$slice] must be a Bio::EnsEMBL::Mapper object");
00241   }
00242 
00243   push(@{$self->{slice_mapper_pairs}}, {
00244           slice => $slice,
00245           mapper => $mapper,
00246           start => $start,
00247           end => $end,
00248           strand => $strand,
00249       });
00250 
00251   return $self->{slice_mapper_pairs};
00252 }
00253 
00254 
00255 =head2 get_all_Slice_Mapper_pairs
00256 
00257   Arg[1]     : [optional] bool $get_gap_slices
00258   Example    : $slice_mapper_pairs = $slice->get_all_Slice_Mapper_pairs();
00259   Description: Returns all pairs of Slices and Mappers attached to this
00260                Bio::EnsEMBL::Compara::AlignSlice::Slice
00261                The $get_gap_slice flag is normally used internally to get the
00262                the gap slices. These are created when the alignment(s)
00263                underlying the AlignSlice correspond to gaps only in one or more
00264                species. These are used to tell the difference between gap due to
00265                lack of alignments and gaps due to the alignments. If you set this
00266                this flag to true you will get these gap slices back but it is your
00267                responsability to deal with these gap slices properly.
00268   Returntype : list ref of hashes which keys are "slice", "mapper", "start",
00269                "end" and "strand". Each hash corresponds to a pair of Slice
00270                and Mapper and the coordintes needed to locate the Slice in
00271                the AlignSlice::Slice.
00272                start and end refer to the Coordinate System. If you are using
00273                a sub_Slice, this method will be using the coordinates of the
00274                original Bio::EnsEMBL::Compara::AlignSlice::Slice object!
00275   Exceptions : return a ref to an empty list if no pairs have been attached
00276                so far.
00277 
00278 =cut
00279 
00280 sub get_all_Slice_Mapper_pairs {
00281   my ($self, $get_gap_slices) = @_;
00282   my $slice_mapper_pairs = ($self->{slice_mapper_pairs} or []);
00283   if (!$get_gap_slices) {
00284     $slice_mapper_pairs = [grep {$_->{slice}->coord_system_name ne "alignment"} @$slice_mapper_pairs];
00285   }
00286 
00287   return $slice_mapper_pairs;
00288 }
00289 
00290 
00291 =head2 get_all_Genes_by_type
00292 
00293   Arg [1]    : string $type
00294                The biotype of genes wanted.
00295   Arg [2]    : (optional) string $logic_name
00296   Arg [3]    : (optional) boolean $load_transcripts
00297                If set to true, transcripts will be loaded immediately rather
00298                than being lazy-loaded on request.  This will result in a
00299                significant speed up if the Transcripts and Exons are going to
00300                be used (but a slow down if they are not).
00301   Example    : @genes = @{$slice->get_all_Genes_by_type('protein_coding',
00302                'ensembl')};
00303   Description: Retrieves genes that overlap this slice of biotype $type.
00304                This is primarily used by the genebuilding code when several
00305                biotypes of genes are used.
00306 
00307                The logic name is the analysis of the genes that are retrieved.
00308                If not provided all genes will be retrieved instead.
00309 
00310                This methods overwrites the core one since it sends a warning
00311                message and return an empty array because this AlignSlice::Slice
00312                object has no adaptor. This implementation calls the
00313                get_all_Genes methdo elsewhere in this module to fulfil the
00314                query.
00315 
00316   Returntype : listref of Bio::EnsEMBL::Genes
00317   Exceptions : none
00318   Caller     : genebuilder, general
00319   Status     : Stable
00320 
00321 =cut
00322 
00323 sub get_all_Genes_by_type{
00324   my ($self, $type, $logic_name, $load_transcripts) = @_;
00325 
00326   my @out = grep { $_->biotype eq $type } 
00327     @{ $self->get_all_Genes($logic_name, undef, $load_transcripts)};
00328 
00329   return \@out;
00330 }
00331 
00332 
00333 =head2 get_all_Genes
00334 
00335   Arg [1]    : (optional) string $logic_name
00336                The name of the analysis used to generate the genes to retrieve
00337   Arg [2]    : (optional) string $dbtype
00338                The dbtype of genes to obtain.
00339   Arg [3]    : (optional) boolean $load_transcripts
00340                This option is always disabled for AlingSlices. It only exists for
00341                compatibility with the Bio::EnsEMBL::Slice objects.
00342   Example    : @genes = @{$slice->get_all_Genes};
00343   Description: Retrieves all genes that overlap this slice.
00344   Returntype : listref of Bio::EnsEMBL::Genes
00345   Exceptions : none
00346   Caller     : none
00347 
00348 =cut
00349 
00350 sub get_all_Genes {
00351   my ($self, $logic_name, $dbtype, $load_transcripts, @parameters) = @_;
00352 
00353   $logic_name ||= "";
00354   $dbtype ||= "core";
00355   my ($max_repetition_length,
00356       $strict_order_of_exon_pieces,
00357       $strict_order_of_exons,
00358       $return_unmapped_exons) = rearrange([qw(
00359           MAX_REPETITION_LENGTH
00360           STRICT_ORDER_OF_EXON_PIECES
00361           STRICT_ORDER_OF_EXONS
00362           RETURN_UNMAPPED_EXONS
00363       )], @parameters);
00364   my $max_gap_length = 0;
00365   my $max_intron_length = 0;
00366 
00367   $max_repetition_length = 100 if (!defined($max_repetition_length));
00368   $strict_order_of_exon_pieces = 1 if (!defined($strict_order_of_exon_pieces));
00369   $strict_order_of_exons = 0 if (!defined($strict_order_of_exons));
00370   $return_unmapped_exons = 1 if (!defined($return_unmapped_exons));
00371 
00372   my $key = 'key_'.
00373       $dbtype.":".
00374       $logic_name.":".
00375       $max_repetition_length.":".
00376       $strict_order_of_exon_pieces.":".
00377       $strict_order_of_exons.":".
00378       $return_unmapped_exons;
00379 
00380   if (!defined($self->{$key})) {
00381     my $all_genes = [];
00382 
00383     my $all_pairs = $self->get_all_Slice_Mapper_pairs;
00384     return [] if (!$all_pairs or !@$all_pairs);
00385 
00386     ## Create larger Slices in order to speed up fetching of genes
00387     my $all_slices_coordinates;
00388     my $this_slice_coordinates;
00389     my $this_slice_adaptor = $all_pairs->[0]->{slice}->adaptor;
00390     foreach my $pair (sort {
00391             $a->{slice}->seq_region_name cmp $b->{slice}->seq_region_name or
00392             $a->{slice}->start <=> $b->{slice}->start } @$all_pairs) {
00393 # print STDERR "Foreach pair ($pair)... $pair->{slice}->{seq_region_name} $pair->{slice}->{start}\n";
00394             
00395       my $this_slice = $pair->{slice};
00396       if ($this_slice_coordinates and
00397           ($this_slice_coordinates->{seq_region_name} eq $this_slice->seq_region_name) and
00398           (($this_slice->start - $this_slice_coordinates->{end}) < 10000000)) {
00399         ## lengthen current slice_coordinates
00400         $this_slice_coordinates->{end} = $this_slice->end;
00401       } else {
00402         ## save a deep copy (if needed) and reset current slice_coordinates
00403         if ($this_slice_coordinates->{seq_region_name}) {
00404           my $new_slice_coordinates = {
00405                   "coord_system_name" => $this_slice_coordinates->{coord_system_name},
00406                   "seq_region_name" => $this_slice_coordinates->{seq_region_name},
00407                   "start" => $this_slice_coordinates->{start},
00408                   "end" => $this_slice_coordinates->{end},
00409                   "pairs" => $this_slice_coordinates->{pairs}
00410               };
00411           push(@$all_slices_coordinates, $new_slice_coordinates);
00412         }
00413         $this_slice_coordinates->{coord_system_name} = $this_slice->coord_system_name;
00414         $this_slice_coordinates->{seq_region_name} = $this_slice->seq_region_name;
00415         $this_slice_coordinates->{start} = $this_slice->start;
00416         $this_slice_coordinates->{end} = $this_slice->end;
00417         $this_slice_coordinates->{pairs} = [];
00418       }
00419       ## Add this pair to the set of pairs of the current slice_coordinates
00420       push(@{$this_slice_coordinates->{pairs}}, $pair);
00421     }
00422     push(@$all_slices_coordinates, $this_slice_coordinates) if ($this_slice_coordinates);
00423 
00424     foreach $this_slice_coordinates (@$all_slices_coordinates) {
00425 # print STDERR "Foreach this_slice_coordinates...\n";
00426       my $this_slice = $this_slice_adaptor->fetch_by_region(
00427               $this_slice_coordinates->{coord_system_name},
00428               $this_slice_coordinates->{seq_region_name},
00429               $this_slice_coordinates->{start},
00430               $this_slice_coordinates->{end}
00431           );
00432 
00433       ## Do not load transcripts immediately or the cache could produce
00434       ## some troubles in some special cases! Moreover, in this way we will have
00435       ## the genes, the transcripts and the exons in the same slice!!
00436       my $these_genes = $this_slice->get_all_Genes($logic_name, $dbtype);
00437       foreach my $pair (@{$this_slice_coordinates->{pairs}}) {
00438 # print STDERR "Foreach pair ($pair)...\n";
00439         foreach my $this_gene (@$these_genes) {
00440 # print STDERR "1. GENE: $this_gene->{stable_id} ($this_gene->{start} - $this_gene->{end})\n";
00441           my $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair, $return_unmapped_exons);
00442 # print STDERR "2. GENE: $mapped_gene->{stable_id} ($mapped_gene->{start} - $mapped_gene->{end})\n";
00443 #           $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair);
00444           if ($mapped_gene and @{$mapped_gene->get_all_Transcripts}) {
00445             push(@$all_genes, $mapped_gene);
00446           }
00447         }
00448       }
00449     }
00450 
00451     $all_genes = $self->_compile_mapped_Genes(
00452             $all_genes,
00453             $max_repetition_length,
00454             $max_gap_length,
00455             $max_intron_length,
00456             $strict_order_of_exon_pieces,
00457             $strict_order_of_exons
00458         );
00459 
00460     $self->{$key} = $all_genes;
00461   }
00462 
00463   return $self->{$key};
00464 }
00465 
00466 
00467 =head2 _get_mapped_Gene
00468 
00469   Arg[1]     : Bio::EnsEMBL::Gene $original_gene
00470   Arg[2]     : Bio::EnsEMBL::Compara::GenomicAlign $genomic_align
00471   Example    : my $mapped_gene = $align_slice->get_mapped_Gene($orignal_gene, $genomic_align);
00472   Description: returns a new Bio::EnsEMBL::Gene object. Mapping is based on exons.
00473                The object returned contains Bio::EnsEMBL::Transcripts objects. Those
00474                mapped transcripts contain Bio::EnsEMBL::Compara::AlignSlice::Exon objects.
00475                If no exon can be mapped, the returned object will contain an empty array of
00476                transcripts. Since mapped objects are not stored in the DB, they have no dbID
00477                and no adaptor.
00478   Returntype : Bio::EnsEMBL::Gene object. (new object)
00479   Exceptions : returns undef if no part of $original_gene can be mapped using this
00480                $genomic_align. This method assumes that the slices of all the exons got
00481                from that gene have the same coordinates as the gene slice. It will throw
00482                if this is not true.
00483   Caller     : get_all_Genes
00484 
00485 =cut
00486 
00487 sub _get_mapped_Gene {
00488   my ($self, $gene, $pair, $return_unmapped_exons) = @_;
00489 
00490   ## $range_start and $range_end are used to get rid of extra genes fetched because of the
00491   ## previous speed improvement where several slices are merged in order to access the DB
00492   ## a minimum number of times. $pair->{slice} correspond to the actual alignment while
00493   ## $gene->slice might be much larger and include several alignments. $range_start and
00494   ## $range_end are the coordinates (using $gene->slice as a ref.) where the alignment is.
00495   ## If the gene (or later on, the exon) falls outside of this range, it can be discarded
00496   ## as it will be impossible to map it with using $pair->mapper!
00497   my $range_start = $pair->{slice}->start - $gene->slice->start + 1;
00498   my $range_end = $pair->{slice}->end - $gene->slice->start + 1;
00499   return undef if (($gene->start > $range_end) or ($gene->end < $range_start));
00500 
00501   my $from_mapper = $pair->{mapper};
00502   my $to_mapper = $self->{to_mapper};
00503 
00504   my $these_transcripts = [];
00505 
00506   foreach my $this_transcript (@{$gene->get_all_Transcripts}) {
00507     my $these_exons = [];
00508     my $all_exons = $this_transcript->get_all_Exons;
00509     for (my $i=0; $i<@$all_exons; $i++){
00510       my $this_exon = $all_exons->[$i];
00511       throw("Oops, this method assumes that all the exons are defined on the same slice as the".
00512           " gene they belong to") if ($this_exon->slice->start != $gene->slice->start);
00513       if ($this_exon->start <= $range_end and $this_exon->end >= $range_start) {
00514         my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon(
00515                 -EXON => $this_exon,
00516                 -ALIGN_SLICE => $self,
00517                 -FROM_MAPPER => $from_mapper,
00518                 -TO_MAPPER => $to_mapper,
00519                 -ORIGINAL_RANK => $i + 1
00520             );
00521         if ($this_align_exon) {
00522           push(@{$these_exons}, $this_align_exon);
00523         } elsif ($return_unmapped_exons) {
00524           $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon(
00525                   -EXON => $this_exon,
00526                   -ORIGINAL_RANK => $i + 1
00527               );
00528           push(@{$these_exons}, $this_align_exon) if ($this_align_exon);
00529         }
00530       } elsif ($return_unmapped_exons) {
00531         my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon(
00532                 -EXON => $this_exon,
00533                 -ORIGINAL_RANK => $i + 1
00534             );
00535         push(@{$these_exons}, $this_align_exon) if ($this_align_exon);
00536       }
00537     }
00538     if (grep {defined($_->start)} @$these_exons) { ## if any of the exons has been mapped
00539       my $new_transcript = $this_transcript->new(
00540               -dbID => $this_transcript->dbID,
00541               -adaptor => $this_transcript->adaptor,
00542               -stable_id => $this_transcript->stable_id,
00543               -version => $this_transcript->version,
00544               -external_db => $this_transcript->external_db,
00545               -external_name => $this_transcript->external_name,
00546               -external_status => $this_transcript->external_status,
00547               -display_xref => $this_transcript->display_xref,
00548               -analysis => $this_transcript->analysis,
00549               -status => $this_transcript->status,
00550               -biotype => $this_transcript->biotype,
00551               -exons => $these_exons,
00552           );
00553       if ($this_transcript->translation) {
00554         $new_transcript->translation($this_transcript->translation);
00555       }
00556       push(@{$these_transcripts}, $new_transcript);
00557     }
00558   }
00559   if (!@$these_transcripts) {
00560     return undef;
00561   }
00562   ## Create a new gene object. This is needed in order to avoid
00563   ## troubles with cache. Attach mapped transcripts.
00564   my $mapped_gene = $gene->new(
00565           -ANALYSIS => $gene->analysis,
00566           -SEQNAME => $gene->seqname,
00567           -STABLE_ID => $gene->stable_id,
00568           -VERSION => $gene->version,
00569           -EXTERNAL_NAME => $gene->external_name,
00570           -TYPE => $gene->biotype,
00571           -BIOTYPE => $gene->biotype,
00572           -STATUS => $gene->status,
00573           -EXTERNAL_DB => $gene->external_db,
00574           -EXTERNAL_STATUS => $gene->external_status,
00575           -DISPLAY_XREF => $gene->display_xref,
00576           -DESCRIPTION => $gene->description,
00577           -TRANSCRIPTS => $these_transcripts
00578       );
00579 
00580   return $mapped_gene;
00581 }
00582 
00583 
00584 =head2 _compile_mapped_Genes
00585 
00586   Arg[1]     : listref of Bio::EnsEMBL::Gene $mapped_genes
00587   Arg[2]     : int $max_repetition_length
00588   Arg[3]     : int $max_gap_length
00589   Arg[4]     : int $max_intron_length
00590   Arg[5]     : int $strict_order_of_exon_pieces
00591   Arg[6]     : int $strict_order_of_exons
00592   Example    : my $compiled_genes = $align_slice->compile_mapped_Genes($mapped_genes,
00593                        100, 1000, 100000, 1, 0);
00594   Description: This method compiles all the pieces of gene mapped before into a list
00595                of Bio::EnsEMBL::Gene objects. It tries to merge pieces of the same
00596                exon, link them according to their original transcript and finally
00597                group the transcripts in Bio::EnsEMBL::Gene objects. Merging and
00598                linking of Exons is done according to some rules that can be changed
00599                with the parameters.
00600   Parameters:  They are sent as is to _separate_in_incompatible_sets_of_Exons() and
00601                _merge_Exons() methods. Please refer to them elsewhere in this
00602                document.
00603   Returntype : listref of Bio::EnsEMBL::Gene objects. (overrides $mapped_genes)
00604   Exceptions : none
00605   Caller     : $object->methodname
00606 
00607 =cut
00608 
00609 sub _compile_mapped_Genes {
00610   my ($self, $mapped_genes, $max_repetition_length, $max_gap_length, $max_intron_length, 
00611       $strict_order_of_exon_pieces, $strict_order_of_exons) = @_;
00612 
00613   my $verbose = verbose();
00614   verbose(0); # Avoid warnings when mapped transcripts are not in the same strand
00615   
00616   ## Compile genes: group transcripts by gene->stable_id
00617   my $gene_by_gene_stable_id;
00618   my $transcripts_by_gene_stable_id;
00619   foreach my $mapped_gene (@$mapped_genes) {
00620 # print STDERR "1. GENE:$mapped_gene->{stable_id} ", join(" - ", map {$_->stable_id} @{$mapped_gene->get_all_Transcripts}), "\n";
00621     if (!$gene_by_gene_stable_id->{$mapped_gene->stable_id}) {
00622       $gene_by_gene_stable_id->{$mapped_gene->stable_id} = $mapped_gene;
00623     }
00624     ## Group all the transcripts by gene stable_id...
00625     push(@{$transcripts_by_gene_stable_id->{$mapped_gene->stable_id}},
00626         @{$mapped_gene->get_all_Transcripts});
00627   }
00628 #   $mapped_genes = [values %$genes_by_stable_id];
00629 
00630   ## Compile transcripts: group exons by transcript->stable_id
00631   while (my ($gene_stable_id, $set_of_transcripts) = each %$transcripts_by_gene_stable_id) {
00632     my $transcript_by_transcript_stable_id;
00633     my $exons_by_transcript_stable_id;
00634     foreach my $transcript (@{$set_of_transcripts}) {
00635       if (!$transcript_by_transcript_stable_id->{$transcript->stable_id}) {
00636         $transcript_by_transcript_stable_id->{$transcript->stable_id} = $transcript;
00637       }
00638       ## Group all the exons by the transcript stable_id...
00639 # print STDERR "0. TRANS:$transcript->{stable_id} ", join(" - ", map {$_->stable_id} @{$transcript->get_all_Exons}), "\n";
00640       push(@{$exons_by_transcript_stable_id->{$transcript->stable_id}},
00641           @{$transcript->get_all_Exons});
00642     }
00643 
00644     ## Try to merge splitted exons whenever possible
00645     while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) {
00646 # print STDERR "1. ", join(" - ", map {$_->stable_id} @{$set_of_exons}), "\n";
00647       $exons_by_transcript_stable_id->{$transcript_stable_id} = _merge_Exons(
00648               $set_of_exons,
00649               $max_repetition_length,
00650               $max_gap_length, 
00651               $strict_order_of_exon_pieces
00652           );
00653 # print STDERR "2. ", join(" - ", map {$_->stable_id} @{$exons_by_transcript_stable_id->{$transcript_stable_id}}), "\n";
00654     }
00655     my $all_transcripts;
00656     while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) {
00657 #       my $sets_of_compatible_exons = [$set_of_exons];
00658       my $sets_of_compatible_exons = _separate_in_incompatible_sets_of_Exons(
00659               $set_of_exons,
00660               $max_repetition_length,
00661               $max_intron_length,
00662               $strict_order_of_exons
00663           );
00664 
00665       my $old_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id};
00666 #       # Save first set of exons in the 
00667 #       my $first_set_of_compatible_exons = shift(@{$sets_of_compatible_exons});
00668 #       my $first_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id};
00669 #       $first_transcript->flush_Exons();
00670 #       foreach my $exon (@$first_set_of_compatible_exons) {
00671 #         $first_transcript->add_Exon($exon);
00672 #       }
00673 #       push(@$all_transcripts, $first_transcript);
00674 
00675       foreach my $this_set_of_compatible_exons (@{$sets_of_compatible_exons}) {
00676         my $new_transcript = $old_transcript->new(
00677                 -dbID => $old_transcript->dbID,
00678                 -adaptor => $old_transcript->adaptor,
00679                 -stable_id => $old_transcript->stable_id,
00680                 -version => $old_transcript->version,
00681                 -external_db => $old_transcript->external_db,
00682                 -external_name => $old_transcript->external_name,
00683                 -external_status => $old_transcript->external_status,
00684                 -display_xref => $old_transcript->display_xref,
00685                 -analysis => $old_transcript->analysis,
00686                 -status => $old_transcript->status,
00687                 -biotype => $old_transcript->biotype,
00688                 -EXONS => $this_set_of_compatible_exons
00689             );
00690         ## $old_transcript->translation is the original Bio::EnsEMBL::Translation!
00691         if ($old_transcript->translation) {
00692           my $start_exon;
00693           my $seq_start;
00694           my $end_exon;
00695           my $seq_end;
00696           my $all_start_codon_mappings;
00697           my $all_end_codon_mappings;
00698           my @sorted_exons;
00699           if ($new_transcript->strand == 1) {
00700             @sorted_exons = @{$new_transcript->get_all_Exons};
00701           } else {
00702             @sorted_exons = reverse @{$new_transcript->get_all_Exons};
00703           }
00704           my $coding = 0;
00705           foreach my $this_exon (@sorted_exons) {
00706             if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) {
00707               $coding = 1;
00708             }
00709             if ($coding and $this_exon->start) {
00710               if (!$start_exon) {
00711                 if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) {
00712                   if ($old_transcript->translation->start_Exon->strand == 1) {
00713                     $all_start_codon_mappings = $self->map_original_Slice(
00714                         $this_exon->exon->slice->sub_Slice(
00715                           $old_transcript->coding_region_start,
00716                           $old_transcript->coding_region_start + 2));
00717                   } else {
00718                     $all_start_codon_mappings = $self->map_original_Slice(
00719                         $this_exon->exon->slice->sub_Slice(
00720                           $old_transcript->coding_region_end-2,
00721                           $old_transcript->coding_region_end));
00722                  }
00723                   $seq_start = _map_position_using_cigar_line($this_exon->cigar_line,
00724                       $old_transcript->translation->start, $this_exon->exon->length, 1);
00725                   if ($seq_start <= $this_exon->length) {
00726                     $start_exon = $this_exon;
00727                   }
00728                 } else {
00729                   $seq_start = 1;
00730                   $start_exon = $this_exon;
00731                 }
00732               }
00733               if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) {
00734                 if ($old_transcript->translation->end_Exon->strand == 1) {
00735                   $all_end_codon_mappings = $self->map_original_Slice(
00736                       $this_exon->exon->slice->sub_Slice($old_transcript->coding_region_end - 2,$old_transcript->coding_region_end));
00737                 } else {
00738                   $all_end_codon_mappings = $self->map_original_Slice(
00739                       $this_exon->exon->slice->sub_Slice(
00740                        $old_transcript->coding_region_start,
00741                         $old_transcript->coding_region_start + 2));
00742                 }
00743                 $seq_end = _map_position_using_cigar_line($this_exon->cigar_line,
00744                     $old_transcript->translation->end, $this_exon->exon->length, 0);
00745                 $seq_end = $this_exon->length if ($seq_end > $this_exon->length);
00746                 if ($seq_end >= 1) {
00747                   $end_exon = $this_exon;
00748                 } else {
00749                   ## Set $seq_end to previous value
00750                   $seq_end = $end_exon->length if ($end_exon);
00751                 }
00752               } else {
00753                 $end_exon = $this_exon;
00754                 $seq_end = $end_exon->length;
00755               }
00756         }
00757             if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) {
00758               $coding = 0;
00759               last;
00760             }
00761           }
00762           if ($start_exon and $end_exon) {
00763             $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation(
00764                     -start_exon => $start_exon,
00765                     -end_exon => $end_exon,
00766                     -seq_start => $seq_start,
00767                     -seq_end => $seq_end,
00768                     -stable_id => $old_transcript->translation->stable_id,
00769                     -version => $old_transcript->translation->version,
00770                     -created_date => ($old_transcript->translation->created_date or undef),
00771                     -modified_date =>$old_transcript->translation->modified_date,
00772                 ));
00773           } else {
00774             ## Translation cannot be mapped. In this case we should return a translation outside
00775             ## of the coordinate system (negative coding_region_start and coding_region_end).
00776             $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation(
00777                     -stable_id => $old_transcript->translation->stable_id,
00778                     -version => $old_transcript->translation->version,
00779                     -created_date => ($old_transcript->translation->created_date or undef),
00780                     -modified_date =>$old_transcript->translation->modified_date,
00781                 ));
00782             $new_transcript->coding_region_start(-10000000000);
00783             $new_transcript->coding_region_end(-10000000000);
00784           }
00785 
00786           $new_transcript->translation->all_start_codon_mappings($all_start_codon_mappings);
00787           $new_transcript->translation->all_end_codon_mappings($all_end_codon_mappings);
00788         }
00789         push(@$all_transcripts, $new_transcript);
00790       }
00791     }
00792 
00793     $gene_by_gene_stable_id->{$gene_stable_id}->{'_transcript_array'} = $all_transcripts;
00794     # adjust start, end, strand and slice
00795     $gene_by_gene_stable_id->{$gene_stable_id}->recalculate_coordinates;
00796   }
00797   verbose($verbose);
00798 
00799   return [values %$gene_by_gene_stable_id];
00800 }
00801 
00802 
00803 =head2 _map_position_using_cigar_line
00804 
00805   Arg[1]     : string $cigar_line
00806   Arg[2]     : int $original_position
00807   Arg[3]     : int $original_length
00808   Arg[4]     : bool $start
00809   Example    : my $mapped_start_position = _map_position_using_cigar_line(
00810                    "16M4I", 10, 20, 1);
00811   Example    : my $mapped_end_position = _map_position_using_cigar_line(
00812                    "16M4I", 10, 20, 1);
00813   Description: This method is used to locate the start or the end position of
00814                a Translation on the Bio::EnsEMBL::Compara::AlignSlice::Exon object
00815                using the cigar_line of the object. As the Bio::EnsEMBL::Compara::
00816                AlignSlice::Exon object may result from the fusion of the same
00817                Bio::EnsEMBL::Exon object several times, this method returns the
00818                best mapping among all the available possibilities. When the original
00819                position maps on an insertion in the Bio::EnsEMBL::Compara::
00820                AlignSlice::Exon object, the resulting mapped start position is the
00821                next one to the latest mapped one.
00822   Returntype : int $mapped_position
00823   Exceptions : returns 0 when the end position cannot be mapped.
00824   Exceptions : returns a number larger than the length of the Bio::EnsEMBL::Compara::
00825                AlignSlice::Exon when the start position cannot be mapped.
00826 
00827 =cut
00828 
00829 sub _map_position_using_cigar_line {
00830   my ($cigar_line, $original_pos, $original_length, $start) = @_;
00831   my $mapped_pos = 0;
00832 
00833 
00834   my @cigar = grep {$_} split(/(\d*[GIDM])/, $cigar_line);
00835   my $original_count = 0;
00836   my $mapped_count = 0;
00837   my $pending_count = 0;
00838 
00839   foreach my $cigar_piece (@cigar) {
00840     my ($num, $mode) = $cigar_piece =~ /(\d*)([GIDM])/;
00841     $num = 1 if ($num eq "");
00842     if ($mode eq "D" or $mode eq "G") {
00843       $pending_count += $num;
00844     } elsif ($mode eq "I") {
00845       $original_count += $num;
00846     } elsif ($mode eq "M") {
00847       if ($pending_count) {
00848         $mapped_count += $pending_count;
00849         $pending_count = 0;
00850       }
00851       if ($original_count + $num < $original_pos) {
00852         $original_count += $num;
00853         $mapped_count += $num;
00854       } else {
00855         $mapped_count += ($original_pos - $original_count);
00856         $original_count += ($original_pos - $original_count);
00857         $mapped_pos = $mapped_count;
00858         last;
00859       }
00860     }
00861     if ($original_count >= $original_pos and $mode eq "I") {
00862       ## This position matches in an insertion.
00863       ## If we are mapping the end position we will prefer the first match in an insertion
00864       ## rather than the following ones (except if the following matches in a "M" region).
00865       ## On the other hand, if we are mapping the start position, we will prefer the last
00866       ## match in an insertion
00867       if ($start or !$mapped_pos) {
00868         if ($start and $pending_count) {
00869           ## $pending_count corresponds to deletion and needs to be added when mapping the
00870           ## start position (we will want the position AFTER the deletion) but not when
00871           ## mapping the end position (we will want the position BEFORE the deletion)
00872           $mapped_count += $pending_count;
00873           $pending_count = 0;
00874         }
00875         $mapped_pos = $mapped_count;
00876         ## When matching the start position, add 1 as the mapped start will be just after
00877         ## this position. If this position appears after the end of the mapped exon, this
00878         ## means we fail to map the start position
00879         $mapped_pos ++ if ($start);
00880       }
00881       ## Try to look further. A second match may happen if this exon is the result of a fusion process
00882       $original_pos += $original_length;
00883     }
00884   }
00885 
00886   return $mapped_pos;
00887 }
00888 
00889 
00890 =head2 _merge_Exons
00891 
00892   Arg[1]     : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons
00893   Arg[2]     : int $max_repetition_length
00894   Arg[3]     : int $max_gap_length
00895   Arg[4]     : int $strict_order_of_exon_pieces
00896   Example    : my $merged_exons = _merge_Exons($exons_to_be_merged, 100, 1000, 1);
00897   Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon objects and
00898                tries to merge them according to exon stable_id and some rules that can
00899                be tunned using some optional parameters. This method can overwrite some
00900                of the exon in the $set_of_exons.
00901   Parameters:  MAX_REPETITION_LENGTH. In principle you want to merge together pieces
00902                    of an exon which do not overlap (the beginning and the end of the
00903                    exon). With this flag you can to set up what amount of the original
00904                    exon is allowed on two aligned exons to be merged.
00905                MAX_GAP_LENGTH. If the distance between two pieces of exons in the
00906                    aligned slice is larger than this parameter, they will not be
00907                    merged. Setting this parameter to -1 will
00908                    avoid any merging event.
00909                STRICT_ORDER_OF_EXON_PIECES. This flag allows you to decide whether two
00910                    pieces of an exon should be merged or not if they are not in the
00911                    right order, for instance if the end of the original exon will
00912                    appear before the start on the merged exon.
00913   Returntype : lisref of Bio::EnsEMBL::Compara::AlignSlice::Exon objects.
00914   Exceptions : none
00915   Caller     : methodname
00916 
00917 =cut
00918 
00919 sub _merge_Exons {
00920   my ($set_of_exons, $max_repetition_length, $max_gap_length, $strict_order_of_exon_pieces) = @_;
00921   my $merged_exons = []; # returned value
00922 
00923   my $exon_by_stable_id;
00924   # Group exons by stable_id
00925   foreach my $exon (@$set_of_exons) {
00926     push(@{$exon_by_stable_id->{$exon->stable_id}}, $exon);
00927   }
00928       
00929   # Merge compatible pieces of exons
00930   foreach my $these_exons (values %$exon_by_stable_id) {
00931 
00932     if (!grep {defined($_->start)} @$these_exons) {
00933       push(@$merged_exons, $these_exons->[0]);
00934       next;
00935     }
00936 
00937     # Sort exons according to 
00938     $these_exons= [sort {($a->start or 0) <=> ($b->start or 0)} @$these_exons];
00939   
00940     while (my $first_exon = shift @$these_exons) {
00941       if (!defined($first_exon->start)) {
00942 #         push(@$merged_exons, $first_exon);
00943         next;
00944       }
00945       for (my $count=0; $count<@$these_exons; $count++) {
00946         my $second_exon = $these_exons->[$count];
00947         # Check strands
00948         next if ($first_exon->strand != $second_exon->strand);
00949 
00950         my $gap_between_pieces_of_exon = $second_exon->start - $first_exon->end - 1;
00951         # Check whether both mapped parts do not overlap
00952         next if ($gap_between_pieces_of_exon < 0);
00953 
00954         # Check maximum gap between both pieces of exon
00955         next if ($max_gap_length and $gap_between_pieces_of_exon > $max_gap_length);
00956 
00957         # Check whether both mapped parts are in the right order
00958         if ($strict_order_of_exon_pieces) {
00959           if ($first_exon->strand == 1) {
00960             next if ($first_exon->get_aligned_start > $second_exon->get_aligned_start);
00961           } else {
00962             next if ($first_exon->get_aligned_end < $second_exon->get_aligned_end);
00963           }
00964         }
00965 
00966         # Check maximum overlapping within original exon, i.e. how much of the
00967         # same exon can be mapped twice
00968         my $repetition_length;
00969         if ($first_exon->strand == 1) {
00970           $repetition_length = $first_exon->get_aligned_end - $second_exon->get_aligned_start + 1;
00971         } else {
00972           $repetition_length = $second_exon->get_aligned_end - $first_exon->get_aligned_start + 1;
00973         }
00974         next if ($repetition_length > $max_repetition_length);
00975 
00976         ## Merge exons!!
00977         $second_exon = splice(@$these_exons, $count, 1); # remove exon from the list
00978         $count-- if (@$these_exons);
00979         $first_exon->end($second_exon->end);
00980         if ($first_exon->strand == 1) {
00981           $first_exon->append_Exon($second_exon, $gap_between_pieces_of_exon);
00982         } else {
00983           $first_exon->prepend_Exon($second_exon, $gap_between_pieces_of_exon);
00984         }
00985       }
00986       push(@$merged_exons, $first_exon);
00987     }
00988   }
00989 
00990   return $merged_exons;
00991 }
00992 
00993 
00994 =head2 _separate_in_incompatible_sets_of_Exons
00995 
00996   Arg[1]     : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons
00997   Arg[2]     : int $max_repetition_length
00998   Arg[3]     : int $max_intron_length
00999   Arg[4]     : int $strict_order_of_exons
01000   Example    : my $sets_of_exons = _separate_in_incompatible_sets_of_Exons(
01001                    $set_of_exons, 100, 100000, 0);
01002   Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon and separate
01003                them in sets of comaptible exons. Compatibility is defined taking into
01004                account 5 parameters:
01005                  - exons must be in the same strand
01006                  - exons cannot overlap on the align_slice
01007                  - distance between exons cannot be larger than MAX_INTRON_LENGTH
01008                  - two exons with the same stable_id can belong to the same transcript
01009                    only if they represent diferent parts of the original exon. Some
01010                    overlapping is allowed (see MAX_REPETITION_LENGTH parameter).
01011                  - exons must be in the same order as in the original transcript
01012                    if the STRICT_ORDER_OF_EXONS parameter is true.
01013   Parameters:  MAX_REPETITION_LENGTH. In principle you want to link together pieces
01014                    of an exon which do not overlap (the beginning and the end of the
01015                    exon). With this flag you can to set up what amount of the original
01016                    exon is allowed on two aligned exons to be linked.
01017                MAX_INTRON_LENGTH. If the distance between two exons in the aligned slice
01018                    is larger than this parameter, they will not be linked.
01019                STRICT_ORDER_OF_EXONS. This flag allows you to decide whether two
01020                    exons should be linked or not if they are not in the
01021                    original order.
01022   Returntype : listref of lisrefs of Bio::EnsEMBL::Compara::AlignSlice::Exon objects.
01023   Exceptions : none
01024   Caller     : methodname
01025 
01026 =cut
01027 
01028 sub _separate_in_incompatible_sets_of_Exons {
01029   my ($set_of_exons, $max_repetition_length, $max_intron_length, $strict_order_of_exons) = @_;
01030   my $sets_of_exons = [];
01031 
01032   my $last_exon;
01033   my $this_set_of_exons = [];
01034 
01035   ###################################################################
01036   ##
01037   ##  Return all the exons by strand.
01038   ##   - A given exon can be mapped partially on both strands!
01039   ##
01040   if (grep {$_->strand and $_->strand == 1} @$set_of_exons and grep {$_->strand and $_->strand == -1} @$set_of_exons) {
01041     ## Keep track of the exons that have been mapped by strand
01042     my $indexes;
01043     foreach my $exon (@$set_of_exons) {
01044       if ($exon->strand) {
01045         $indexes->{$exon->strand}->{$exon->original_rank} = 1;
01046       }
01047     }
01048 
01049     my $forward_stranded_set_of_exons; # set of exons to be returned in the forward strand
01050     my $reverse_stranded_set_of_exons; # set of exons to be returned in the reverse strand
01051     foreach my $exon (@$set_of_exons) {
01052       my $new_exon = $exon->copy();
01053       if ($exon->strand) {
01054         if ($exon->strand == 1) {
01055           if ($indexes->{"-1"}->{$exon->original_rank}) {
01056             ## This exon has been mapped on the reverse strand as well
01057             push(@$forward_stranded_set_of_exons, $exon);
01058             next;
01059           } else {
01060             ## Undef coordinates of the new exon before adding it to the set
01061             ## of reverse stranded exons
01062             $new_exon->start(undef);
01063             $new_exon->end(undef);
01064             $new_exon->strand(undef);
01065           }
01066         } else {
01067           if ($indexes->{"1"}->{$exon->original_rank}) {
01068             ## This exon has been mapped on the forward strand as well
01069             push(@$reverse_stranded_set_of_exons, $new_exon);
01070             next;
01071           } else {
01072             ## Undef coordinates of the new exon before adding it to the set
01073             ## of forward stranded exons
01074             $exon->start(undef);
01075             $exon->end(undef);
01076             $exon->strand(undef);
01077           }
01078         }
01079       }
01080       push(@$forward_stranded_set_of_exons, $exon);
01081       push(@$reverse_stranded_set_of_exons, $new_exon);
01082     }
01083     push(@$sets_of_exons, @{_separate_in_incompatible_sets_of_Exons($reverse_stranded_set_of_exons,
01084         $max_repetition_length, $max_intron_length, $strict_order_of_exons)});
01085     $set_of_exons = $forward_stranded_set_of_exons;
01086   }
01087   ##
01088   ###################################################################
01089 
01090   my $transcript_strand = 1;
01091   if (grep {$_->strand and $_->strand == -1} @$set_of_exons) {
01092     $transcript_strand = -1;
01093   }
01094   foreach my $this_exon (_sort_Exons(@$set_of_exons)) {
01095     if (!defined($this_exon->start)) {
01096       if ($transcript_strand == -1) {
01097         ## Insert this exon in the right place
01098         my $inserted = 0;
01099         for (my $i=0; $i<@$this_set_of_exons; $i++) {
01100           if ($this_set_of_exons->[$i]->original_rank == $this_exon->original_rank - 1) {
01101             splice(@$this_set_of_exons, $i, 0, $this_exon);
01102             $inserted = 1;
01103             last;
01104           }
01105         }
01106         if (!$inserted) {
01107           push(@$this_set_of_exons, $this_exon);
01108         }
01109       } else {
01110         ## Append this exon
01111         push(@$this_set_of_exons, $this_exon);
01112       }
01113       next;
01114     }
01115     if ($last_exon) {
01116       # Calculate intron length
01117       my $intron_length = $this_exon->start - $last_exon->end - 1;
01118       # Calculate whether both mapped parts are in the right order
01119       my $order_is_ok = 1;
01120       if ($strict_order_of_exons) {
01121         if ($this_exon->strand == $this_exon->exon->strand) {
01122           $order_is_ok = 0 if ($this_exon->exon->start < $last_exon->exon->start);
01123         } else {
01124           $order_is_ok = 0 if ($this_exon->exon->start > $last_exon->exon->start);
01125         }
01126       }
01127       my $repetition_length = 0;
01128       if ($last_exon->stable_id eq $this_exon->stable_id) {
01129         if ($this_exon->strand == 1) {
01130           $repetition_length = $last_exon->get_aligned_end - $this_exon->get_aligned_start + 1;
01131         } else {
01132           $repetition_length = $this_exon->get_aligned_end - $last_exon->get_aligned_start + 1;
01133         }
01134       }
01135 
01136       if (($last_exon->strand != $this_exon->strand) or
01137           ($intron_length < 0) or
01138           ($max_intron_length and ($intron_length > $max_intron_length)) or
01139           (!$order_is_ok) or
01140           ($repetition_length > $max_repetition_length)) {
01141         # this_exon and last_exon should be in separate sets. Save current
01142         # set_of_exons and start a new set_of_exons
01143         push(@$sets_of_exons, $this_set_of_exons);
01144         $this_set_of_exons = [];
01145       }
01146     }
01147     push(@$this_set_of_exons, $this_exon);
01148     $last_exon = $this_exon;
01149   }
01150   push(@$sets_of_exons, $this_set_of_exons);
01151 
01152   return $sets_of_exons;
01153 }
01154 
01155 sub _sort_Exons {
01156   my @exons = @_;
01157   my @sorted_exons = ();
01158 
01159   my $transcript_strand = 1;
01160   if (grep {$_->strand and $_->strand == -1} @exons) {
01161     $transcript_strand = -1;
01162   }
01163 
01164   my @mapped_exons = grep {defined($_->start)} @exons;
01165   my @unmapped_exons = grep {!defined($_->start)} @exons;
01166 
01167   @mapped_exons = sort {$a->start <=> $b->start} @mapped_exons;
01168   my $sorted_exons_by_rank;
01169   foreach my $mapped_exon (@mapped_exons) {
01170     my $rank = $mapped_exon->original_rank;
01171     # the same exon can be mapped twice!
01172     push(@{$sorted_exons_by_rank->{$rank}}, $mapped_exon);
01173   }
01174 
01175   @unmapped_exons = sort {$a->original_rank <=> $b->original_rank} @unmapped_exons;
01176   foreach my $unmapped_exon (@unmapped_exons) {
01177     my $rank = $unmapped_exon->original_rank;
01178     do {
01179       if (defined($sorted_exons_by_rank->{$rank})) {
01180         push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon);
01181         $rank = 0;
01182       } elsif ($rank == 1) {
01183         if ($transcript_strand == 1) {
01184           push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon);
01185         } else {
01186           $rank++ while(!defined($sorted_exons_by_rank->{$rank}));
01187           splice(@{$sorted_exons_by_rank->{$rank}}, 1, 0, $unmapped_exon);
01188         }
01189         $rank = 0;
01190       }
01191       $rank--;
01192     } while ($rank > 0);
01193   }
01194 
01195   foreach my $exons (sort {
01196             if (defined($a->[0]->start) and defined($b->[0]->start)) {
01197               $a->[0]->start <=> $b->[0]->start;
01198             } else {
01199               $a->[0]->original_rank <=> $b->[0]->original_rank;
01200             }
01201         } values %$sorted_exons_by_rank) {
01202     push(@sorted_exons, @{$exons});
01203   }
01204 
01205   return @sorted_exons;
01206 }
01207 
01208 # OVERWRITEN METHODS FROM Bio::EnsEMBL::Slice module
01209 #
01210 # WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING
01211 #
01212 # All the methods that need access to the database (the adaptor) and which
01213 # are not listed here are not supported!!
01214 #
01215 # WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING
01216 #
01217 =head2 invert (not supported)
01218 
01219 Maybe at some point...
01220 
01221 This method returns undef
01222 
01223 =cut
01224 
01225 sub invert {
01226   warning("Cannot invert a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); 
01227   return undef;
01228 }
01229 
01230 
01231 =head2 sub_Slice
01232 
01233   Arg   1    : int $start
01234   Arg   2    : int $end
01235   Arge [3]   : int $strand
01236   Example    : none
01237   Description: Makes another Slice that covers only part of this slice
01238                If a slice is requested which lies outside of the boundaries
01239                of this function will return undef.  This means that
01240                behaviour will be consistant whether or not the slice is
01241                attached to the database (i.e. if there is attached sequence
01242                to the slice).  Alternatively the expand() method or the
01243                SliceAdaptor::fetch_by_region method can be used instead.
01244   Returntype : Bio::EnsEMBL::Slice or undef if arguments are wrong
01245   Exceptions : return undef if $start and $end define a region outside
01246                of actual Slice.
01247   Caller     : general
01248   Status     : Testing
01249 
01250 =cut
01251 
01252 sub sub_Slice {
01253   my ( $self, $start, $end, $strand ) = @_;
01254 
01255   if( $start < 1 || $start > $self->{'end'} ) {
01256     # throw( "start argument not valid" );
01257     return undef;
01258   }
01259 
01260   if( $end < $start || $end > $self->{'end'} ) {
01261     # throw( "end argument not valid" )
01262     return undef;
01263   }
01264 
01265   my ($new_start, $new_end, $new_strand);
01266   if (!defined($strand)) {
01267     $strand = 1;
01268   }
01269 
01270   if( $self->{'strand'} == 1 ) {
01271     $new_start = $self->{'start'} + $start - 1;
01272     $new_end = $self->{'start'} + $end - 1;
01273     $new_strand = $strand;
01274   } else {
01275     $new_start = $self->{'end'} - $end + 1;;
01276     $new_end = $self->{'end'} - $start + 1;
01277     $new_strand = -$strand;
01278   }
01279 
01280   #fastest way to copy a slice is to do a shallow hash copy
01281   my %new_slice = %$self;
01282 
01283   ## Delete cached genes
01284   foreach my $key (grep {/^key_/} keys %new_slice) {
01285     delete($new_slice{$key});
01286   }
01287   $new_slice{'seq'} = undef;
01288   $new_slice{'start'} = int($new_start);
01289   $new_slice{'end'}   = int($new_end);
01290   $new_slice{'strand'} = $new_strand;
01291 
01292   return bless \%new_slice, ref($self);
01293 }
01294 
01295 =head2 seq
01296 
01297   Arg [1]    : none
01298   Example    : print "SEQUENCE = ", $slice->seq();
01299   Description: Returns the sequence of the region represented by this
01300                slice formatted as a string.
01301                This Slice is made of several Bio::EnsEMBL::Slices mapped
01302                on it with gaps inside and regions with no matching
01303                sequence. The resulting string might contain gaps and/or
01304                dots as padding characters. If several Slices map on the
01305                same positions, the last one will override the positions.
01306   Returntype : string
01307   Exceptions : none
01308   Caller     : general
01309 
01310 =cut
01311 
01312 sub seq {
01313   my $self = shift;
01314   my $start = 1;
01315   my $end = $self->length;
01316   my $strand = 1; # strand is reversed in the subseq method if needed
01317 
01318   return $self->{seq} if (defined($self->{seq}));
01319 
01320   $self->{seq} = $self->subseq($start, $end, $strand);
01321   
01322   return $self->{seq};
01323 }
01324 
01325 
01326 =head2 subseq
01327 
01328   Arg  [1]   : int $startBasePair
01329                relative to start of slice, which is 1.
01330   Arg  [2]   : int $endBasePair
01331                relative to start of slice.
01332   Arg  [3]   : (optional) int $strand
01333                The strand of the slice to obtain sequence from. Default
01334                value is 1.
01335   Description: returns string of dna sequence
01336                This Slice is made of several Bio::EnsEMBL::Slices mapped
01337                on it with gaps inside and regions with no matching
01338                sequence. The resulting string might contain gaps and/or
01339                dots as padding characters. If several Slices map on the
01340                same positions, the last one will override the positions.
01341   Returntype : txt
01342   Exceptions : end should be at least as big as start
01343                strand must be set
01344   Caller     : general
01345 
01346 =cut
01347 
01348 sub subseq {
01349   my ($self, $start, $end, $strand) = @_;
01350 
01351   $start = 1 if (!defined($start));
01352   $end ||= $self->length;
01353   $strand ||= 1;
01354 
01355   ## Fix coordinates (needed for sub_Slices)
01356   if ($self->strand == -1) {
01357     $strand = -$strand;
01358     my $aux = $start;
01359     $start = $self->start + ($self->length - $end);
01360     $end = $self->start + ($self->length - $aux);
01361   } else {
01362     $start += $self->start - 1;
01363     $end += $self->start - 1;
01364   }
01365 
01366   my $length = ($end - $start + 1);
01367   my $seq = "." x $length;
01368 
01369   foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs()}) {
01370     my $this_slice = $pair->{slice};
01371     my $mapper = $pair->{mapper};
01372     my $slice_start = $pair->{start};
01373     my $slice_end = $pair->{end};
01374     next if ($slice_start > $end or $slice_end < $start);
01375     my $this_slice_seq = $this_slice->seq();
01376 
01377     # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them
01378     $slice_start -= $start; # $slice_start is now in subseq coordinates
01379     $slice_start = 0 if ($slice_start < 0);
01380     $slice_end -= $start; # $slice_end is now in subseq coordinates
01381     $slice_end = $length if ($slice_end > $length);
01382 
01383     # Invert start and end for the reverse strand
01384     if ($strand == -1) {
01385       my $aux = $slice_end;
01386       $slice_end = $length - $slice_start;
01387       $slice_start = $length - $aux;
01388     }
01389 
01390     my @sequence_coords = $mapper->map_coordinates(
01391             'alignment',
01392             $start,
01393             $end,
01394             $strand,
01395             'alignment'
01396         );
01397 
01398     #####################
01399     # $this_pos refers to the starting position of the subseq if requesting the forward strand
01400     # or the ending position of the subseq if the reverse strand has been requested:
01401     #
01402     # FORWARD STRAND (1)
01403     # $this_pos = 0
01404     #      |
01405     #      ---------------------------------------------------------------------->
01406     #      <----------------------------------------------------------------------
01407     #
01408     # REVERSE STRAND (-1)
01409     #      ---------------------------------------------------------------------->
01410     #      <----------------------------------------------------------------------
01411     #                                                                            |
01412     #                                                                      $this_pos = 0
01413     #
01414     # All remaining coordinates work in the same way except the start and end position
01415     # of the gaps which correspond to the coordinates in the original Slice...
01416     #
01417     my $this_pos = 0;
01418     foreach my $sequence_coord (@sequence_coords) {
01419       ## $sequence_coord refer to genomic_align (a slice in the [+] strand)
01420 
01421       if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
01422         my $subseq;
01423         if ($this_slice->strand == 1) {
01424           $subseq = substr($this_slice_seq,
01425               $sequence_coord->start - $this_slice->start,
01426               $sequence_coord->length);
01427           if ($sequence_coord->strand * $pair->{strand} == -1) {
01428             $subseq = reverse($subseq);
01429             $subseq =~ tr/ACGTacgt/TGCAtgca/;
01430           }
01431         } else {
01432           $subseq = substr($this_slice_seq,
01433               $this_slice->end - $sequence_coord->end,
01434               $sequence_coord->length);
01435           if ($sequence_coord->strand * $pair->{strand} == -1) {
01436             $subseq = reverse($subseq);
01437             $subseq =~ tr/ACGTacgt/TGCAtgca/;
01438           }
01439         }
01440         substr($seq, $this_pos, $sequence_coord->length, $subseq);
01441 
01442       } else {  ## Gap or sequence outside of any alignment
01443         ############
01444         # Get the start and end positions of this gap in "subseq" coordinates
01445         my $this_original_start = $sequence_coord->start - $start;
01446         my $this_original_end = $sequence_coord->end - $start;
01447         if ($strand == -1) {
01448           my $aux = $this_original_end;
01449           $this_original_end = $length - $this_original_start;
01450           $this_original_start = $length - $aux;
01451         }
01452         if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) {
01453           ## This is a gap
01454           my $start_position_of_gap_seq = $this_pos;
01455           my $end_position_of_gap_seq = $this_pos + $sequence_coord->length;
01456           if ($start_position_of_gap_seq < $slice_start) {
01457             $start_position_of_gap_seq = $slice_start;
01458           }
01459           if ($end_position_of_gap_seq > $slice_end + 1) {
01460             $end_position_of_gap_seq = $slice_end + 1;
01461           }
01462           my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq;
01463           substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq)
01464               if ($length_of_gap_seq > 0);
01465         }
01466       }
01467       $this_pos += $sequence_coord->length;
01468     }
01469   }
01470   return $seq;
01471 }
01472 
01473 
01474 =head2 get_cigar_line
01475 
01476   Arg        : -none-
01477   Description: returns a cigar_line describing the gaps in the mapped sequence
01478                This Slice is made of several Bio::EnsEMBL::Slices mapped
01479                on it with gaps inside and regions with no matching
01480                sequence. The resulting cigar line corresponds to the mapping
01481                of all the nucleotides that can be mapepd. If several Slices map
01482                on the same positions, the behaviour is undefined.
01483                The cigar_line includes 3 types of regions: M for matches/mismatches,
01484                D for alignment gaps (formerly known as deletions) and G for
01485                gaps between alignment blocks.
01486   Returntype : txt
01487   Exceptions :
01488   Caller     : general
01489 
01490 =cut
01491 
01492 sub get_cigar_line {
01493   my ($self, $start, $end, $strand) = @_;
01494 
01495   $start = $self->start;
01496   $end = $self->end;
01497   $strand ||= 1;
01498 
01499   ## Fix coordinates (needed for sub_Slices)
01500   if ($self->strand == -1) {
01501     $strand = -$strand;
01502     my $aux = $start;
01503     $start = $self->start + ($self->length - $end);
01504     $end = $self->start + ($self->length - $aux);
01505   } else {
01506     $start += $self->start - 1;
01507     $end += $self->start - 1;
01508   }
01509 
01510   my $length = ($end - $start + 1);
01511   my $seq = "." x $length;
01512   foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs("get_gap_slices")}) {
01513     my $this_slice = $pair->{slice};
01514     my $mapper = $pair->{mapper};
01515     my $slice_start = $pair->{start};
01516     my $slice_end = $pair->{end};
01517     next if ($slice_start > $end or $slice_end < $start);
01518 
01519     # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them
01520     $slice_start -= $start; # $slice_start is now in subseq coordinates
01521     $slice_start = 0 if ($slice_start < 0);
01522     $slice_end -= $start; # $slice_end is now in subseq coordinates
01523     $slice_end = $length if ($slice_end > $length);
01524 
01525     # Invert start and end for the reverse strand
01526     if ($strand == -1) {
01527       my $aux = $slice_end;
01528       $slice_end = $length - $slice_start;
01529       $slice_start = $length - $aux;
01530     }
01531 
01532     my @sequence_coords = $mapper->map_coordinates(
01533             'alignment',
01534             $start,
01535             $end,
01536             $strand,
01537             'alignment'
01538         );
01539     #####################
01540     # $this_pos refers to the starting position of the subseq if requesting the forward strand
01541     # or the ending position of the subseq if the reverse strand has been requested:
01542     #
01543     # FORWARD STRAND (1)
01544     # $this_pos = 0
01545     #      |
01546     #      ---------------------------------------------------------------------->
01547     #      <----------------------------------------------------------------------
01548     #
01549     # REVERSE STRAND (-1)
01550     #      ---------------------------------------------------------------------->
01551     #      <----------------------------------------------------------------------
01552     #                                                                            |
01553     #                                                                      $this_pos = 0
01554     #
01555     # All remaining coordinates work in the same way except the start and end position
01556     # of the gaps which correspond to the coordinates in the original Slice...
01557     #
01558     my $this_pos = 0;
01559     foreach my $sequence_coord (@sequence_coords) {
01560       ## $sequence_coord refer to genomic_align (a slice in the [+] strand)
01561 
01562       if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
01563         my $subseq = "N" x ($sequence_coord->length);
01564         substr($seq, $this_pos, $sequence_coord->length, $subseq);
01565 
01566       } else {  ## Gap or sequence outside of any alignment
01567         ############
01568         # Get the start and end positions of this gap in "subseq" coordinates
01569         my $this_original_start = $sequence_coord->start - $start;
01570         my $this_original_end = $sequence_coord->end - $start;
01571         if ($strand == -1) {
01572           my $aux = $this_original_end;
01573           $this_original_end = $length - $this_original_start;
01574           $this_original_start = $length - $aux;
01575         }
01576         if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) {
01577           ## This is a gap
01578           my $start_position_of_gap_seq = $this_pos;
01579           my $end_position_of_gap_seq = $this_pos + $sequence_coord->length;
01580           if ($start_position_of_gap_seq < $slice_start) {
01581             $start_position_of_gap_seq = $slice_start;
01582           }
01583           if ($end_position_of_gap_seq > $slice_end + 1) {
01584             $end_position_of_gap_seq = $slice_end + 1;
01585           }
01586           my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq;
01587           substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq)
01588               if ($length_of_gap_seq > 0);
01589         }
01590       }
01591       $this_pos += $sequence_coord->length;
01592     }
01593   }
01594   my $cigar_line = "";
01595 
01596   my @pieces = split(/(\-+|\.+)/, $seq);
01597   foreach my $piece (@pieces) {
01598     my $mode;
01599     if ($piece =~ /\./) {
01600       $mode = "G"; # D for gaps (deletions)
01601     } elsif ($piece =~ /\-/) {
01602       $mode = "D"; # D for gaps (deletions)
01603     } else {
01604       $mode = "M"; # M for matches/mismatches
01605     }
01606     if (length($piece) == 1) {
01607       $cigar_line .= $mode;
01608     } elsif (length($piece) > 1) { #length can be 0 if the sequence starts with a gap
01609       $cigar_line .= length($piece).$mode;
01610     }
01611   }
01612 
01613   return $cigar_line;
01614 }
01615 
01616 
01617 =head2 get_all_underlying_Slices
01618 
01619   Arg  [1]   : int $startBasePair
01620                relative to start of slice, which is 1.
01621   Arg  [2]   : int $endBasePair
01622                relative to start of slice.
01623   Arg  [3]   : (optional) int $strand
01624                The strand of the slice to obtain sequence from. Default
01625                value is 1.
01626   Description: This Slice is made of several Bio::EnsEMBL::Slices mapped
01627                on it with gaps inside and regions with no matching
01628                sequence. This method returns these Slices (or part of
01629                them) with the original coordinates and the gapped
01630                sequence attached to them. Additionally, extra Slices
01631                could be returned in order to fill in gaps between
01632                underlying Slices.
01633   Returntype : listref of Bio::EnsEMBL::Slice objects
01634   Exceptions : end should be at least as big as start
01635   Caller     : general
01636 
01637 =cut
01638 
01639 sub get_all_underlying_Slices {
01640   my ($self, $start, $end, $strand) = @_;
01641   my $underlying_slices = [];
01642 
01643   $start = 1 if (!defined($start));
01644   $end ||= $self->length;
01645   $strand ||= 1;
01646 
01647   ## Fix coordinates (needed for sub_Slices)
01648   if ($self->strand == -1) {
01649     $strand = -$strand;
01650     my $aux = $start;
01651     $start = $self->start + ($self->length - $end);
01652     $end = $self->start + ($self->length - $aux);
01653   } else {
01654     $start += $self->start - 1;
01655     $end += $self->start - 1;
01656   }
01657 
01658   my $current_position;
01659 #   if ($strand == 1) {
01660     $current_position = $start;
01661 #   } else {
01662 #     $current_position = $end;
01663 #   }
01664   foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs}) {
01665     my $this_slice = $pair->{slice};
01666     my $mapper = $pair->{mapper};
01667     my $slice_start = $pair->{start};
01668     my $slice_end = $pair->{end};
01669     next if ($slice_start > $end or $slice_end < $start);
01670 
01671     my @sequence_coords = $mapper->map_coordinates(
01672             'alignment',
01673             $start,
01674             $end,
01675             $strand,
01676             'alignment'
01677         );
01678     my $this_subseq_start;
01679     my $this_subseq_end;
01680     my $this_subseq_strand;
01681     foreach my $sequence_coord (@sequence_coords) {
01682       ## $sequence_coord refer to genomic_align (a slice in the [+] strand)
01683       if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
01684         $this_subseq_start = $sequence_coord->start
01685             if (!defined($this_subseq_start) or $this_subseq_start > $sequence_coord->start);
01686         $this_subseq_end = $sequence_coord->end
01687             if (!defined($this_subseq_end) or $this_subseq_end < $sequence_coord->end);
01688         $this_subseq_strand = $sequence_coord->strand
01689             if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate") and !defined($this_subseq_strand));
01690       }
01691     }
01692 #     next if (!defined($this_subseq_start)); # if the whole requested region correspond to a gap
01693     my $start_position = ($start>$slice_start)?$start:$slice_start; # in AlignSlice coordinates
01694     my $end_position = ($end<$slice_end)?$end:$slice_end; # in AlignSlice coordinates
01695 #     if ($strand == 1) {
01696       if ($start_position > $current_position) {
01697         my $this_underlying_slice = new Bio::EnsEMBL::Slice(
01698               -coord_system => $gap_coord_system,
01699               -seq_region_name => "GAP",
01700               -start => $current_position,
01701               -end => $start_position - 1,
01702               -strand => 0
01703             );
01704         $this_underlying_slice->{seq} = "." x ($start_position - $current_position);
01705         push(@$underlying_slices, $this_underlying_slice);
01706       }
01707 #     } else {
01708 #       if ($end_position < $current_position) {
01709 #         my $this_underlying_slice = new Bio::EnsEMBL::Slice(
01710 #               -seq_region_name => "GAP",
01711 #               -start => $end_position + 1,
01712 #               -end => $current_position,
01713 #               -strand => 0
01714 #             );
01715 #         $this_underlying_slice->{seq} = "." x ($current_position - $end_position);
01716 #         unshift(@$underlying_slices, $this_underlying_slice);
01717 #       }
01718 #     }
01719     $current_position = $end_position + 1;
01720     my $this_underlying_slice;
01721     if (!defined($this_subseq_start)) {
01722       $this_underlying_slice = new Bio::EnsEMBL::Slice(
01723               -coord_system => $gap_coord_system,
01724               -seq_region_name => "GAP",
01725               -start => $start_position,
01726               -end => $end_position,
01727               -strand => 0
01728           );
01729       $this_underlying_slice->{seq} = "-" x ($end_position - $start_position + 1);
01730     } else {
01731       $this_underlying_slice = new Bio::EnsEMBL::Slice(
01732               -coord_system => $this_slice->coord_system,
01733               -seq_region_name => $this_slice->seq_region_name,
01734               -start => $this_subseq_start,
01735               -end => $this_subseq_end,
01736               -strand => $this_subseq_strand
01737           );
01738       $this_underlying_slice->{seq} = $self->subseq($start_position, $end_position, $strand);
01739       $this_underlying_slice->{_tree} = $this_slice->{_tree} if (defined($this_slice->{_tree}));
01740     }
01741 #     if ($strand == 1) {
01742       push(@$underlying_slices, $this_underlying_slice);
01743 #     } else {
01744 #       unshift(@$underlying_slices, $this_underlying_slice);
01745 #     }
01746   }
01747   if ($end >= $current_position) {
01748     my $this_underlying_slice = new Bio::EnsEMBL::Slice(
01749           -coord_system => $gap_coord_system,
01750           -seq_region_name => "GAP",
01751           -seq_region_length => ($end - $current_position + 1),
01752           -start => $current_position,
01753           -end => $end,
01754           -strand => 0
01755         );
01756     $this_underlying_slice->{seq} = "." x ($end - $current_position + 1);
01757     push(@$underlying_slices, $this_underlying_slice);
01758   }
01759   if ($strand == -1) {
01760     @$underlying_slices = reverse(@$underlying_slices);
01761   }
01762 
01763   return $underlying_slices;
01764 }
01765 
01766 =head2 get_original_seq_region_position
01767 
01768   Arg  [1]   : int $position
01769                relative to start of slice, which is 1.
01770   Description: This Slice is made of several Bio::EnsEMBL::Slices mapped
01771                on it with gaps inside and regions with no matching
01772                sequence. This method returns the original seq_region_position
01773                in the original Slice of the requested position in AlignSlice
01774                coordinates
01775   Example    : my ($slice, $seq_region_position) = $as_slice->
01776                    get_original_seq_region_position(100);
01777   Returntype : ($slice, $seq_region_position), an array where the first
01778                element is a Bio::EnsEMBL::Slice and the second one is the
01779                requested seq_region_position.
01780   Exceptions : if the position corresponds to a gap, the slice will be a fake GAP
01781                slice and the position will be the requested one (in AlignSlice
01782                coordinates)
01783   Caller     : general
01784 
01785 =cut
01786 
01787 sub get_original_seq_region_position {
01788   my ($self, $position) = @_;
01789   my $underlying_slice = $self->get_all_underlying_Slices($position, $position, 1)->[0];
01790 
01791   return ($underlying_slice, $underlying_slice->start);
01792 }
01793 
01794 
01795 =head2 get_all_constrained_elements
01796 
01797   Arg  1     : (opt) string $method_link_type (default = GERP_CONSTRAINED_ELEMENT)
01798   Arg  2     : (opt) listref Bio::EnsEMBL::Compara::GenomeDB $species_set
01799                (default, the set of species from the MethodLinkSpeciesSet used
01800                to build this AlignSlice)
01801   Example    : my $constrained_elements =
01802                     $align_slice->get_all_constrained_elements();
01803   Description: Retrieve the corresponding constrained elements for these alignments.
01804                Objects will be mapped on this AlignSlice::Slice, i.e. the
01805                reference_slice, reference_slice_start, reference_slice_end
01806                and reference_slice_strand will refer to this AlignSlice::Slice
01807                object
01808   Returntype : ref. to an array of Bio::EnsEMBL::Compara::GenomicAlignBlock
01809                objects.
01810   Caller     : object::methodname
01811   Status     : At risk
01812 
01813 =cut
01814 
01815 sub get_all_constrained_elements {
01816   my ($self, $method_link_type, $species_set) = @_;
01817   my $all_constrained_elements = [];
01818 
01819   return [] if (!$self->{_align_slice});
01820   my $all_original_constrained_elements = $self->{_align_slice}->
01821       get_all_constrained_elements($method_link_type, $species_set);
01822   foreach my $this_constrained_element (@$all_original_constrained_elements) {
01823     foreach my $this_genomic_align (@{$this_constrained_element->get_all_GenomicAligns}) {
01824       if ($this_genomic_align->genome_db->name eq $self->genome_db->name) {
01825         my $constrained_slice = $this_genomic_align->get_Slice;
01826         my $slices = $self->map_original_Slice($constrained_slice);
01827         $this_constrained_element->reference_slice($self);
01828         $this_constrained_element->reference_slice_start($slices->[0]->start);
01829         $this_constrained_element->reference_slice_end($slices->[0]->end);
01830         $this_constrained_element->reference_slice_strand($slices->[0]->strand);
01831         push(@$all_constrained_elements, $this_constrained_element);
01832       }
01833     }
01834   }
01835 
01836   return $all_constrained_elements;
01837 }
01838 
01839 
01840 =head2 map_original_Slice
01841 
01842   Arg  [1]   : Bio::EnsEMBL::Slice $original_slice
01843   Description: This Slice is made of several Bio::EnsEMBL::Slices mapped
01844                on it with gaps inside and regions with no matching
01845                sequence. This method tries to map on this Slice the
01846                region(s) corresponding to the provided $original_slice
01847                NB: This method does not know how to project Slices onto
01848                other coordinate systems. It is your responsability to
01849                provide an original Slice on the same coordinate system
01850                as the underlying Bio::EnsEMBL::Slices
01851   Example    : my $slices = $as_slice->map_original_Slice($orginal_slice);
01852   Returntype : listref of Bio::EnsEMBL::Compara::AlignSlice::Slice objects
01853                which are the sub_Slices of this Bio::EnsEMBL::Compara::
01854                AlignSlice::Slice where the $original_slice maps
01855   Exceptions :
01856 
01857 =cut
01858 
01859 sub map_original_Slice {
01860   my ($self, $original_slice) = @_;
01861   my $mapped_slices = [];
01862 
01863   return $mapped_slices if (!defined($original_slice));
01864 
01865   foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) {
01866     my $this_slice = $pair->{slice};
01867     my $mapper = $pair->{mapper};
01868     my $slice_start = $pair->{start};
01869     my $slice_end = $pair->{end};
01870     next if (!$this_slice->coord_system->equals($original_slice->coord_system));
01871     next if ($this_slice->seq_region_name ne $original_slice->seq_region_name);
01872 
01873     next if ($this_slice->start > $original_slice->end or $this_slice->end < $original_slice->start);
01874 
01875     my @sequence_coords = $mapper->map_coordinates(
01876             'sequence',
01877             $original_slice->start,
01878             $original_slice->end,
01879             $original_slice->strand,
01880             'sequence'
01881         );
01882     my $mapped_start;
01883     my $mapped_end;
01884     my $mapped_strand;
01885     foreach my $sequence_coord (@sequence_coords) {
01886       if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
01887         if (!defined($mapped_start) or ($mapped_start > $sequence_coord->start)) {
01888           $mapped_start = $sequence_coord->start;
01889         }
01890         if (!defined($mapped_end) or ($mapped_end < $sequence_coord->end)) {
01891           $mapped_end = $sequence_coord->end;
01892         }
01893         if (!defined($mapped_strand)) {
01894           $mapped_strand = $sequence_coord->strand;
01895         } elsif ($mapped_strand != $sequence_coord->strand) {
01896           warning("strand inversion within a Slice-Mapper pair!");
01897           $mapped_start = undef;
01898           $mapped_end = undef;
01899           $mapped_strand = undef;
01900           last;
01901         }
01902       }
01903     }
01904     if (defined($mapped_start) and defined($mapped_end) and defined($mapped_strand)) {
01905       my $sub_slice = $self->sub_Slice($mapped_start, $mapped_end, $mapped_strand);
01906       push(@$mapped_slices, $sub_slice) if ($sub_slice);
01907     }
01908   }
01909 
01910   return $mapped_slices;
01911 }
01912 
01913 
01914 =head2 expand (not supported)
01915 
01916 Expanding a Bio::EnsEMBL::Compara::AlignSlice::Slice object is not supported at the moment.
01917 You could create a new Bio::EnsEMBL::Slice and fetch a new Bio::EnsEMBL::Compara::AlignSlice
01918 from it. On the hand, if you only want to extend the sequence, you could use the subseq method
01919 with a negatve start value or an end value larger than the end of this Slice.
01920 
01921 This method returns undef
01922 
01923 =cut
01924 
01925 sub expand {
01926   my $self = shift;
01927   warning("Cannot expand a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); 
01928   return undef;
01929 }
01930 
01931 
01932 =head2 get_all_Attributes
01933 
01934   Arg [1]    : optional string $attrib_code
01935                The code of the attribute type to retrieve values for.
01936   Example    : ($htg_phase) = @{$slice->get_all_Attributes('htg_phase')};
01937                @slice_attributes    = @{$slice->get_all_Attributes()};
01938   Description: Gets a list of Attributes of all teh underlying slice''s
01939                seq_region. Optionally just get Attributes for given code.
01940                This Slice is made of several Bio::EnsEMBL::Slices mapped
01941                on it. This method go through all of them, retrieves the
01942                data and return them in order. There will be one set of
01943                Attributes by underlying slice.
01944   Returntype : listref Bio::EnsEMBL::Attribute
01945   Exceptions : warning if slice does not have attached adaptor
01946   Caller     : general
01947 
01948 =cut
01949 
01950 sub get_all_Attributes {
01951   my $self = shift;
01952   my $attributes = [];
01953 
01954   foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) {
01955     my $this_slice = $pair->{slice};
01956     push(@$attributes, @{$this_slice->get_all_Attributes(@_)});
01957   }
01958   return $attributes;
01959 }
01960 
01961 
01962 =head2 get_all_VariationFeatures
01963 
01964     Args       : none
01965     Description :returns all variation features on this slice. This function will only work 
01966                 correctly if the variation database has been attached to the core database.
01967                 This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This
01968                 method go through all of them, retrieves the data and maps them on this
01969                 Bio::EnsEMBL::Compara::AlignSlice::Slice object.
01970     ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature
01971     Exceptions : none
01972     Caller     : contigview, snpview
01973 
01974 =cut
01975 
01976 sub get_all_VariationFeatures {
01977   my $self = shift;
01978 
01979   return $self->_method_returning_simple_features("get_all_VariationFeatures", @_)
01980 }
01981 
01982 
01983 =head2 get_all_genotyped_VariationFeatures
01984 
01985   Args       : none
01986   Function   : returns all variation features on this slice that have been genotyped. This
01987                function will only work correctly if the variation database has been
01988                attached to the core database.
01989                This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This
01990                method go through all of them, retrieves the data and maps them on this
01991                Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end,
01992                strand and slice attributes.
01993   ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature
01994   Exceptions : none
01995   Caller     : contigview, snpview
01996 
01997 =cut
01998 
01999 sub get_all_genotyped_VariationFeatures {
02000   my $self = shift;
02001 
02002   return $self->_method_returning_simple_features("get_all_genotyped_VariationFeatures", @_)
02003 }
02004 
02005 
02006 =head2 get_all_RepeatFeatures
02007 
02008   Arg [1]    : (optional) string $logic_name
02009                The name of the analysis performed on the repeat features
02010                to obtain.
02011   Arg [2]    : (optional) string $repeat_type
02012                Limits features returned to those of the specified repeat_type
02013   Example    : @repeat_feats = @{$slice->get_all_RepeatFeatures(undef,'LTR')};
02014   Description: Retrieves the RepeatFeatures which overlap  with
02015                logic name $logic_name and with score above $score.  If 
02016                $logic_name is not defined features of all logic names are 
02017                retrieved.
02018                This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This
02019                method go through all of them, retrieves the data and maps them on this
02020                Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end,
02021                strand and slice attributes.
02022   Returntype : listref of Bio::EnsEMBL::RepeatFeatures
02023   Exceptions : warning if slice does not have attached adaptor
02024   Caller     : general
02025 
02026 =cut
02027 
02028 sub get_all_RepeatFeatures {
02029   my $self = shift;
02030 
02031   return $self->_method_returning_simple_features("get_all_RepeatFeatures", @_)
02032 }
02033 
02034 
02035 =head2 project (testing)
02036 
02037   Arg [1]    : string $name
02038                The name of the coordinate system to project this slice onto
02039   Arg [2]    : string $version
02040                The version of the coordinate system (such as 'NCBI34') to
02041                project this slice onto
02042   Example    :
02043     my $clone_projection = $slice->project('clone');
02044 
02045     foreach my $seg (@$clone_projection) {
02046       my $clone = $segment->to_Slice();
02047       print $slice->seq_region_name(), ':', $seg->from_start(), '-',
02048             $seg->from_end(), ' -> ',
02049             $clone->seq_region_name(), ':', $clone->start(), '-',$clone->end(),
02050             $clone->strand(), "\n";
02051     }
02052   Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This
02053                method go through all of them, porject them and maps the projections
02054                on this Bio::EnsEMBL::Compara::AlignSlice::Slice object.
02055                The original 'project' method returns the results of 'projecting'
02056                a slice onto another coordinate system.  Projecting to a coordinate
02057                system that the slice is assembled from is analagous to retrieving a tiling
02058                path. The original method may also be used to 'project up' to a higher
02059                level coordinate system, however.
02060 
02061                This method returns a listref of triplets [start,end,slice]
02062                which represents the projection.  The start and end defined the
02063                region of this slice which is made up of the third value of
02064                the triplet: a slice in the requested coordinate system.
02065 
02066                Because of the gaps in the mapping of the Bio::EnsEMBL::Slices
02067                the lenght of the slice returned in the tripet may be different
02068                than the distance defined by the start and end of the
02069                Bio::EnsEMBL::ProjectionSegment object.
02070   Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which
02071                can also be used as [$start,$end,$slice] triplets
02072   Exceptions : none
02073   Caller     : general
02074 
02075 =cut
02076 
02077 sub project {
02078   my $self = shift;
02079   my $cs_name = shift;
02080   my $cs_version = shift;
02081   my $projections = [];
02082 
02083   throw('Coord_system name argument is required') if(!$cs_name);
02084 
02085   foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) {
02086     my $this_slice = $pair->{slice};
02087     my $this_mapper = $pair->{mapper};
02088     my $this_projections = $this_slice->project($cs_name, $cs_version);
02089     foreach my $this_projection (@$this_projections) {
02090       my ($this_start, $this_end);
02091       if ($this_slice->strand == 1) {
02092         $this_start = $this_slice->start + $this_projection->from_start - 1;
02093         $this_end = $this_slice->start + $this_projection->from_end - 1;
02094       } else {
02095         $this_start = $this_slice->start + ($this_slice->length - $this_projection->from_start);
02096         $this_end = $this_slice->start + ($this_slice->length - $this_projection->from_end);
02097       }
02098       my $new_slice = $this_projection->to_Slice;
02099       my ($new_start, $new_end);
02100       my @alignment_coords = $this_mapper->map_coordinates(
02101               'sequence',
02102               $this_start,
02103               $this_start,
02104               1,
02105               'sequence'
02106           );
02107       foreach my $alignment_coord (@alignment_coords) {
02108         if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
02109           $new_start = $alignment_coord->start;
02110         }
02111       }
02112       next if (!defined($new_start));
02113       @alignment_coords = $this_mapper->map_coordinates(
02114               'sequence',
02115               $this_end,
02116               $this_end,
02117               1,
02118               'sequence'
02119           );
02120       foreach my $alignment_coord (@alignment_coords) {
02121         if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
02122           $new_end = $alignment_coord->start;
02123         }
02124       }
02125       next if (!defined($new_end));
02126 
02127       ## Truncate projection in order to fit into this AlignSlice
02128       if ($new_start > $self->end) {
02129         next; ## Maps outside of sub_AlignSlice
02130       } elsif ($new_start < $self->start) {
02131         if ($new_slice->strand == -1) {
02132           $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($self->start - $new_start));
02133         } else {
02134           $new_slice = $new_slice->sub_Slice($self->start - $new_start + 1, $new_slice->length);
02135         }
02136         $new_start = 1;
02137         $new_end = $new_slice->length;
02138       }
02139 
02140       ## Truncate projection in order to fit into this AlignSlice
02141       if ($new_end < $self->start) {
02142         next; ## Maps outside of sub_AlignSlice
02143       } elsif ($new_end > $self->length) {
02144         # We have to truncate this projection
02145         if ($new_slice->strand == -1) {
02146           $new_slice = $new_slice->sub_Slice(1 + $new_end - $self->length, $new_slice->length);
02147         } else {
02148           $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($new_end - $self->length));
02149         }
02150         $new_end = $self->length;
02151       }
02152 
02153       my $new_projection = bless([$new_start, $new_end, $new_slice],
02154                                 "Bio::EnsEMBL::ProjectionSegment");
02155       push(@$projections, $new_projection);
02156     }
02157   }
02158 
02159   return $projections;
02160 }
02161 
02162 
02163 =head2 _method_returning_simple_features
02164 
02165   Args[1]     : method_name
02166   Description : This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This
02167                 method go through all of them, calls method_name and maps teh result on
02168                 this Bio::EnsEMBL::Compara::AlignSlice::Slice object.
02169   ReturnType  : listref of Bio::EnsEMBL::Variation::VariationFeature
02170   Exceptions  : none
02171   Caller      : contigview, snpview
02172 
02173 =cut
02174 
02175 sub _method_returning_simple_features {
02176   my $self = shift;
02177   my $method = shift;
02178   my $ret = [];
02179 
02180   foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) {
02181     my $this_slice = $pair->{slice};
02182     my $this_mapper = $pair->{mapper};
02183     my $this_ret = $this_slice->$method(@_);
02184     foreach my $this_object (@$this_ret) {
02185       my ($this_start, $this_end);
02186       if ($this_slice->strand == 1) {
02187         $this_start = $this_object->slice->start + $this_object->start - 1;
02188         $this_end = $this_object->slice->start + $this_object->end - 1;
02189       } else {
02190         $this_start = $this_object->slice->start + ($this_object->slice->length - $this_object->end);
02191         $this_end = $this_object->slice->start + ($this_object->slice->length - $this_object->start);
02192       }
02193       my @alignment_coords = $this_mapper->map_coordinates(
02194               'sequence',
02195               $this_start,
02196               $this_end,
02197               $this_object->strand,
02198               'sequence'
02199           );
02200       my ($start, $end, $strand);
02201       foreach my $alignment_coord (@alignment_coords) {
02202         if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) {
02203           if (!defined($start) or $start > $alignment_coord->start) {
02204             $start = $alignment_coord->start;
02205           }
02206           if (!defined($end) or $end < $alignment_coord->end) {
02207             $end = $alignment_coord->end;
02208           }
02209           if (!defined($strand)) {
02210             $strand = $alignment_coord->strand * $this_slice->strand;
02211           }
02212         }
02213       }
02214       my $new_object;
02215       %$new_object = %$this_object;
02216       bless $new_object, ref($this_object);
02217       if (defined($start) and defined($end) and defined($strand)) {
02218         $new_object->{start} = $start - $self->start + 1;
02219         $new_object->{end} = $end - $self->start + 1;
02220         $new_object->{strand} = $strand;
02221         $new_object->{slice} = $self;
02222         # Skip this object if it maps outside of this AlignSlice
02223         next if ($new_object->{start} > $self->length or $new_object->{end} < 1);
02224         push(@$ret, $new_object);
02225       }
02226     }
02227   }
02228   return $ret;
02229 }
02230 
02231 1;