Slice.pm
Go to the documentation of this file.
00001 # 00002 # Ensembl module for Bio::EnsEMBL::Compara::AlignSlice::Slice 00003 # 00004 # Original author: Javier Herrero <jherrero@ebi.ac.uk> 00005 # 00006 # Copyright EnsEMBL Team 00007 # 00008 # You may distribute this module under the same terms as perl itself 00009 00010 # pod documentation - main docs before the code 00011 00012 =head1 NAME 00013 00014 Bio::EnsEMBL::Compara::AlignSlice::Slice - These objects contain all the information needed 00015 for mapping features through genomic alignments 00016 00017 =head1 INHERITING 00018 00019 This module inherits methods and attributes from Bio::EnsEMBL::Slice module. 00020 00021 =head1 SYNOPSIS 00022 00023 00024 SET VALUES 00025 00026 GET VALUES 00027 00028 =head1 OBJECT ATTRIBUTES 00029 00030 =over 00031 00032 =item attribute 00033 00034 Description 00035 00036 =back 00037 00038 =head1 AUTHORS 00039 00040 Javier Herrero (jherrero@ebi.ac.uk) 00041 00042 =head1 COPYRIGHT 00043 00044 Copyright (c) 1999-2012. EnsEMBL Team 00045 00046 You may distribute this module under the same terms as perl itself 00047 00048 =head1 CONTACT 00049 00050 This modules is part of the EnsEMBL project (http://www.ensembl.org) 00051 00052 Questions can be posted to the ensembl-dev mailing list: 00053 dev@ensembl.org 00054 00055 =head1 APPENDIX 00056 00057 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ 00058 00059 =cut 00060 00061 00062 # Let the code begin... 00063 00064 00065 package Bio::EnsEMBL::Compara::AlignSlice::Slice; 00066 00067 use strict; 00068 use Bio::EnsEMBL::Slice; 00069 use Bio::EnsEMBL::CoordSystem; 00070 use Bio::EnsEMBL::Compara::AlignSlice::Translation; 00071 use Bio::EnsEMBL::Utils::Argument qw(rearrange); 00072 use Bio::EnsEMBL::Utils::Exception qw(throw warning info verbose); 00073 use Scalar::Util qw(weaken); 00074 00075 00076 our @ISA = qw(Bio::EnsEMBL::Slice); 00077 00078 ## Creates a new coordinate system for creating gap Slices. 00079 my $gap_coord_system = new Bio::EnsEMBL::CoordSystem( 00080 -NAME => 'gap', 00081 -VERSION => "none", 00082 -TOP_LEVEL => 0, 00083 -SEQUENCE_LEVEL => 1, 00084 -RANK => 1, 00085 ); 00086 00087 00088 =head2 new (CONSTRUCTOR) 00089 00090 Arg[1] : 00091 Example : 00092 Description: 00093 Returntype : 00094 Exceptions : 00095 Caller : Bio::EnsEMBL::Compara::AlignSlice->_create_underlying_Slices() 00096 00097 =cut 00098 00099 sub new { 00100 my ($class, @args) = @_; 00101 00102 my $self = {}; 00103 bless $self,$class; 00104 00105 my ($length, $requesting_slice, $align_slice, $method_link_species_set, $genome_db, $expanded) = 00106 rearrange([qw( 00107 LENGTH REQUESTING_SLICE ALIGN_SLICE METHOD_LINK_SPECIES_SET GENOME_DB EXPANDED 00108 )], @args); 00109 00110 my $version = ""; 00111 if ($requesting_slice and ref($requesting_slice) and 00112 $requesting_slice->isa("Bio::EnsEMBL::Slice")) { 00113 $self->{'requesting_slice'} = $requesting_slice; 00114 my $name = $requesting_slice->name; 00115 $name =~ s/\:/_/g; 00116 $version .= $name; 00117 } 00118 weaken($self->{_align_slice} = $align_slice) if ($align_slice); 00119 if ($method_link_species_set and ref($method_link_species_set) and 00120 $method_link_species_set->isa("Bio::EnsEMBL::Compara::MethodLinkSpeciesSet")) { 00121 $self->{_method_link_species_set} = $method_link_species_set; 00122 $version .= "+".$method_link_species_set->method_link_type; 00123 my $species_set = $method_link_species_set->species_set(); 00124 if ($species_set) { 00125 $species_set = [sort {$a->name cmp $b->name} @{$species_set}]; 00126 $version .= "(\"".join("\"+\"", map {$_->name} @$species_set)."\")"; 00127 } 00128 if ($expanded) { 00129 $version .= "+expanded"; 00130 } else { 00131 $version .= "+condensed"; 00132 } 00133 } 00134 my $coord_system = new Bio::EnsEMBL::CoordSystem( 00135 -NAME => 'align_slice', 00136 -VERSION => $version, 00137 -TOP_LEVEL => 0, 00138 -SEQUENCE_LEVEL => 1, 00139 -RANK => 1, 00140 ); 00141 00142 $self->{start} = 1; 00143 $self->{end} = $length; 00144 $self->{strand} = 1; 00145 $self->{adaptor} = undef; 00146 $self->{coord_system} = $coord_system; 00147 $self->genome_db($genome_db) if (defined($genome_db)); 00148 $self->{seq_region_name} = (eval{$genome_db->name} or "FakeAlignSlice"); 00149 $self->{display_Slice_name} = $self->{seq_region_name}; 00150 $self->{display_Slice_name} =~ s/ /_/g; 00151 $self->{seq_region_length} = $length; 00152 $self->{slice_mapper_pairs} = []; 00153 # $self->{located_slices} = []; 00154 00155 if (!$self->genome_db) { 00156 throw("You must specify a Bio::EnsEMBL::Compara::GenomeDB when\n". 00157 "creating a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); 00158 } 00159 00160 return $self; 00161 } 00162 00163 00164 =head2 genome_db 00165 00166 Arg[1] : Bio::EnsEMBL::Compara::GenomeDB $genome_db 00167 Example : $slice->genome_db($human_gdb); 00168 Description: getter/setter for the attribute genome_db 00169 Returntype : Bio::EnsEMBL::Compara::GenomeDB object 00170 Exceptions : This attribute should never be unset. Several methods 00171 rely on this attribute. 00172 Caller : $object->methodname 00173 00174 =cut 00175 00176 sub genome_db { 00177 my ($self, $genome_db) = @_; 00178 00179 if (defined($genome_db)) { 00180 throw("[$genome_db] must bu a Bio::EnsEMBL::Compara::GenomeDB object") 00181 unless ($genome_db and ref($genome_db) and $genome_db->isa("Bio::EnsEMBL::Compara::GenomeDB")); 00182 $self->{genome_db} = $genome_db; 00183 } 00184 00185 return $self->{genome_db}; 00186 } 00187 00188 00189 =head2 display_Slice_name 00190 00191 Arg[1] : string $name 00192 Example : $slice->display_Slice_name("Homo_sapiens"); 00193 Description: getter/setter for the attribute display_Slice_name 00194 Returntype : string 00195 Caller : $object->methodname 00196 00197 =cut 00198 00199 sub display_Slice_name { 00200 my ($self, $display_slice_name) = @_; 00201 00202 if (defined($display_slice_name)) { 00203 $self->{display_Slice_name} = $display_slice_name; 00204 } 00205 00206 return ucfirst $self->{display_Slice_name}; 00207 } 00208 00209 00210 =head2 add_Slice_Mapper_pair 00211 00212 Arg[1] : Bio::EnsEMBL::Slice $slice 00213 Arg[2] : Bio::EnsEMBL::Mapper $mapper 00214 Arg[3] : integer $start 00215 Arg[4] : integer $end 00216 Arg[5] : integer $strand 00217 Example : $slice->add_Slice_Mapper_pair($slice, $mapper, 124, 542, -1); 00218 Description: Attaches a pair of Slice and the corresponding Mapper 00219 to this Bio::EnsEMBL::Compara::AlginSlice::Slice 00220 object. The Mapper contains the information for 00221 mapping coordinates from (to) the Slice to (from) the 00222 Bio::EnsEMBL::Compara::AlignSlice. Start, end and strand 00223 locates the $slice in the AlignSlice::Slice. 00224 $start and $end refer to the Coordinate System. If you are using 00225 a sub_Slice, this method will be using the coordinates of the 00226 original Bio::EnsEMBL::Compara::AlignSlice::Slice object! 00227 Returntype : 00228 Exceptions : throws if $slice is not a Bio::EnsEMBL::Slice object 00229 Exceptions : throws if $mapper is not a Bio::EnsEMBL::Mapper object 00230 00231 =cut 00232 00233 sub add_Slice_Mapper_pair { 00234 my ($self, $slice ,$mapper, $start, $end, $strand) = @_; 00235 00236 if (!$slice or !ref($slice) or !$slice->isa("Bio::EnsEMBL::Slice")) { 00237 throw("[$slice] must be a Bio::EnsEMBL::Slice object"); 00238 } 00239 if (!$mapper or !ref($mapper) or !$mapper->isa("Bio::EnsEMBL::Mapper")) { 00240 throw("[$slice] must be a Bio::EnsEMBL::Mapper object"); 00241 } 00242 00243 push(@{$self->{slice_mapper_pairs}}, { 00244 slice => $slice, 00245 mapper => $mapper, 00246 start => $start, 00247 end => $end, 00248 strand => $strand, 00249 }); 00250 00251 return $self->{slice_mapper_pairs}; 00252 } 00253 00254 00255 =head2 get_all_Slice_Mapper_pairs 00256 00257 Arg[1] : [optional] bool $get_gap_slices 00258 Example : $slice_mapper_pairs = $slice->get_all_Slice_Mapper_pairs(); 00259 Description: Returns all pairs of Slices and Mappers attached to this 00260 Bio::EnsEMBL::Compara::AlignSlice::Slice 00261 The $get_gap_slice flag is normally used internally to get the 00262 the gap slices. These are created when the alignment(s) 00263 underlying the AlignSlice correspond to gaps only in one or more 00264 species. These are used to tell the difference between gap due to 00265 lack of alignments and gaps due to the alignments. If you set this 00266 this flag to true you will get these gap slices back but it is your 00267 responsability to deal with these gap slices properly. 00268 Returntype : list ref of hashes which keys are "slice", "mapper", "start", 00269 "end" and "strand". Each hash corresponds to a pair of Slice 00270 and Mapper and the coordintes needed to locate the Slice in 00271 the AlignSlice::Slice. 00272 start and end refer to the Coordinate System. If you are using 00273 a sub_Slice, this method will be using the coordinates of the 00274 original Bio::EnsEMBL::Compara::AlignSlice::Slice object! 00275 Exceptions : return a ref to an empty list if no pairs have been attached 00276 so far. 00277 00278 =cut 00279 00280 sub get_all_Slice_Mapper_pairs { 00281 my ($self, $get_gap_slices) = @_; 00282 my $slice_mapper_pairs = ($self->{slice_mapper_pairs} or []); 00283 if (!$get_gap_slices) { 00284 $slice_mapper_pairs = [grep {$_->{slice}->coord_system_name ne "alignment"} @$slice_mapper_pairs]; 00285 } 00286 00287 return $slice_mapper_pairs; 00288 } 00289 00290 00291 =head2 get_all_Genes_by_type 00292 00293 Arg [1] : string $type 00294 The biotype of genes wanted. 00295 Arg [2] : (optional) string $logic_name 00296 Arg [3] : (optional) boolean $load_transcripts 00297 If set to true, transcripts will be loaded immediately rather 00298 than being lazy-loaded on request. This will result in a 00299 significant speed up if the Transcripts and Exons are going to 00300 be used (but a slow down if they are not). 00301 Example : @genes = @{$slice->get_all_Genes_by_type('protein_coding', 00302 'ensembl')}; 00303 Description: Retrieves genes that overlap this slice of biotype $type. 00304 This is primarily used by the genebuilding code when several 00305 biotypes of genes are used. 00306 00307 The logic name is the analysis of the genes that are retrieved. 00308 If not provided all genes will be retrieved instead. 00309 00310 This methods overwrites the core one since it sends a warning 00311 message and return an empty array because this AlignSlice::Slice 00312 object has no adaptor. This implementation calls the 00313 get_all_Genes methdo elsewhere in this module to fulfil the 00314 query. 00315 00316 Returntype : listref of Bio::EnsEMBL::Genes 00317 Exceptions : none 00318 Caller : genebuilder, general 00319 Status : Stable 00320 00321 =cut 00322 00323 sub get_all_Genes_by_type{ 00324 my ($self, $type, $logic_name, $load_transcripts) = @_; 00325 00326 my @out = grep { $_->biotype eq $type } 00327 @{ $self->get_all_Genes($logic_name, undef, $load_transcripts)}; 00328 00329 return \@out; 00330 } 00331 00332 00333 =head2 get_all_Genes 00334 00335 Arg [1] : (optional) string $logic_name 00336 The name of the analysis used to generate the genes to retrieve 00337 Arg [2] : (optional) string $dbtype 00338 The dbtype of genes to obtain. 00339 Arg [3] : (optional) boolean $load_transcripts 00340 This option is always disabled for AlingSlices. It only exists for 00341 compatibility with the Bio::EnsEMBL::Slice objects. 00342 Example : @genes = @{$slice->get_all_Genes}; 00343 Description: Retrieves all genes that overlap this slice. 00344 Returntype : listref of Bio::EnsEMBL::Genes 00345 Exceptions : none 00346 Caller : none 00347 00348 =cut 00349 00350 sub get_all_Genes { 00351 my ($self, $logic_name, $dbtype, $load_transcripts, @parameters) = @_; 00352 00353 $logic_name ||= ""; 00354 $dbtype ||= "core"; 00355 my ($max_repetition_length, 00356 $strict_order_of_exon_pieces, 00357 $strict_order_of_exons, 00358 $return_unmapped_exons) = rearrange([qw( 00359 MAX_REPETITION_LENGTH 00360 STRICT_ORDER_OF_EXON_PIECES 00361 STRICT_ORDER_OF_EXONS 00362 RETURN_UNMAPPED_EXONS 00363 )], @parameters); 00364 my $max_gap_length = 0; 00365 my $max_intron_length = 0; 00366 00367 $max_repetition_length = 100 if (!defined($max_repetition_length)); 00368 $strict_order_of_exon_pieces = 1 if (!defined($strict_order_of_exon_pieces)); 00369 $strict_order_of_exons = 0 if (!defined($strict_order_of_exons)); 00370 $return_unmapped_exons = 1 if (!defined($return_unmapped_exons)); 00371 00372 my $key = 'key_'. 00373 $dbtype.":". 00374 $logic_name.":". 00375 $max_repetition_length.":". 00376 $strict_order_of_exon_pieces.":". 00377 $strict_order_of_exons.":". 00378 $return_unmapped_exons; 00379 00380 if (!defined($self->{$key})) { 00381 my $all_genes = []; 00382 00383 my $all_pairs = $self->get_all_Slice_Mapper_pairs; 00384 return [] if (!$all_pairs or !@$all_pairs); 00385 00386 ## Create larger Slices in order to speed up fetching of genes 00387 my $all_slices_coordinates; 00388 my $this_slice_coordinates; 00389 my $this_slice_adaptor = $all_pairs->[0]->{slice}->adaptor; 00390 foreach my $pair (sort { 00391 $a->{slice}->seq_region_name cmp $b->{slice}->seq_region_name or 00392 $a->{slice}->start <=> $b->{slice}->start } @$all_pairs) { 00393 # print STDERR "Foreach pair ($pair)... $pair->{slice}->{seq_region_name} $pair->{slice}->{start}\n"; 00394 00395 my $this_slice = $pair->{slice}; 00396 if ($this_slice_coordinates and 00397 ($this_slice_coordinates->{seq_region_name} eq $this_slice->seq_region_name) and 00398 (($this_slice->start - $this_slice_coordinates->{end}) < 10000000)) { 00399 ## lengthen current slice_coordinates 00400 $this_slice_coordinates->{end} = $this_slice->end; 00401 } else { 00402 ## save a deep copy (if needed) and reset current slice_coordinates 00403 if ($this_slice_coordinates->{seq_region_name}) { 00404 my $new_slice_coordinates = { 00405 "coord_system_name" => $this_slice_coordinates->{coord_system_name}, 00406 "seq_region_name" => $this_slice_coordinates->{seq_region_name}, 00407 "start" => $this_slice_coordinates->{start}, 00408 "end" => $this_slice_coordinates->{end}, 00409 "pairs" => $this_slice_coordinates->{pairs} 00410 }; 00411 push(@$all_slices_coordinates, $new_slice_coordinates); 00412 } 00413 $this_slice_coordinates->{coord_system_name} = $this_slice->coord_system_name; 00414 $this_slice_coordinates->{seq_region_name} = $this_slice->seq_region_name; 00415 $this_slice_coordinates->{start} = $this_slice->start; 00416 $this_slice_coordinates->{end} = $this_slice->end; 00417 $this_slice_coordinates->{pairs} = []; 00418 } 00419 ## Add this pair to the set of pairs of the current slice_coordinates 00420 push(@{$this_slice_coordinates->{pairs}}, $pair); 00421 } 00422 push(@$all_slices_coordinates, $this_slice_coordinates) if ($this_slice_coordinates); 00423 00424 foreach $this_slice_coordinates (@$all_slices_coordinates) { 00425 # print STDERR "Foreach this_slice_coordinates...\n"; 00426 my $this_slice = $this_slice_adaptor->fetch_by_region( 00427 $this_slice_coordinates->{coord_system_name}, 00428 $this_slice_coordinates->{seq_region_name}, 00429 $this_slice_coordinates->{start}, 00430 $this_slice_coordinates->{end} 00431 ); 00432 00433 ## Do not load transcripts immediately or the cache could produce 00434 ## some troubles in some special cases! Moreover, in this way we will have 00435 ## the genes, the transcripts and the exons in the same slice!! 00436 my $these_genes = $this_slice->get_all_Genes($logic_name, $dbtype); 00437 foreach my $pair (@{$this_slice_coordinates->{pairs}}) { 00438 # print STDERR "Foreach pair ($pair)...\n"; 00439 foreach my $this_gene (@$these_genes) { 00440 # print STDERR "1. GENE: $this_gene->{stable_id} ($this_gene->{start} - $this_gene->{end})\n"; 00441 my $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair, $return_unmapped_exons); 00442 # print STDERR "2. GENE: $mapped_gene->{stable_id} ($mapped_gene->{start} - $mapped_gene->{end})\n"; 00443 # $mapped_gene = $self->_get_mapped_Gene($this_gene, $pair); 00444 if ($mapped_gene and @{$mapped_gene->get_all_Transcripts}) { 00445 push(@$all_genes, $mapped_gene); 00446 } 00447 } 00448 } 00449 } 00450 00451 $all_genes = $self->_compile_mapped_Genes( 00452 $all_genes, 00453 $max_repetition_length, 00454 $max_gap_length, 00455 $max_intron_length, 00456 $strict_order_of_exon_pieces, 00457 $strict_order_of_exons 00458 ); 00459 00460 $self->{$key} = $all_genes; 00461 } 00462 00463 return $self->{$key}; 00464 } 00465 00466 00467 =head2 _get_mapped_Gene 00468 00469 Arg[1] : Bio::EnsEMBL::Gene $original_gene 00470 Arg[2] : Bio::EnsEMBL::Compara::GenomicAlign $genomic_align 00471 Example : my $mapped_gene = $align_slice->get_mapped_Gene($orignal_gene, $genomic_align); 00472 Description: returns a new Bio::EnsEMBL::Gene object. Mapping is based on exons. 00473 The object returned contains Bio::EnsEMBL::Transcripts objects. Those 00474 mapped transcripts contain Bio::EnsEMBL::Compara::AlignSlice::Exon objects. 00475 If no exon can be mapped, the returned object will contain an empty array of 00476 transcripts. Since mapped objects are not stored in the DB, they have no dbID 00477 and no adaptor. 00478 Returntype : Bio::EnsEMBL::Gene object. (new object) 00479 Exceptions : returns undef if no part of $original_gene can be mapped using this 00480 $genomic_align. This method assumes that the slices of all the exons got 00481 from that gene have the same coordinates as the gene slice. It will throw 00482 if this is not true. 00483 Caller : get_all_Genes 00484 00485 =cut 00486 00487 sub _get_mapped_Gene { 00488 my ($self, $gene, $pair, $return_unmapped_exons) = @_; 00489 00490 ## $range_start and $range_end are used to get rid of extra genes fetched because of the 00491 ## previous speed improvement where several slices are merged in order to access the DB 00492 ## a minimum number of times. $pair->{slice} correspond to the actual alignment while 00493 ## $gene->slice might be much larger and include several alignments. $range_start and 00494 ## $range_end are the coordinates (using $gene->slice as a ref.) where the alignment is. 00495 ## If the gene (or later on, the exon) falls outside of this range, it can be discarded 00496 ## as it will be impossible to map it with using $pair->mapper! 00497 my $range_start = $pair->{slice}->start - $gene->slice->start + 1; 00498 my $range_end = $pair->{slice}->end - $gene->slice->start + 1; 00499 return undef if (($gene->start > $range_end) or ($gene->end < $range_start)); 00500 00501 my $from_mapper = $pair->{mapper}; 00502 my $to_mapper = $self->{to_mapper}; 00503 00504 my $these_transcripts = []; 00505 00506 foreach my $this_transcript (@{$gene->get_all_Transcripts}) { 00507 my $these_exons = []; 00508 my $all_exons = $this_transcript->get_all_Exons; 00509 for (my $i=0; $i<@$all_exons; $i++){ 00510 my $this_exon = $all_exons->[$i]; 00511 throw("Oops, this method assumes that all the exons are defined on the same slice as the". 00512 " gene they belong to") if ($this_exon->slice->start != $gene->slice->start); 00513 if ($this_exon->start <= $range_end and $this_exon->end >= $range_start) { 00514 my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( 00515 -EXON => $this_exon, 00516 -ALIGN_SLICE => $self, 00517 -FROM_MAPPER => $from_mapper, 00518 -TO_MAPPER => $to_mapper, 00519 -ORIGINAL_RANK => $i + 1 00520 ); 00521 if ($this_align_exon) { 00522 push(@{$these_exons}, $this_align_exon); 00523 } elsif ($return_unmapped_exons) { 00524 $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( 00525 -EXON => $this_exon, 00526 -ORIGINAL_RANK => $i + 1 00527 ); 00528 push(@{$these_exons}, $this_align_exon) if ($this_align_exon); 00529 } 00530 } elsif ($return_unmapped_exons) { 00531 my $this_align_exon = new Bio::EnsEMBL::Compara::AlignSlice::Exon( 00532 -EXON => $this_exon, 00533 -ORIGINAL_RANK => $i + 1 00534 ); 00535 push(@{$these_exons}, $this_align_exon) if ($this_align_exon); 00536 } 00537 } 00538 if (grep {defined($_->start)} @$these_exons) { ## if any of the exons has been mapped 00539 my $new_transcript = $this_transcript->new( 00540 -dbID => $this_transcript->dbID, 00541 -adaptor => $this_transcript->adaptor, 00542 -stable_id => $this_transcript->stable_id, 00543 -version => $this_transcript->version, 00544 -external_db => $this_transcript->external_db, 00545 -external_name => $this_transcript->external_name, 00546 -external_status => $this_transcript->external_status, 00547 -display_xref => $this_transcript->display_xref, 00548 -analysis => $this_transcript->analysis, 00549 -status => $this_transcript->status, 00550 -biotype => $this_transcript->biotype, 00551 -exons => $these_exons, 00552 ); 00553 if ($this_transcript->translation) { 00554 $new_transcript->translation($this_transcript->translation); 00555 } 00556 push(@{$these_transcripts}, $new_transcript); 00557 } 00558 } 00559 if (!@$these_transcripts) { 00560 return undef; 00561 } 00562 ## Create a new gene object. This is needed in order to avoid 00563 ## troubles with cache. Attach mapped transcripts. 00564 my $mapped_gene = $gene->new( 00565 -ANALYSIS => $gene->analysis, 00566 -SEQNAME => $gene->seqname, 00567 -STABLE_ID => $gene->stable_id, 00568 -VERSION => $gene->version, 00569 -EXTERNAL_NAME => $gene->external_name, 00570 -TYPE => $gene->biotype, 00571 -BIOTYPE => $gene->biotype, 00572 -STATUS => $gene->status, 00573 -EXTERNAL_DB => $gene->external_db, 00574 -EXTERNAL_STATUS => $gene->external_status, 00575 -DISPLAY_XREF => $gene->display_xref, 00576 -DESCRIPTION => $gene->description, 00577 -TRANSCRIPTS => $these_transcripts 00578 ); 00579 00580 return $mapped_gene; 00581 } 00582 00583 00584 =head2 _compile_mapped_Genes 00585 00586 Arg[1] : listref of Bio::EnsEMBL::Gene $mapped_genes 00587 Arg[2] : int $max_repetition_length 00588 Arg[3] : int $max_gap_length 00589 Arg[4] : int $max_intron_length 00590 Arg[5] : int $strict_order_of_exon_pieces 00591 Arg[6] : int $strict_order_of_exons 00592 Example : my $compiled_genes = $align_slice->compile_mapped_Genes($mapped_genes, 00593 100, 1000, 100000, 1, 0); 00594 Description: This method compiles all the pieces of gene mapped before into a list 00595 of Bio::EnsEMBL::Gene objects. It tries to merge pieces of the same 00596 exon, link them according to their original transcript and finally 00597 group the transcripts in Bio::EnsEMBL::Gene objects. Merging and 00598 linking of Exons is done according to some rules that can be changed 00599 with the parameters. 00600 Parameters: They are sent as is to _separate_in_incompatible_sets_of_Exons() and 00601 _merge_Exons() methods. Please refer to them elsewhere in this 00602 document. 00603 Returntype : listref of Bio::EnsEMBL::Gene objects. (overrides $mapped_genes) 00604 Exceptions : none 00605 Caller : $object->methodname 00606 00607 =cut 00608 00609 sub _compile_mapped_Genes { 00610 my ($self, $mapped_genes, $max_repetition_length, $max_gap_length, $max_intron_length, 00611 $strict_order_of_exon_pieces, $strict_order_of_exons) = @_; 00612 00613 my $verbose = verbose(); 00614 verbose(0); # Avoid warnings when mapped transcripts are not in the same strand 00615 00616 ## Compile genes: group transcripts by gene->stable_id 00617 my $gene_by_gene_stable_id; 00618 my $transcripts_by_gene_stable_id; 00619 foreach my $mapped_gene (@$mapped_genes) { 00620 # print STDERR "1. GENE:$mapped_gene->{stable_id} ", join(" - ", map {$_->stable_id} @{$mapped_gene->get_all_Transcripts}), "\n"; 00621 if (!$gene_by_gene_stable_id->{$mapped_gene->stable_id}) { 00622 $gene_by_gene_stable_id->{$mapped_gene->stable_id} = $mapped_gene; 00623 } 00624 ## Group all the transcripts by gene stable_id... 00625 push(@{$transcripts_by_gene_stable_id->{$mapped_gene->stable_id}}, 00626 @{$mapped_gene->get_all_Transcripts}); 00627 } 00628 # $mapped_genes = [values %$genes_by_stable_id]; 00629 00630 ## Compile transcripts: group exons by transcript->stable_id 00631 while (my ($gene_stable_id, $set_of_transcripts) = each %$transcripts_by_gene_stable_id) { 00632 my $transcript_by_transcript_stable_id; 00633 my $exons_by_transcript_stable_id; 00634 foreach my $transcript (@{$set_of_transcripts}) { 00635 if (!$transcript_by_transcript_stable_id->{$transcript->stable_id}) { 00636 $transcript_by_transcript_stable_id->{$transcript->stable_id} = $transcript; 00637 } 00638 ## Group all the exons by the transcript stable_id... 00639 # print STDERR "0. TRANS:$transcript->{stable_id} ", join(" - ", map {$_->stable_id} @{$transcript->get_all_Exons}), "\n"; 00640 push(@{$exons_by_transcript_stable_id->{$transcript->stable_id}}, 00641 @{$transcript->get_all_Exons}); 00642 } 00643 00644 ## Try to merge splitted exons whenever possible 00645 while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) { 00646 # print STDERR "1. ", join(" - ", map {$_->stable_id} @{$set_of_exons}), "\n"; 00647 $exons_by_transcript_stable_id->{$transcript_stable_id} = _merge_Exons( 00648 $set_of_exons, 00649 $max_repetition_length, 00650 $max_gap_length, 00651 $strict_order_of_exon_pieces 00652 ); 00653 # print STDERR "2. ", join(" - ", map {$_->stable_id} @{$exons_by_transcript_stable_id->{$transcript_stable_id}}), "\n"; 00654 } 00655 my $all_transcripts; 00656 while (my ($transcript_stable_id, $set_of_exons) = each %$exons_by_transcript_stable_id) { 00657 # my $sets_of_compatible_exons = [$set_of_exons]; 00658 my $sets_of_compatible_exons = _separate_in_incompatible_sets_of_Exons( 00659 $set_of_exons, 00660 $max_repetition_length, 00661 $max_intron_length, 00662 $strict_order_of_exons 00663 ); 00664 00665 my $old_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id}; 00666 # # Save first set of exons in the 00667 # my $first_set_of_compatible_exons = shift(@{$sets_of_compatible_exons}); 00668 # my $first_transcript = $transcript_by_transcript_stable_id->{$transcript_stable_id}; 00669 # $first_transcript->flush_Exons(); 00670 # foreach my $exon (@$first_set_of_compatible_exons) { 00671 # $first_transcript->add_Exon($exon); 00672 # } 00673 # push(@$all_transcripts, $first_transcript); 00674 00675 foreach my $this_set_of_compatible_exons (@{$sets_of_compatible_exons}) { 00676 my $new_transcript = $old_transcript->new( 00677 -dbID => $old_transcript->dbID, 00678 -adaptor => $old_transcript->adaptor, 00679 -stable_id => $old_transcript->stable_id, 00680 -version => $old_transcript->version, 00681 -external_db => $old_transcript->external_db, 00682 -external_name => $old_transcript->external_name, 00683 -external_status => $old_transcript->external_status, 00684 -display_xref => $old_transcript->display_xref, 00685 -analysis => $old_transcript->analysis, 00686 -status => $old_transcript->status, 00687 -biotype => $old_transcript->biotype, 00688 -EXONS => $this_set_of_compatible_exons 00689 ); 00690 ## $old_transcript->translation is the original Bio::EnsEMBL::Translation! 00691 if ($old_transcript->translation) { 00692 my $start_exon; 00693 my $seq_start; 00694 my $end_exon; 00695 my $seq_end; 00696 my $all_start_codon_mappings; 00697 my $all_end_codon_mappings; 00698 my @sorted_exons; 00699 if ($new_transcript->strand == 1) { 00700 @sorted_exons = @{$new_transcript->get_all_Exons}; 00701 } else { 00702 @sorted_exons = reverse @{$new_transcript->get_all_Exons}; 00703 } 00704 my $coding = 0; 00705 foreach my $this_exon (@sorted_exons) { 00706 if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) { 00707 $coding = 1; 00708 } 00709 if ($coding and $this_exon->start) { 00710 if (!$start_exon) { 00711 if ($old_transcript->translation->start_Exon->stable_id eq $this_exon->stable_id) { 00712 if ($old_transcript->translation->start_Exon->strand == 1) { 00713 $all_start_codon_mappings = $self->map_original_Slice( 00714 $this_exon->exon->slice->sub_Slice( 00715 $old_transcript->coding_region_start, 00716 $old_transcript->coding_region_start + 2)); 00717 } else { 00718 $all_start_codon_mappings = $self->map_original_Slice( 00719 $this_exon->exon->slice->sub_Slice( 00720 $old_transcript->coding_region_end-2, 00721 $old_transcript->coding_region_end)); 00722 } 00723 $seq_start = _map_position_using_cigar_line($this_exon->cigar_line, 00724 $old_transcript->translation->start, $this_exon->exon->length, 1); 00725 if ($seq_start <= $this_exon->length) { 00726 $start_exon = $this_exon; 00727 } 00728 } else { 00729 $seq_start = 1; 00730 $start_exon = $this_exon; 00731 } 00732 } 00733 if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) { 00734 if ($old_transcript->translation->end_Exon->strand == 1) { 00735 $all_end_codon_mappings = $self->map_original_Slice( 00736 $this_exon->exon->slice->sub_Slice($old_transcript->coding_region_end - 2,$old_transcript->coding_region_end)); 00737 } else { 00738 $all_end_codon_mappings = $self->map_original_Slice( 00739 $this_exon->exon->slice->sub_Slice( 00740 $old_transcript->coding_region_start, 00741 $old_transcript->coding_region_start + 2)); 00742 } 00743 $seq_end = _map_position_using_cigar_line($this_exon->cigar_line, 00744 $old_transcript->translation->end, $this_exon->exon->length, 0); 00745 $seq_end = $this_exon->length if ($seq_end > $this_exon->length); 00746 if ($seq_end >= 1) { 00747 $end_exon = $this_exon; 00748 } else { 00749 ## Set $seq_end to previous value 00750 $seq_end = $end_exon->length if ($end_exon); 00751 } 00752 } else { 00753 $end_exon = $this_exon; 00754 $seq_end = $end_exon->length; 00755 } 00756 } 00757 if ($old_transcript->translation->end_Exon->stable_id eq $this_exon->stable_id) { 00758 $coding = 0; 00759 last; 00760 } 00761 } 00762 if ($start_exon and $end_exon) { 00763 $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation( 00764 -start_exon => $start_exon, 00765 -end_exon => $end_exon, 00766 -seq_start => $seq_start, 00767 -seq_end => $seq_end, 00768 -stable_id => $old_transcript->translation->stable_id, 00769 -version => $old_transcript->translation->version, 00770 -created_date => ($old_transcript->translation->created_date or undef), 00771 -modified_date =>$old_transcript->translation->modified_date, 00772 )); 00773 } else { 00774 ## Translation cannot be mapped. In this case we should return a translation outside 00775 ## of the coordinate system (negative coding_region_start and coding_region_end). 00776 $new_transcript->translation(new Bio::EnsEMBL::Compara::AlignSlice::Translation( 00777 -stable_id => $old_transcript->translation->stable_id, 00778 -version => $old_transcript->translation->version, 00779 -created_date => ($old_transcript->translation->created_date or undef), 00780 -modified_date =>$old_transcript->translation->modified_date, 00781 )); 00782 $new_transcript->coding_region_start(-10000000000); 00783 $new_transcript->coding_region_end(-10000000000); 00784 } 00785 00786 $new_transcript->translation->all_start_codon_mappings($all_start_codon_mappings); 00787 $new_transcript->translation->all_end_codon_mappings($all_end_codon_mappings); 00788 } 00789 push(@$all_transcripts, $new_transcript); 00790 } 00791 } 00792 00793 $gene_by_gene_stable_id->{$gene_stable_id}->{'_transcript_array'} = $all_transcripts; 00794 # adjust start, end, strand and slice 00795 $gene_by_gene_stable_id->{$gene_stable_id}->recalculate_coordinates; 00796 } 00797 verbose($verbose); 00798 00799 return [values %$gene_by_gene_stable_id]; 00800 } 00801 00802 00803 =head2 _map_position_using_cigar_line 00804 00805 Arg[1] : string $cigar_line 00806 Arg[2] : int $original_position 00807 Arg[3] : int $original_length 00808 Arg[4] : bool $start 00809 Example : my $mapped_start_position = _map_position_using_cigar_line( 00810 "16M4I", 10, 20, 1); 00811 Example : my $mapped_end_position = _map_position_using_cigar_line( 00812 "16M4I", 10, 20, 1); 00813 Description: This method is used to locate the start or the end position of 00814 a Translation on the Bio::EnsEMBL::Compara::AlignSlice::Exon object 00815 using the cigar_line of the object. As the Bio::EnsEMBL::Compara:: 00816 AlignSlice::Exon object may result from the fusion of the same 00817 Bio::EnsEMBL::Exon object several times, this method returns the 00818 best mapping among all the available possibilities. When the original 00819 position maps on an insertion in the Bio::EnsEMBL::Compara:: 00820 AlignSlice::Exon object, the resulting mapped start position is the 00821 next one to the latest mapped one. 00822 Returntype : int $mapped_position 00823 Exceptions : returns 0 when the end position cannot be mapped. 00824 Exceptions : returns a number larger than the length of the Bio::EnsEMBL::Compara:: 00825 AlignSlice::Exon when the start position cannot be mapped. 00826 00827 =cut 00828 00829 sub _map_position_using_cigar_line { 00830 my ($cigar_line, $original_pos, $original_length, $start) = @_; 00831 my $mapped_pos = 0; 00832 00833 00834 my @cigar = grep {$_} split(/(\d*[GIDM])/, $cigar_line); 00835 my $original_count = 0; 00836 my $mapped_count = 0; 00837 my $pending_count = 0; 00838 00839 foreach my $cigar_piece (@cigar) { 00840 my ($num, $mode) = $cigar_piece =~ /(\d*)([GIDM])/; 00841 $num = 1 if ($num eq ""); 00842 if ($mode eq "D" or $mode eq "G") { 00843 $pending_count += $num; 00844 } elsif ($mode eq "I") { 00845 $original_count += $num; 00846 } elsif ($mode eq "M") { 00847 if ($pending_count) { 00848 $mapped_count += $pending_count; 00849 $pending_count = 0; 00850 } 00851 if ($original_count + $num < $original_pos) { 00852 $original_count += $num; 00853 $mapped_count += $num; 00854 } else { 00855 $mapped_count += ($original_pos - $original_count); 00856 $original_count += ($original_pos - $original_count); 00857 $mapped_pos = $mapped_count; 00858 last; 00859 } 00860 } 00861 if ($original_count >= $original_pos and $mode eq "I") { 00862 ## This position matches in an insertion. 00863 ## If we are mapping the end position we will prefer the first match in an insertion 00864 ## rather than the following ones (except if the following matches in a "M" region). 00865 ## On the other hand, if we are mapping the start position, we will prefer the last 00866 ## match in an insertion 00867 if ($start or !$mapped_pos) { 00868 if ($start and $pending_count) { 00869 ## $pending_count corresponds to deletion and needs to be added when mapping the 00870 ## start position (we will want the position AFTER the deletion) but not when 00871 ## mapping the end position (we will want the position BEFORE the deletion) 00872 $mapped_count += $pending_count; 00873 $pending_count = 0; 00874 } 00875 $mapped_pos = $mapped_count; 00876 ## When matching the start position, add 1 as the mapped start will be just after 00877 ## this position. If this position appears after the end of the mapped exon, this 00878 ## means we fail to map the start position 00879 $mapped_pos ++ if ($start); 00880 } 00881 ## Try to look further. A second match may happen if this exon is the result of a fusion process 00882 $original_pos += $original_length; 00883 } 00884 } 00885 00886 return $mapped_pos; 00887 } 00888 00889 00890 =head2 _merge_Exons 00891 00892 Arg[1] : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons 00893 Arg[2] : int $max_repetition_length 00894 Arg[3] : int $max_gap_length 00895 Arg[4] : int $strict_order_of_exon_pieces 00896 Example : my $merged_exons = _merge_Exons($exons_to_be_merged, 100, 1000, 1); 00897 Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon objects and 00898 tries to merge them according to exon stable_id and some rules that can 00899 be tunned using some optional parameters. This method can overwrite some 00900 of the exon in the $set_of_exons. 00901 Parameters: MAX_REPETITION_LENGTH. In principle you want to merge together pieces 00902 of an exon which do not overlap (the beginning and the end of the 00903 exon). With this flag you can to set up what amount of the original 00904 exon is allowed on two aligned exons to be merged. 00905 MAX_GAP_LENGTH. If the distance between two pieces of exons in the 00906 aligned slice is larger than this parameter, they will not be 00907 merged. Setting this parameter to -1 will 00908 avoid any merging event. 00909 STRICT_ORDER_OF_EXON_PIECES. This flag allows you to decide whether two 00910 pieces of an exon should be merged or not if they are not in the 00911 right order, for instance if the end of the original exon will 00912 appear before the start on the merged exon. 00913 Returntype : lisref of Bio::EnsEMBL::Compara::AlignSlice::Exon objects. 00914 Exceptions : none 00915 Caller : methodname 00916 00917 =cut 00918 00919 sub _merge_Exons { 00920 my ($set_of_exons, $max_repetition_length, $max_gap_length, $strict_order_of_exon_pieces) = @_; 00921 my $merged_exons = []; # returned value 00922 00923 my $exon_by_stable_id; 00924 # Group exons by stable_id 00925 foreach my $exon (@$set_of_exons) { 00926 push(@{$exon_by_stable_id->{$exon->stable_id}}, $exon); 00927 } 00928 00929 # Merge compatible pieces of exons 00930 foreach my $these_exons (values %$exon_by_stable_id) { 00931 00932 if (!grep {defined($_->start)} @$these_exons) { 00933 push(@$merged_exons, $these_exons->[0]); 00934 next; 00935 } 00936 00937 # Sort exons according to 00938 $these_exons= [sort {($a->start or 0) <=> ($b->start or 0)} @$these_exons]; 00939 00940 while (my $first_exon = shift @$these_exons) { 00941 if (!defined($first_exon->start)) { 00942 # push(@$merged_exons, $first_exon); 00943 next; 00944 } 00945 for (my $count=0; $count<@$these_exons; $count++) { 00946 my $second_exon = $these_exons->[$count]; 00947 # Check strands 00948 next if ($first_exon->strand != $second_exon->strand); 00949 00950 my $gap_between_pieces_of_exon = $second_exon->start - $first_exon->end - 1; 00951 # Check whether both mapped parts do not overlap 00952 next if ($gap_between_pieces_of_exon < 0); 00953 00954 # Check maximum gap between both pieces of exon 00955 next if ($max_gap_length and $gap_between_pieces_of_exon > $max_gap_length); 00956 00957 # Check whether both mapped parts are in the right order 00958 if ($strict_order_of_exon_pieces) { 00959 if ($first_exon->strand == 1) { 00960 next if ($first_exon->get_aligned_start > $second_exon->get_aligned_start); 00961 } else { 00962 next if ($first_exon->get_aligned_end < $second_exon->get_aligned_end); 00963 } 00964 } 00965 00966 # Check maximum overlapping within original exon, i.e. how much of the 00967 # same exon can be mapped twice 00968 my $repetition_length; 00969 if ($first_exon->strand == 1) { 00970 $repetition_length = $first_exon->get_aligned_end - $second_exon->get_aligned_start + 1; 00971 } else { 00972 $repetition_length = $second_exon->get_aligned_end - $first_exon->get_aligned_start + 1; 00973 } 00974 next if ($repetition_length > $max_repetition_length); 00975 00976 ## Merge exons!! 00977 $second_exon = splice(@$these_exons, $count, 1); # remove exon from the list 00978 $count-- if (@$these_exons); 00979 $first_exon->end($second_exon->end); 00980 if ($first_exon->strand == 1) { 00981 $first_exon->append_Exon($second_exon, $gap_between_pieces_of_exon); 00982 } else { 00983 $first_exon->prepend_Exon($second_exon, $gap_between_pieces_of_exon); 00984 } 00985 } 00986 push(@$merged_exons, $first_exon); 00987 } 00988 } 00989 00990 return $merged_exons; 00991 } 00992 00993 00994 =head2 _separate_in_incompatible_sets_of_Exons 00995 00996 Arg[1] : listref of Bio::EnsEMBL::Compara::AlignSlice::Exon $set_of_exons 00997 Arg[2] : int $max_repetition_length 00998 Arg[3] : int $max_intron_length 00999 Arg[4] : int $strict_order_of_exons 01000 Example : my $sets_of_exons = _separate_in_incompatible_sets_of_Exons( 01001 $set_of_exons, 100, 100000, 0); 01002 Description: Takes a list of Bio::EnsEMBL::Compara::AlignSlice::Exon and separate 01003 them in sets of comaptible exons. Compatibility is defined taking into 01004 account 5 parameters: 01005 - exons must be in the same strand 01006 - exons cannot overlap on the align_slice 01007 - distance between exons cannot be larger than MAX_INTRON_LENGTH 01008 - two exons with the same stable_id can belong to the same transcript 01009 only if they represent diferent parts of the original exon. Some 01010 overlapping is allowed (see MAX_REPETITION_LENGTH parameter). 01011 - exons must be in the same order as in the original transcript 01012 if the STRICT_ORDER_OF_EXONS parameter is true. 01013 Parameters: MAX_REPETITION_LENGTH. In principle you want to link together pieces 01014 of an exon which do not overlap (the beginning and the end of the 01015 exon). With this flag you can to set up what amount of the original 01016 exon is allowed on two aligned exons to be linked. 01017 MAX_INTRON_LENGTH. If the distance between two exons in the aligned slice 01018 is larger than this parameter, they will not be linked. 01019 STRICT_ORDER_OF_EXONS. This flag allows you to decide whether two 01020 exons should be linked or not if they are not in the 01021 original order. 01022 Returntype : listref of lisrefs of Bio::EnsEMBL::Compara::AlignSlice::Exon objects. 01023 Exceptions : none 01024 Caller : methodname 01025 01026 =cut 01027 01028 sub _separate_in_incompatible_sets_of_Exons { 01029 my ($set_of_exons, $max_repetition_length, $max_intron_length, $strict_order_of_exons) = @_; 01030 my $sets_of_exons = []; 01031 01032 my $last_exon; 01033 my $this_set_of_exons = []; 01034 01035 ################################################################### 01036 ## 01037 ## Return all the exons by strand. 01038 ## - A given exon can be mapped partially on both strands! 01039 ## 01040 if (grep {$_->strand and $_->strand == 1} @$set_of_exons and grep {$_->strand and $_->strand == -1} @$set_of_exons) { 01041 ## Keep track of the exons that have been mapped by strand 01042 my $indexes; 01043 foreach my $exon (@$set_of_exons) { 01044 if ($exon->strand) { 01045 $indexes->{$exon->strand}->{$exon->original_rank} = 1; 01046 } 01047 } 01048 01049 my $forward_stranded_set_of_exons; # set of exons to be returned in the forward strand 01050 my $reverse_stranded_set_of_exons; # set of exons to be returned in the reverse strand 01051 foreach my $exon (@$set_of_exons) { 01052 my $new_exon = $exon->copy(); 01053 if ($exon->strand) { 01054 if ($exon->strand == 1) { 01055 if ($indexes->{"-1"}->{$exon->original_rank}) { 01056 ## This exon has been mapped on the reverse strand as well 01057 push(@$forward_stranded_set_of_exons, $exon); 01058 next; 01059 } else { 01060 ## Undef coordinates of the new exon before adding it to the set 01061 ## of reverse stranded exons 01062 $new_exon->start(undef); 01063 $new_exon->end(undef); 01064 $new_exon->strand(undef); 01065 } 01066 } else { 01067 if ($indexes->{"1"}->{$exon->original_rank}) { 01068 ## This exon has been mapped on the forward strand as well 01069 push(@$reverse_stranded_set_of_exons, $new_exon); 01070 next; 01071 } else { 01072 ## Undef coordinates of the new exon before adding it to the set 01073 ## of forward stranded exons 01074 $exon->start(undef); 01075 $exon->end(undef); 01076 $exon->strand(undef); 01077 } 01078 } 01079 } 01080 push(@$forward_stranded_set_of_exons, $exon); 01081 push(@$reverse_stranded_set_of_exons, $new_exon); 01082 } 01083 push(@$sets_of_exons, @{_separate_in_incompatible_sets_of_Exons($reverse_stranded_set_of_exons, 01084 $max_repetition_length, $max_intron_length, $strict_order_of_exons)}); 01085 $set_of_exons = $forward_stranded_set_of_exons; 01086 } 01087 ## 01088 ################################################################### 01089 01090 my $transcript_strand = 1; 01091 if (grep {$_->strand and $_->strand == -1} @$set_of_exons) { 01092 $transcript_strand = -1; 01093 } 01094 foreach my $this_exon (_sort_Exons(@$set_of_exons)) { 01095 if (!defined($this_exon->start)) { 01096 if ($transcript_strand == -1) { 01097 ## Insert this exon in the right place 01098 my $inserted = 0; 01099 for (my $i=0; $i<@$this_set_of_exons; $i++) { 01100 if ($this_set_of_exons->[$i]->original_rank == $this_exon->original_rank - 1) { 01101 splice(@$this_set_of_exons, $i, 0, $this_exon); 01102 $inserted = 1; 01103 last; 01104 } 01105 } 01106 if (!$inserted) { 01107 push(@$this_set_of_exons, $this_exon); 01108 } 01109 } else { 01110 ## Append this exon 01111 push(@$this_set_of_exons, $this_exon); 01112 } 01113 next; 01114 } 01115 if ($last_exon) { 01116 # Calculate intron length 01117 my $intron_length = $this_exon->start - $last_exon->end - 1; 01118 # Calculate whether both mapped parts are in the right order 01119 my $order_is_ok = 1; 01120 if ($strict_order_of_exons) { 01121 if ($this_exon->strand == $this_exon->exon->strand) { 01122 $order_is_ok = 0 if ($this_exon->exon->start < $last_exon->exon->start); 01123 } else { 01124 $order_is_ok = 0 if ($this_exon->exon->start > $last_exon->exon->start); 01125 } 01126 } 01127 my $repetition_length = 0; 01128 if ($last_exon->stable_id eq $this_exon->stable_id) { 01129 if ($this_exon->strand == 1) { 01130 $repetition_length = $last_exon->get_aligned_end - $this_exon->get_aligned_start + 1; 01131 } else { 01132 $repetition_length = $this_exon->get_aligned_end - $last_exon->get_aligned_start + 1; 01133 } 01134 } 01135 01136 if (($last_exon->strand != $this_exon->strand) or 01137 ($intron_length < 0) or 01138 ($max_intron_length and ($intron_length > $max_intron_length)) or 01139 (!$order_is_ok) or 01140 ($repetition_length > $max_repetition_length)) { 01141 # this_exon and last_exon should be in separate sets. Save current 01142 # set_of_exons and start a new set_of_exons 01143 push(@$sets_of_exons, $this_set_of_exons); 01144 $this_set_of_exons = []; 01145 } 01146 } 01147 push(@$this_set_of_exons, $this_exon); 01148 $last_exon = $this_exon; 01149 } 01150 push(@$sets_of_exons, $this_set_of_exons); 01151 01152 return $sets_of_exons; 01153 } 01154 01155 sub _sort_Exons { 01156 my @exons = @_; 01157 my @sorted_exons = (); 01158 01159 my $transcript_strand = 1; 01160 if (grep {$_->strand and $_->strand == -1} @exons) { 01161 $transcript_strand = -1; 01162 } 01163 01164 my @mapped_exons = grep {defined($_->start)} @exons; 01165 my @unmapped_exons = grep {!defined($_->start)} @exons; 01166 01167 @mapped_exons = sort {$a->start <=> $b->start} @mapped_exons; 01168 my $sorted_exons_by_rank; 01169 foreach my $mapped_exon (@mapped_exons) { 01170 my $rank = $mapped_exon->original_rank; 01171 # the same exon can be mapped twice! 01172 push(@{$sorted_exons_by_rank->{$rank}}, $mapped_exon); 01173 } 01174 01175 @unmapped_exons = sort {$a->original_rank <=> $b->original_rank} @unmapped_exons; 01176 foreach my $unmapped_exon (@unmapped_exons) { 01177 my $rank = $unmapped_exon->original_rank; 01178 do { 01179 if (defined($sorted_exons_by_rank->{$rank})) { 01180 push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon); 01181 $rank = 0; 01182 } elsif ($rank == 1) { 01183 if ($transcript_strand == 1) { 01184 push(@{$sorted_exons_by_rank->{$rank}}, $unmapped_exon); 01185 } else { 01186 $rank++ while(!defined($sorted_exons_by_rank->{$rank})); 01187 splice(@{$sorted_exons_by_rank->{$rank}}, 1, 0, $unmapped_exon); 01188 } 01189 $rank = 0; 01190 } 01191 $rank--; 01192 } while ($rank > 0); 01193 } 01194 01195 foreach my $exons (sort { 01196 if (defined($a->[0]->start) and defined($b->[0]->start)) { 01197 $a->[0]->start <=> $b->[0]->start; 01198 } else { 01199 $a->[0]->original_rank <=> $b->[0]->original_rank; 01200 } 01201 } values %$sorted_exons_by_rank) { 01202 push(@sorted_exons, @{$exons}); 01203 } 01204 01205 return @sorted_exons; 01206 } 01207 01208 # OVERWRITEN METHODS FROM Bio::EnsEMBL::Slice module 01209 # 01210 # WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING 01211 # 01212 # All the methods that need access to the database (the adaptor) and which 01213 # are not listed here are not supported!! 01214 # 01215 # WARNING - WARNING - WARNING - WARNING - WARNING - WARNING - WARNING 01216 # 01217 =head2 invert (not supported) 01218 01219 Maybe at some point... 01220 01221 This method returns undef 01222 01223 =cut 01224 01225 sub invert { 01226 warning("Cannot invert a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); 01227 return undef; 01228 } 01229 01230 01231 =head2 sub_Slice 01232 01233 Arg 1 : int $start 01234 Arg 2 : int $end 01235 Arge [3] : int $strand 01236 Example : none 01237 Description: Makes another Slice that covers only part of this slice 01238 If a slice is requested which lies outside of the boundaries 01239 of this function will return undef. This means that 01240 behaviour will be consistant whether or not the slice is 01241 attached to the database (i.e. if there is attached sequence 01242 to the slice). Alternatively the expand() method or the 01243 SliceAdaptor::fetch_by_region method can be used instead. 01244 Returntype : Bio::EnsEMBL::Slice or undef if arguments are wrong 01245 Exceptions : return undef if $start and $end define a region outside 01246 of actual Slice. 01247 Caller : general 01248 Status : Testing 01249 01250 =cut 01251 01252 sub sub_Slice { 01253 my ( $self, $start, $end, $strand ) = @_; 01254 01255 if( $start < 1 || $start > $self->{'end'} ) { 01256 # throw( "start argument not valid" ); 01257 return undef; 01258 } 01259 01260 if( $end < $start || $end > $self->{'end'} ) { 01261 # throw( "end argument not valid" ) 01262 return undef; 01263 } 01264 01265 my ($new_start, $new_end, $new_strand); 01266 if (!defined($strand)) { 01267 $strand = 1; 01268 } 01269 01270 if( $self->{'strand'} == 1 ) { 01271 $new_start = $self->{'start'} + $start - 1; 01272 $new_end = $self->{'start'} + $end - 1; 01273 $new_strand = $strand; 01274 } else { 01275 $new_start = $self->{'end'} - $end + 1;; 01276 $new_end = $self->{'end'} - $start + 1; 01277 $new_strand = -$strand; 01278 } 01279 01280 #fastest way to copy a slice is to do a shallow hash copy 01281 my %new_slice = %$self; 01282 01283 ## Delete cached genes 01284 foreach my $key (grep {/^key_/} keys %new_slice) { 01285 delete($new_slice{$key}); 01286 } 01287 $new_slice{'seq'} = undef; 01288 $new_slice{'start'} = int($new_start); 01289 $new_slice{'end'} = int($new_end); 01290 $new_slice{'strand'} = $new_strand; 01291 01292 return bless \%new_slice, ref($self); 01293 } 01294 01295 =head2 seq 01296 01297 Arg [1] : none 01298 Example : print "SEQUENCE = ", $slice->seq(); 01299 Description: Returns the sequence of the region represented by this 01300 slice formatted as a string. 01301 This Slice is made of several Bio::EnsEMBL::Slices mapped 01302 on it with gaps inside and regions with no matching 01303 sequence. The resulting string might contain gaps and/or 01304 dots as padding characters. If several Slices map on the 01305 same positions, the last one will override the positions. 01306 Returntype : string 01307 Exceptions : none 01308 Caller : general 01309 01310 =cut 01311 01312 sub seq { 01313 my $self = shift; 01314 my $start = 1; 01315 my $end = $self->length; 01316 my $strand = 1; # strand is reversed in the subseq method if needed 01317 01318 return $self->{seq} if (defined($self->{seq})); 01319 01320 $self->{seq} = $self->subseq($start, $end, $strand); 01321 01322 return $self->{seq}; 01323 } 01324 01325 01326 =head2 subseq 01327 01328 Arg [1] : int $startBasePair 01329 relative to start of slice, which is 1. 01330 Arg [2] : int $endBasePair 01331 relative to start of slice. 01332 Arg [3] : (optional) int $strand 01333 The strand of the slice to obtain sequence from. Default 01334 value is 1. 01335 Description: returns string of dna sequence 01336 This Slice is made of several Bio::EnsEMBL::Slices mapped 01337 on it with gaps inside and regions with no matching 01338 sequence. The resulting string might contain gaps and/or 01339 dots as padding characters. If several Slices map on the 01340 same positions, the last one will override the positions. 01341 Returntype : txt 01342 Exceptions : end should be at least as big as start 01343 strand must be set 01344 Caller : general 01345 01346 =cut 01347 01348 sub subseq { 01349 my ($self, $start, $end, $strand) = @_; 01350 01351 $start = 1 if (!defined($start)); 01352 $end ||= $self->length; 01353 $strand ||= 1; 01354 01355 ## Fix coordinates (needed for sub_Slices) 01356 if ($self->strand == -1) { 01357 $strand = -$strand; 01358 my $aux = $start; 01359 $start = $self->start + ($self->length - $end); 01360 $end = $self->start + ($self->length - $aux); 01361 } else { 01362 $start += $self->start - 1; 01363 $end += $self->start - 1; 01364 } 01365 01366 my $length = ($end - $start + 1); 01367 my $seq = "." x $length; 01368 01369 foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs()}) { 01370 my $this_slice = $pair->{slice}; 01371 my $mapper = $pair->{mapper}; 01372 my $slice_start = $pair->{start}; 01373 my $slice_end = $pair->{end}; 01374 next if ($slice_start > $end or $slice_end < $start); 01375 my $this_slice_seq = $this_slice->seq(); 01376 01377 # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them 01378 $slice_start -= $start; # $slice_start is now in subseq coordinates 01379 $slice_start = 0 if ($slice_start < 0); 01380 $slice_end -= $start; # $slice_end is now in subseq coordinates 01381 $slice_end = $length if ($slice_end > $length); 01382 01383 # Invert start and end for the reverse strand 01384 if ($strand == -1) { 01385 my $aux = $slice_end; 01386 $slice_end = $length - $slice_start; 01387 $slice_start = $length - $aux; 01388 } 01389 01390 my @sequence_coords = $mapper->map_coordinates( 01391 'alignment', 01392 $start, 01393 $end, 01394 $strand, 01395 'alignment' 01396 ); 01397 01398 ##################### 01399 # $this_pos refers to the starting position of the subseq if requesting the forward strand 01400 # or the ending position of the subseq if the reverse strand has been requested: 01401 # 01402 # FORWARD STRAND (1) 01403 # $this_pos = 0 01404 # | 01405 # ----------------------------------------------------------------------> 01406 # <---------------------------------------------------------------------- 01407 # 01408 # REVERSE STRAND (-1) 01409 # ----------------------------------------------------------------------> 01410 # <---------------------------------------------------------------------- 01411 # | 01412 # $this_pos = 0 01413 # 01414 # All remaining coordinates work in the same way except the start and end position 01415 # of the gaps which correspond to the coordinates in the original Slice... 01416 # 01417 my $this_pos = 0; 01418 foreach my $sequence_coord (@sequence_coords) { 01419 ## $sequence_coord refer to genomic_align (a slice in the [+] strand) 01420 01421 if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 01422 my $subseq; 01423 if ($this_slice->strand == 1) { 01424 $subseq = substr($this_slice_seq, 01425 $sequence_coord->start - $this_slice->start, 01426 $sequence_coord->length); 01427 if ($sequence_coord->strand * $pair->{strand} == -1) { 01428 $subseq = reverse($subseq); 01429 $subseq =~ tr/ACGTacgt/TGCAtgca/; 01430 } 01431 } else { 01432 $subseq = substr($this_slice_seq, 01433 $this_slice->end - $sequence_coord->end, 01434 $sequence_coord->length); 01435 if ($sequence_coord->strand * $pair->{strand} == -1) { 01436 $subseq = reverse($subseq); 01437 $subseq =~ tr/ACGTacgt/TGCAtgca/; 01438 } 01439 } 01440 substr($seq, $this_pos, $sequence_coord->length, $subseq); 01441 01442 } else { ## Gap or sequence outside of any alignment 01443 ############ 01444 # Get the start and end positions of this gap in "subseq" coordinates 01445 my $this_original_start = $sequence_coord->start - $start; 01446 my $this_original_end = $sequence_coord->end - $start; 01447 if ($strand == -1) { 01448 my $aux = $this_original_end; 01449 $this_original_end = $length - $this_original_start; 01450 $this_original_start = $length - $aux; 01451 } 01452 if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) { 01453 ## This is a gap 01454 my $start_position_of_gap_seq = $this_pos; 01455 my $end_position_of_gap_seq = $this_pos + $sequence_coord->length; 01456 if ($start_position_of_gap_seq < $slice_start) { 01457 $start_position_of_gap_seq = $slice_start; 01458 } 01459 if ($end_position_of_gap_seq > $slice_end + 1) { 01460 $end_position_of_gap_seq = $slice_end + 1; 01461 } 01462 my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq; 01463 substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq) 01464 if ($length_of_gap_seq > 0); 01465 } 01466 } 01467 $this_pos += $sequence_coord->length; 01468 } 01469 } 01470 return $seq; 01471 } 01472 01473 01474 =head2 get_cigar_line 01475 01476 Arg : -none- 01477 Description: returns a cigar_line describing the gaps in the mapped sequence 01478 This Slice is made of several Bio::EnsEMBL::Slices mapped 01479 on it with gaps inside and regions with no matching 01480 sequence. The resulting cigar line corresponds to the mapping 01481 of all the nucleotides that can be mapepd. If several Slices map 01482 on the same positions, the behaviour is undefined. 01483 The cigar_line includes 3 types of regions: M for matches/mismatches, 01484 D for alignment gaps (formerly known as deletions) and G for 01485 gaps between alignment blocks. 01486 Returntype : txt 01487 Exceptions : 01488 Caller : general 01489 01490 =cut 01491 01492 sub get_cigar_line { 01493 my ($self, $start, $end, $strand) = @_; 01494 01495 $start = $self->start; 01496 $end = $self->end; 01497 $strand ||= 1; 01498 01499 ## Fix coordinates (needed for sub_Slices) 01500 if ($self->strand == -1) { 01501 $strand = -$strand; 01502 my $aux = $start; 01503 $start = $self->start + ($self->length - $end); 01504 $end = $self->start + ($self->length - $aux); 01505 } else { 01506 $start += $self->start - 1; 01507 $end += $self->start - 1; 01508 } 01509 01510 my $length = ($end - $start + 1); 01511 my $seq = "." x $length; 01512 foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs("get_gap_slices")}) { 01513 my $this_slice = $pair->{slice}; 01514 my $mapper = $pair->{mapper}; 01515 my $slice_start = $pair->{start}; 01516 my $slice_end = $pair->{end}; 01517 next if ($slice_start > $end or $slice_end < $start); 01518 01519 # Set slice_start and slice_end in "subseq" coordinates (0 based, for compliance wiht substr() perl func) and trim them 01520 $slice_start -= $start; # $slice_start is now in subseq coordinates 01521 $slice_start = 0 if ($slice_start < 0); 01522 $slice_end -= $start; # $slice_end is now in subseq coordinates 01523 $slice_end = $length if ($slice_end > $length); 01524 01525 # Invert start and end for the reverse strand 01526 if ($strand == -1) { 01527 my $aux = $slice_end; 01528 $slice_end = $length - $slice_start; 01529 $slice_start = $length - $aux; 01530 } 01531 01532 my @sequence_coords = $mapper->map_coordinates( 01533 'alignment', 01534 $start, 01535 $end, 01536 $strand, 01537 'alignment' 01538 ); 01539 ##################### 01540 # $this_pos refers to the starting position of the subseq if requesting the forward strand 01541 # or the ending position of the subseq if the reverse strand has been requested: 01542 # 01543 # FORWARD STRAND (1) 01544 # $this_pos = 0 01545 # | 01546 # ----------------------------------------------------------------------> 01547 # <---------------------------------------------------------------------- 01548 # 01549 # REVERSE STRAND (-1) 01550 # ----------------------------------------------------------------------> 01551 # <---------------------------------------------------------------------- 01552 # | 01553 # $this_pos = 0 01554 # 01555 # All remaining coordinates work in the same way except the start and end position 01556 # of the gaps which correspond to the coordinates in the original Slice... 01557 # 01558 my $this_pos = 0; 01559 foreach my $sequence_coord (@sequence_coords) { 01560 ## $sequence_coord refer to genomic_align (a slice in the [+] strand) 01561 01562 if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 01563 my $subseq = "N" x ($sequence_coord->length); 01564 substr($seq, $this_pos, $sequence_coord->length, $subseq); 01565 01566 } else { ## Gap or sequence outside of any alignment 01567 ############ 01568 # Get the start and end positions of this gap in "subseq" coordinates 01569 my $this_original_start = $sequence_coord->start - $start; 01570 my $this_original_end = $sequence_coord->end - $start; 01571 if ($strand == -1) { 01572 my $aux = $this_original_end; 01573 $this_original_end = $length - $this_original_start; 01574 $this_original_start = $length - $aux; 01575 } 01576 if ($this_original_start <= $slice_end and $this_original_end >= $slice_start) { 01577 ## This is a gap 01578 my $start_position_of_gap_seq = $this_pos; 01579 my $end_position_of_gap_seq = $this_pos + $sequence_coord->length; 01580 if ($start_position_of_gap_seq < $slice_start) { 01581 $start_position_of_gap_seq = $slice_start; 01582 } 01583 if ($end_position_of_gap_seq > $slice_end + 1) { 01584 $end_position_of_gap_seq = $slice_end + 1; 01585 } 01586 my $length_of_gap_seq = $end_position_of_gap_seq - $start_position_of_gap_seq; 01587 substr($seq, $start_position_of_gap_seq, $length_of_gap_seq, "-" x $length_of_gap_seq) 01588 if ($length_of_gap_seq > 0); 01589 } 01590 } 01591 $this_pos += $sequence_coord->length; 01592 } 01593 } 01594 my $cigar_line = ""; 01595 01596 my @pieces = split(/(\-+|\.+)/, $seq); 01597 foreach my $piece (@pieces) { 01598 my $mode; 01599 if ($piece =~ /\./) { 01600 $mode = "G"; # D for gaps (deletions) 01601 } elsif ($piece =~ /\-/) { 01602 $mode = "D"; # D for gaps (deletions) 01603 } else { 01604 $mode = "M"; # M for matches/mismatches 01605 } 01606 if (length($piece) == 1) { 01607 $cigar_line .= $mode; 01608 } elsif (length($piece) > 1) { #length can be 0 if the sequence starts with a gap 01609 $cigar_line .= length($piece).$mode; 01610 } 01611 } 01612 01613 return $cigar_line; 01614 } 01615 01616 01617 =head2 get_all_underlying_Slices 01618 01619 Arg [1] : int $startBasePair 01620 relative to start of slice, which is 1. 01621 Arg [2] : int $endBasePair 01622 relative to start of slice. 01623 Arg [3] : (optional) int $strand 01624 The strand of the slice to obtain sequence from. Default 01625 value is 1. 01626 Description: This Slice is made of several Bio::EnsEMBL::Slices mapped 01627 on it with gaps inside and regions with no matching 01628 sequence. This method returns these Slices (or part of 01629 them) with the original coordinates and the gapped 01630 sequence attached to them. Additionally, extra Slices 01631 could be returned in order to fill in gaps between 01632 underlying Slices. 01633 Returntype : listref of Bio::EnsEMBL::Slice objects 01634 Exceptions : end should be at least as big as start 01635 Caller : general 01636 01637 =cut 01638 01639 sub get_all_underlying_Slices { 01640 my ($self, $start, $end, $strand) = @_; 01641 my $underlying_slices = []; 01642 01643 $start = 1 if (!defined($start)); 01644 $end ||= $self->length; 01645 $strand ||= 1; 01646 01647 ## Fix coordinates (needed for sub_Slices) 01648 if ($self->strand == -1) { 01649 $strand = -$strand; 01650 my $aux = $start; 01651 $start = $self->start + ($self->length - $end); 01652 $end = $self->start + ($self->length - $aux); 01653 } else { 01654 $start += $self->start - 1; 01655 $end += $self->start - 1; 01656 } 01657 01658 my $current_position; 01659 # if ($strand == 1) { 01660 $current_position = $start; 01661 # } else { 01662 # $current_position = $end; 01663 # } 01664 foreach my $pair (sort {$a->{start} <=> $b->{start}} @{$self->get_all_Slice_Mapper_pairs}) { 01665 my $this_slice = $pair->{slice}; 01666 my $mapper = $pair->{mapper}; 01667 my $slice_start = $pair->{start}; 01668 my $slice_end = $pair->{end}; 01669 next if ($slice_start > $end or $slice_end < $start); 01670 01671 my @sequence_coords = $mapper->map_coordinates( 01672 'alignment', 01673 $start, 01674 $end, 01675 $strand, 01676 'alignment' 01677 ); 01678 my $this_subseq_start; 01679 my $this_subseq_end; 01680 my $this_subseq_strand; 01681 foreach my $sequence_coord (@sequence_coords) { 01682 ## $sequence_coord refer to genomic_align (a slice in the [+] strand) 01683 if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 01684 $this_subseq_start = $sequence_coord->start 01685 if (!defined($this_subseq_start) or $this_subseq_start > $sequence_coord->start); 01686 $this_subseq_end = $sequence_coord->end 01687 if (!defined($this_subseq_end) or $this_subseq_end < $sequence_coord->end); 01688 $this_subseq_strand = $sequence_coord->strand 01689 if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate") and !defined($this_subseq_strand)); 01690 } 01691 } 01692 # next if (!defined($this_subseq_start)); # if the whole requested region correspond to a gap 01693 my $start_position = ($start>$slice_start)?$start:$slice_start; # in AlignSlice coordinates 01694 my $end_position = ($end<$slice_end)?$end:$slice_end; # in AlignSlice coordinates 01695 # if ($strand == 1) { 01696 if ($start_position > $current_position) { 01697 my $this_underlying_slice = new Bio::EnsEMBL::Slice( 01698 -coord_system => $gap_coord_system, 01699 -seq_region_name => "GAP", 01700 -start => $current_position, 01701 -end => $start_position - 1, 01702 -strand => 0 01703 ); 01704 $this_underlying_slice->{seq} = "." x ($start_position - $current_position); 01705 push(@$underlying_slices, $this_underlying_slice); 01706 } 01707 # } else { 01708 # if ($end_position < $current_position) { 01709 # my $this_underlying_slice = new Bio::EnsEMBL::Slice( 01710 # -seq_region_name => "GAP", 01711 # -start => $end_position + 1, 01712 # -end => $current_position, 01713 # -strand => 0 01714 # ); 01715 # $this_underlying_slice->{seq} = "." x ($current_position - $end_position); 01716 # unshift(@$underlying_slices, $this_underlying_slice); 01717 # } 01718 # } 01719 $current_position = $end_position + 1; 01720 my $this_underlying_slice; 01721 if (!defined($this_subseq_start)) { 01722 $this_underlying_slice = new Bio::EnsEMBL::Slice( 01723 -coord_system => $gap_coord_system, 01724 -seq_region_name => "GAP", 01725 -start => $start_position, 01726 -end => $end_position, 01727 -strand => 0 01728 ); 01729 $this_underlying_slice->{seq} = "-" x ($end_position - $start_position + 1); 01730 } else { 01731 $this_underlying_slice = new Bio::EnsEMBL::Slice( 01732 -coord_system => $this_slice->coord_system, 01733 -seq_region_name => $this_slice->seq_region_name, 01734 -start => $this_subseq_start, 01735 -end => $this_subseq_end, 01736 -strand => $this_subseq_strand 01737 ); 01738 $this_underlying_slice->{seq} = $self->subseq($start_position, $end_position, $strand); 01739 $this_underlying_slice->{_tree} = $this_slice->{_tree} if (defined($this_slice->{_tree})); 01740 } 01741 # if ($strand == 1) { 01742 push(@$underlying_slices, $this_underlying_slice); 01743 # } else { 01744 # unshift(@$underlying_slices, $this_underlying_slice); 01745 # } 01746 } 01747 if ($end >= $current_position) { 01748 my $this_underlying_slice = new Bio::EnsEMBL::Slice( 01749 -coord_system => $gap_coord_system, 01750 -seq_region_name => "GAP", 01751 -seq_region_length => ($end - $current_position + 1), 01752 -start => $current_position, 01753 -end => $end, 01754 -strand => 0 01755 ); 01756 $this_underlying_slice->{seq} = "." x ($end - $current_position + 1); 01757 push(@$underlying_slices, $this_underlying_slice); 01758 } 01759 if ($strand == -1) { 01760 @$underlying_slices = reverse(@$underlying_slices); 01761 } 01762 01763 return $underlying_slices; 01764 } 01765 01766 =head2 get_original_seq_region_position 01767 01768 Arg [1] : int $position 01769 relative to start of slice, which is 1. 01770 Description: This Slice is made of several Bio::EnsEMBL::Slices mapped 01771 on it with gaps inside and regions with no matching 01772 sequence. This method returns the original seq_region_position 01773 in the original Slice of the requested position in AlignSlice 01774 coordinates 01775 Example : my ($slice, $seq_region_position) = $as_slice-> 01776 get_original_seq_region_position(100); 01777 Returntype : ($slice, $seq_region_position), an array where the first 01778 element is a Bio::EnsEMBL::Slice and the second one is the 01779 requested seq_region_position. 01780 Exceptions : if the position corresponds to a gap, the slice will be a fake GAP 01781 slice and the position will be the requested one (in AlignSlice 01782 coordinates) 01783 Caller : general 01784 01785 =cut 01786 01787 sub get_original_seq_region_position { 01788 my ($self, $position) = @_; 01789 my $underlying_slice = $self->get_all_underlying_Slices($position, $position, 1)->[0]; 01790 01791 return ($underlying_slice, $underlying_slice->start); 01792 } 01793 01794 01795 =head2 get_all_constrained_elements 01796 01797 Arg 1 : (opt) string $method_link_type (default = GERP_CONSTRAINED_ELEMENT) 01798 Arg 2 : (opt) listref Bio::EnsEMBL::Compara::GenomeDB $species_set 01799 (default, the set of species from the MethodLinkSpeciesSet used 01800 to build this AlignSlice) 01801 Example : my $constrained_elements = 01802 $align_slice->get_all_constrained_elements(); 01803 Description: Retrieve the corresponding constrained elements for these alignments. 01804 Objects will be mapped on this AlignSlice::Slice, i.e. the 01805 reference_slice, reference_slice_start, reference_slice_end 01806 and reference_slice_strand will refer to this AlignSlice::Slice 01807 object 01808 Returntype : ref. to an array of Bio::EnsEMBL::Compara::GenomicAlignBlock 01809 objects. 01810 Caller : object::methodname 01811 Status : At risk 01812 01813 =cut 01814 01815 sub get_all_constrained_elements { 01816 my ($self, $method_link_type, $species_set) = @_; 01817 my $all_constrained_elements = []; 01818 01819 return [] if (!$self->{_align_slice}); 01820 my $all_original_constrained_elements = $self->{_align_slice}-> 01821 get_all_constrained_elements($method_link_type, $species_set); 01822 foreach my $this_constrained_element (@$all_original_constrained_elements) { 01823 foreach my $this_genomic_align (@{$this_constrained_element->get_all_GenomicAligns}) { 01824 if ($this_genomic_align->genome_db->name eq $self->genome_db->name) { 01825 my $constrained_slice = $this_genomic_align->get_Slice; 01826 my $slices = $self->map_original_Slice($constrained_slice); 01827 $this_constrained_element->reference_slice($self); 01828 $this_constrained_element->reference_slice_start($slices->[0]->start); 01829 $this_constrained_element->reference_slice_end($slices->[0]->end); 01830 $this_constrained_element->reference_slice_strand($slices->[0]->strand); 01831 push(@$all_constrained_elements, $this_constrained_element); 01832 } 01833 } 01834 } 01835 01836 return $all_constrained_elements; 01837 } 01838 01839 01840 =head2 map_original_Slice 01841 01842 Arg [1] : Bio::EnsEMBL::Slice $original_slice 01843 Description: This Slice is made of several Bio::EnsEMBL::Slices mapped 01844 on it with gaps inside and regions with no matching 01845 sequence. This method tries to map on this Slice the 01846 region(s) corresponding to the provided $original_slice 01847 NB: This method does not know how to project Slices onto 01848 other coordinate systems. It is your responsability to 01849 provide an original Slice on the same coordinate system 01850 as the underlying Bio::EnsEMBL::Slices 01851 Example : my $slices = $as_slice->map_original_Slice($orginal_slice); 01852 Returntype : listref of Bio::EnsEMBL::Compara::AlignSlice::Slice objects 01853 which are the sub_Slices of this Bio::EnsEMBL::Compara:: 01854 AlignSlice::Slice where the $original_slice maps 01855 Exceptions : 01856 01857 =cut 01858 01859 sub map_original_Slice { 01860 my ($self, $original_slice) = @_; 01861 my $mapped_slices = []; 01862 01863 return $mapped_slices if (!defined($original_slice)); 01864 01865 foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { 01866 my $this_slice = $pair->{slice}; 01867 my $mapper = $pair->{mapper}; 01868 my $slice_start = $pair->{start}; 01869 my $slice_end = $pair->{end}; 01870 next if (!$this_slice->coord_system->equals($original_slice->coord_system)); 01871 next if ($this_slice->seq_region_name ne $original_slice->seq_region_name); 01872 01873 next if ($this_slice->start > $original_slice->end or $this_slice->end < $original_slice->start); 01874 01875 my @sequence_coords = $mapper->map_coordinates( 01876 'sequence', 01877 $original_slice->start, 01878 $original_slice->end, 01879 $original_slice->strand, 01880 'sequence' 01881 ); 01882 my $mapped_start; 01883 my $mapped_end; 01884 my $mapped_strand; 01885 foreach my $sequence_coord (@sequence_coords) { 01886 if ($sequence_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 01887 if (!defined($mapped_start) or ($mapped_start > $sequence_coord->start)) { 01888 $mapped_start = $sequence_coord->start; 01889 } 01890 if (!defined($mapped_end) or ($mapped_end < $sequence_coord->end)) { 01891 $mapped_end = $sequence_coord->end; 01892 } 01893 if (!defined($mapped_strand)) { 01894 $mapped_strand = $sequence_coord->strand; 01895 } elsif ($mapped_strand != $sequence_coord->strand) { 01896 warning("strand inversion within a Slice-Mapper pair!"); 01897 $mapped_start = undef; 01898 $mapped_end = undef; 01899 $mapped_strand = undef; 01900 last; 01901 } 01902 } 01903 } 01904 if (defined($mapped_start) and defined($mapped_end) and defined($mapped_strand)) { 01905 my $sub_slice = $self->sub_Slice($mapped_start, $mapped_end, $mapped_strand); 01906 push(@$mapped_slices, $sub_slice) if ($sub_slice); 01907 } 01908 } 01909 01910 return $mapped_slices; 01911 } 01912 01913 01914 =head2 expand (not supported) 01915 01916 Expanding a Bio::EnsEMBL::Compara::AlignSlice::Slice object is not supported at the moment. 01917 You could create a new Bio::EnsEMBL::Slice and fetch a new Bio::EnsEMBL::Compara::AlignSlice 01918 from it. On the hand, if you only want to extend the sequence, you could use the subseq method 01919 with a negatve start value or an end value larger than the end of this Slice. 01920 01921 This method returns undef 01922 01923 =cut 01924 01925 sub expand { 01926 my $self = shift; 01927 warning("Cannot expand a Bio::EnsEMBL::Compara::AlignSlice::Slice object"); 01928 return undef; 01929 } 01930 01931 01932 =head2 get_all_Attributes 01933 01934 Arg [1] : optional string $attrib_code 01935 The code of the attribute type to retrieve values for. 01936 Example : ($htg_phase) = @{$slice->get_all_Attributes('htg_phase')}; 01937 @slice_attributes = @{$slice->get_all_Attributes()}; 01938 Description: Gets a list of Attributes of all teh underlying slice''s 01939 seq_region. Optionally just get Attributes for given code. 01940 This Slice is made of several Bio::EnsEMBL::Slices mapped 01941 on it. This method go through all of them, retrieves the 01942 data and return them in order. There will be one set of 01943 Attributes by underlying slice. 01944 Returntype : listref Bio::EnsEMBL::Attribute 01945 Exceptions : warning if slice does not have attached adaptor 01946 Caller : general 01947 01948 =cut 01949 01950 sub get_all_Attributes { 01951 my $self = shift; 01952 my $attributes = []; 01953 01954 foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { 01955 my $this_slice = $pair->{slice}; 01956 push(@$attributes, @{$this_slice->get_all_Attributes(@_)}); 01957 } 01958 return $attributes; 01959 } 01960 01961 01962 =head2 get_all_VariationFeatures 01963 01964 Args : none 01965 Description :returns all variation features on this slice. This function will only work 01966 correctly if the variation database has been attached to the core database. 01967 This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This 01968 method go through all of them, retrieves the data and maps them on this 01969 Bio::EnsEMBL::Compara::AlignSlice::Slice object. 01970 ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature 01971 Exceptions : none 01972 Caller : contigview, snpview 01973 01974 =cut 01975 01976 sub get_all_VariationFeatures { 01977 my $self = shift; 01978 01979 return $self->_method_returning_simple_features("get_all_VariationFeatures", @_) 01980 } 01981 01982 01983 =head2 get_all_genotyped_VariationFeatures 01984 01985 Args : none 01986 Function : returns all variation features on this slice that have been genotyped. This 01987 function will only work correctly if the variation database has been 01988 attached to the core database. 01989 This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This 01990 method go through all of them, retrieves the data and maps them on this 01991 Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end, 01992 strand and slice attributes. 01993 ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature 01994 Exceptions : none 01995 Caller : contigview, snpview 01996 01997 =cut 01998 01999 sub get_all_genotyped_VariationFeatures { 02000 my $self = shift; 02001 02002 return $self->_method_returning_simple_features("get_all_genotyped_VariationFeatures", @_) 02003 } 02004 02005 02006 =head2 get_all_RepeatFeatures 02007 02008 Arg [1] : (optional) string $logic_name 02009 The name of the analysis performed on the repeat features 02010 to obtain. 02011 Arg [2] : (optional) string $repeat_type 02012 Limits features returned to those of the specified repeat_type 02013 Example : @repeat_feats = @{$slice->get_all_RepeatFeatures(undef,'LTR')}; 02014 Description: Retrieves the RepeatFeatures which overlap with 02015 logic name $logic_name and with score above $score. If 02016 $logic_name is not defined features of all logic names are 02017 retrieved. 02018 This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This 02019 method go through all of them, retrieves the data and maps them on this 02020 Bio::EnsEMBL::Compara::AlignSlice::Slice object by changing start, end, 02021 strand and slice attributes. 02022 Returntype : listref of Bio::EnsEMBL::RepeatFeatures 02023 Exceptions : warning if slice does not have attached adaptor 02024 Caller : general 02025 02026 =cut 02027 02028 sub get_all_RepeatFeatures { 02029 my $self = shift; 02030 02031 return $self->_method_returning_simple_features("get_all_RepeatFeatures", @_) 02032 } 02033 02034 02035 =head2 project (testing) 02036 02037 Arg [1] : string $name 02038 The name of the coordinate system to project this slice onto 02039 Arg [2] : string $version 02040 The version of the coordinate system (such as 'NCBI34') to 02041 project this slice onto 02042 Example : 02043 my $clone_projection = $slice->project('clone'); 02044 02045 foreach my $seg (@$clone_projection) { 02046 my $clone = $segment->to_Slice(); 02047 print $slice->seq_region_name(), ':', $seg->from_start(), '-', 02048 $seg->from_end(), ' -> ', 02049 $clone->seq_region_name(), ':', $clone->start(), '-',$clone->end(), 02050 $clone->strand(), "\n"; 02051 } 02052 Description: This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This 02053 method go through all of them, porject them and maps the projections 02054 on this Bio::EnsEMBL::Compara::AlignSlice::Slice object. 02055 The original 'project' method returns the results of 'projecting' 02056 a slice onto another coordinate system. Projecting to a coordinate 02057 system that the slice is assembled from is analagous to retrieving a tiling 02058 path. The original method may also be used to 'project up' to a higher 02059 level coordinate system, however. 02060 02061 This method returns a listref of triplets [start,end,slice] 02062 which represents the projection. The start and end defined the 02063 region of this slice which is made up of the third value of 02064 the triplet: a slice in the requested coordinate system. 02065 02066 Because of the gaps in the mapping of the Bio::EnsEMBL::Slices 02067 the lenght of the slice returned in the tripet may be different 02068 than the distance defined by the start and end of the 02069 Bio::EnsEMBL::ProjectionSegment object. 02070 Returntype : list reference of Bio::EnsEMBL::ProjectionSegment objects which 02071 can also be used as [$start,$end,$slice] triplets 02072 Exceptions : none 02073 Caller : general 02074 02075 =cut 02076 02077 sub project { 02078 my $self = shift; 02079 my $cs_name = shift; 02080 my $cs_version = shift; 02081 my $projections = []; 02082 02083 throw('Coord_system name argument is required') if(!$cs_name); 02084 02085 foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { 02086 my $this_slice = $pair->{slice}; 02087 my $this_mapper = $pair->{mapper}; 02088 my $this_projections = $this_slice->project($cs_name, $cs_version); 02089 foreach my $this_projection (@$this_projections) { 02090 my ($this_start, $this_end); 02091 if ($this_slice->strand == 1) { 02092 $this_start = $this_slice->start + $this_projection->from_start - 1; 02093 $this_end = $this_slice->start + $this_projection->from_end - 1; 02094 } else { 02095 $this_start = $this_slice->start + ($this_slice->length - $this_projection->from_start); 02096 $this_end = $this_slice->start + ($this_slice->length - $this_projection->from_end); 02097 } 02098 my $new_slice = $this_projection->to_Slice; 02099 my ($new_start, $new_end); 02100 my @alignment_coords = $this_mapper->map_coordinates( 02101 'sequence', 02102 $this_start, 02103 $this_start, 02104 1, 02105 'sequence' 02106 ); 02107 foreach my $alignment_coord (@alignment_coords) { 02108 if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 02109 $new_start = $alignment_coord->start; 02110 } 02111 } 02112 next if (!defined($new_start)); 02113 @alignment_coords = $this_mapper->map_coordinates( 02114 'sequence', 02115 $this_end, 02116 $this_end, 02117 1, 02118 'sequence' 02119 ); 02120 foreach my $alignment_coord (@alignment_coords) { 02121 if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 02122 $new_end = $alignment_coord->start; 02123 } 02124 } 02125 next if (!defined($new_end)); 02126 02127 ## Truncate projection in order to fit into this AlignSlice 02128 if ($new_start > $self->end) { 02129 next; ## Maps outside of sub_AlignSlice 02130 } elsif ($new_start < $self->start) { 02131 if ($new_slice->strand == -1) { 02132 $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($self->start - $new_start)); 02133 } else { 02134 $new_slice = $new_slice->sub_Slice($self->start - $new_start + 1, $new_slice->length); 02135 } 02136 $new_start = 1; 02137 $new_end = $new_slice->length; 02138 } 02139 02140 ## Truncate projection in order to fit into this AlignSlice 02141 if ($new_end < $self->start) { 02142 next; ## Maps outside of sub_AlignSlice 02143 } elsif ($new_end > $self->length) { 02144 # We have to truncate this projection 02145 if ($new_slice->strand == -1) { 02146 $new_slice = $new_slice->sub_Slice(1 + $new_end - $self->length, $new_slice->length); 02147 } else { 02148 $new_slice = $new_slice->sub_Slice(1, $new_slice->length - ($new_end - $self->length)); 02149 } 02150 $new_end = $self->length; 02151 } 02152 02153 my $new_projection = bless([$new_start, $new_end, $new_slice], 02154 "Bio::EnsEMBL::ProjectionSegment"); 02155 push(@$projections, $new_projection); 02156 } 02157 } 02158 02159 return $projections; 02160 } 02161 02162 02163 =head2 _method_returning_simple_features 02164 02165 Args[1] : method_name 02166 Description : This Slice is made of several Bio::EnsEMBL::Slices mapped on it. This 02167 method go through all of them, calls method_name and maps teh result on 02168 this Bio::EnsEMBL::Compara::AlignSlice::Slice object. 02169 ReturnType : listref of Bio::EnsEMBL::Variation::VariationFeature 02170 Exceptions : none 02171 Caller : contigview, snpview 02172 02173 =cut 02174 02175 sub _method_returning_simple_features { 02176 my $self = shift; 02177 my $method = shift; 02178 my $ret = []; 02179 02180 foreach my $pair (@{$self->get_all_Slice_Mapper_pairs}) { 02181 my $this_slice = $pair->{slice}; 02182 my $this_mapper = $pair->{mapper}; 02183 my $this_ret = $this_slice->$method(@_); 02184 foreach my $this_object (@$this_ret) { 02185 my ($this_start, $this_end); 02186 if ($this_slice->strand == 1) { 02187 $this_start = $this_object->slice->start + $this_object->start - 1; 02188 $this_end = $this_object->slice->start + $this_object->end - 1; 02189 } else { 02190 $this_start = $this_object->slice->start + ($this_object->slice->length - $this_object->end); 02191 $this_end = $this_object->slice->start + ($this_object->slice->length - $this_object->start); 02192 } 02193 my @alignment_coords = $this_mapper->map_coordinates( 02194 'sequence', 02195 $this_start, 02196 $this_end, 02197 $this_object->strand, 02198 'sequence' 02199 ); 02200 my ($start, $end, $strand); 02201 foreach my $alignment_coord (@alignment_coords) { 02202 if ($alignment_coord->isa("Bio::EnsEMBL::Mapper::Coordinate")) { 02203 if (!defined($start) or $start > $alignment_coord->start) { 02204 $start = $alignment_coord->start; 02205 } 02206 if (!defined($end) or $end < $alignment_coord->end) { 02207 $end = $alignment_coord->end; 02208 } 02209 if (!defined($strand)) { 02210 $strand = $alignment_coord->strand * $this_slice->strand; 02211 } 02212 } 02213 } 02214 my $new_object; 02215 %$new_object = %$this_object; 02216 bless $new_object, ref($this_object); 02217 if (defined($start) and defined($end) and defined($strand)) { 02218 $new_object->{start} = $start - $self->start + 1; 02219 $new_object->{end} = $end - $self->start + 1; 02220 $new_object->{strand} = $strand; 02221 $new_object->{slice} = $self; 02222 # Skip this object if it maps outside of this AlignSlice 02223 next if ($new_object->{start} > $self->length or $new_object->{end} < 1); 02224 push(@$ret, $new_object); 02225 } 02226 } 02227 } 02228 return $ret; 02229 } 02230 02231 1;