Archive Ensembl HomeArchive Ensembl Home
BaseGenomicAlignSet.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree
00022 
00023 =head1 APPENDIX
00024 
00025 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
00026 
00027 =cut
00028 
00029 
00030 # Let the code begin...
00031 
00032 package Bio::EnsEMBL::Compara::BaseGenomicAlignSet;
00033 use strict;
00034 
00035 # Object preamble
00036 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00037 use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose);
00038 
00039 
00040 =head2 slice
00041 
00042   Arg [1]    : (optional) Bio::EnsEMBL::Slice $reference_slice
00043   Example    : my $slice = $genomic_align_block->slice;
00044   Example    : $genomic_align_block->slice($slice);
00045   Description: get/set for attribute slice.
00046   Returntype : Bio::EnsEMBL::Slice object
00047   Exceptions : 
00048   Caller     : general
00049   Status     : Stable
00050 
00051 =cut
00052 
00053 sub slice {
00054   my ($self, $reference_slice) = @_;
00055 
00056   if (defined($reference_slice)) {
00057 #     throw "[$reference_slice] is not a Bio::EnsEMBL::Slice"
00058 #         unless $reference_slice->isa("Bio::EnsEMBL::Slice");
00059     $self->{'reference_slice'} = $reference_slice;
00060   }
00061 
00062   return $self->{'reference_slice'};
00063 }
00064 
00065 =head2 reference_slice
00066 
00067   Arg [1]    : (optional) Bio::EnsEMBL::Slice $reference_slice
00068   Example    : my $reference_slice = $genomic_align_block->reference_slice;
00069   Example    : $genomic_align_block->reference_slice($slice);
00070   Description: Alias for slice method. TO BE DEPRECATED.
00071   Returntype : Bio::EnsEMBL::Slice object
00072   Exceptions : 
00073   Caller     : general
00074   Status     : Stable
00075 
00076 =cut
00077 
00078 sub reference_slice {
00079   my ($self, $reference_slice) = @_;
00080 
00081   return $self->slice($reference_slice);
00082 }
00083 
00084 =head2 start
00085 
00086   Arg [1]    : (optional) integer $start
00087   Example    : my $start = $genomic_align_block->start;
00088   Example    : $genomic_align_block->start(1035);
00089   Description: get/set for attribute reference_slice_start. A value of 0 will set
00090                the attribute to undefined.
00091   Returntype : integer
00092   Exceptions : none
00093   Caller     : general
00094 
00095 =cut
00096 
00097 sub start {
00098   my $self = shift;
00099   return $self->reference_slice_start(@_);
00100 }
00101 
00102 
00103 =head2 reference_slice_start
00104 
00105   Arg [1]    : integer $reference_slice_start
00106   Example    : my $reference_slice_start = $genomic_align_block->reference_slice_start;
00107   Example    : $genomic_align_block->reference_slice_start(1035);
00108   Description: get/set for attribute reference_slice_start. A value of 0 will set
00109                the attribute to undefined.
00110   Returntype : integer
00111   Exceptions : none
00112   Caller     : general
00113   Status     : Stable
00114 
00115 =cut
00116 
00117 sub reference_slice_start {
00118     my ($self, $reference_slice_start) = @_;
00119     
00120     if (defined($reference_slice_start)) {
00121     $self->{'reference_slice_start'} = ($reference_slice_start or undef);
00122     }
00123     
00124     return $self->{'reference_slice_start'};
00125 }
00126 
00127 
00128 =head2 end
00129 
00130   Arg [1]    : (optional) integer $end
00131   Example    : my $end = $genomic_align_block->end;
00132   Example    : $genomic_align_block->end(1283);
00133   Description: get/set for attribute reference_slice_end. A value of 0 will set
00134                the attribute to undefined.
00135   Returntype : integer
00136   Exceptions : none
00137   Caller     : general
00138 
00139 =cut
00140 
00141 sub end {
00142   my $self = shift;
00143   return $self->reference_slice_end(@_);
00144 }
00145 
00146 
00147 =head2 reference_slice_end
00148 
00149   Arg [1]    : integer $reference_slice_end
00150   Example    : my $reference_slice_end = $genomic_align_block->reference_slice_end;
00151   Example    : $genomic_align_block->reference_slice_end(1283);
00152   Description: get/set for attribute reference_slice_end. A value of 0 will set
00153                the attribute to undefined.
00154   Returntype : integer
00155   Exceptions : none
00156   Caller     : general
00157   Status     : Stable
00158 
00159 =cut
00160 
00161 sub reference_slice_end {
00162   my ($self, $reference_slice_end) = @_;
00163  
00164   if (defined($reference_slice_end)) {
00165     $self->{'reference_slice_end'} = ($reference_slice_end or undef);
00166   }
00167   
00168   return $self->{'reference_slice_end'};
00169 
00170 }
00171 
00172 =head2 strand
00173 
00174   Arg [1]    : integer $strand
00175   Example    : my $strand = $genomic_align_block->strand;
00176   Example    : $genomic_align_block->strand(-1);
00177   Description: get/set for attribute strand. A value of 0 will set
00178                the attribute to undefined.
00179   Returntype : integer
00180   Exceptions : none
00181   Caller     : general
00182   Status     : Stable
00183 
00184 =cut
00185 
00186 sub strand {
00187     my ($self, $reference_slice_strand) = @_;
00188     
00189     if (defined($reference_slice_strand)) {
00190     $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
00191     }
00192     
00193     return $self->{'reference_slice_strand'};
00194 }
00195 
00196 =head2 reference_slice_strand
00197 
00198   Arg [1]    : integer $reference_slice_strand
00199   Example    : my $reference_slice_strand = $genomic_align_block->reference_slice_strand;
00200   Example    : $genomic_align_block->reference_slice_strand(-1);
00201   Description: get/set for attribute reference_slice_strand. A value of 0 will set
00202                the attribute to undefined.
00203   Returntype : integer
00204   Exceptions : none
00205   Caller     : general
00206   Status     : Stable
00207 
00208 =cut
00209 
00210 sub reference_slice_strand {
00211   my ($self, $reference_slice_strand) = @_;
00212  
00213   if (defined($reference_slice_strand)) {
00214     $self->{'reference_slice_strand'} = ($reference_slice_strand or undef);
00215   }
00216   
00217   return $self->{'reference_slice_strand'};
00218 }
00219 
00220 =head2 get_original_strand
00221 
00222   Args       : none
00223   Example    : if (!$genomic_align_block->get_original_strand()) {
00224                  # original GenomicAlignBlock has been reverse-complemented
00225                }
00226   Description: getter for the _orignal_strand attribute
00227   Returntype : none
00228   Exceptions : none
00229   Caller     : general
00230   Status     : Stable
00231 
00232 =cut
00233 
00234 sub get_original_strand {
00235   my ($self) = @_;
00236 
00237   if (!defined($self->{_original_strand})) {
00238     $self->{_original_strand} = 1;
00239   }
00240 
00241   return $self->{_original_strand};
00242 }
00243 
00244 =head2 restrict_between_reference_positions
00245 
00246   Arg[1]     : [optional] int $start, refers to the reference_dnafrag
00247   Arg[2]     : [optional] int $end, refers to the reference_dnafrag
00248   Arg[3]     : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign
00249   Arg[4]     : [optional] boolean $skip_empty_GenomicAligns
00250   Example    : none
00251   Description: restrict this GenomicAlignBlock. It returns a new object unless no
00252                restriction is needed. In that case, it returns the original unchanged
00253                object
00254                It might be the case that the restricted region coincide with a gap
00255                in one or several GenomicAligns. By default these GenomicAligns are
00256                returned with a dnafrag_end equals to its dnafrag_start + 1. For instance,
00257                a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344
00258                correspond to a block which goes on this region from before 12345 to
00259                after 12344, ie just between 12344 and 12345. You can choose to remove
00260                these empty GenomicAligns by setting $skip_empty_GenomicAligns to any
00261                true value.
00262   Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In
00263                list context, returns the previous object and the start and end
00264                positions of the restriction in alignment coordinates (from 1 to
00265                alignment_length)
00266   Exceptions : return undef if reference positions lie outside of the alignment
00267   Caller     : general
00268   Status     : At risk
00269 
00270 =cut
00271 
00272 sub restrict_between_reference_positions {
00273   my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_;
00274   my $genomic_align_set;
00275   my $new_reference_genomic_align;
00276   my $new_genomic_aligns = [];
00277   
00278   $reference_genomic_align ||= $self->reference_genomic_align;
00279   throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align);
00280   $start = $reference_genomic_align->dnafrag_start if (!defined($start));
00281   $end = $reference_genomic_align->dnafrag_end if (!defined($end));
00282 
00283   if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) {
00284     # restricting outside of boundaries => return undef object
00285     warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")");
00286     return wantarray ? (undef, undef, undef) : undef;
00287   }
00288   my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start;
00289   my $number_of_base_pairs_to_trim_from_the_end  = $reference_genomic_align->dnafrag_end - $end;
00290 
00291   my $is_ref_low_coverage = 0;
00292   if ($reference_genomic_align->cigar_line =~ /X/) {
00293       $is_ref_low_coverage = 1;
00294   }
00295 
00296   ## Skip if no restriction is needed. Return original object! We are still going on with the
00297   ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple)
00298   ## alignment may start or end with gaps in the reference species. In that case, we want to
00299   ## trim these gaps from the alignment as they fall just outside of the region of interest
00300 
00301   ##Exception if the reference species is low coverage, then need to continue 
00302   ##with this routine to find out the correct align positions
00303   if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) {
00304     return wantarray ? ($self, 1, $self->length) : $self;
00305   }
00306 
00307   my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1);
00308 
00309   ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object
00310   throw("Reference GenomicAlign not found!") if (!$reference_genomic_align);
00311 
00312   my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line);
00313   if ($negative_strand) {
00314     @reference_cigar = reverse @reference_cigar;
00315   }
00316 
00317   #If this is negative, eg when a slice starts in one block and ends in
00318   #another, need to set to 0 to ensure we enter the loop below. This
00319   #fixes a bug when using a 2x species as the reference and fetching using
00320   #an expanded align slice. 
00321   if ($number_of_base_pairs_to_trim_from_the_start < 0) {
00322       $number_of_base_pairs_to_trim_from_the_start = 0;
00323   }
00324 
00325   ## Parse start of cigar_line for the reference GenomicAlign
00326   my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed
00327 
00328   if ($number_of_base_pairs_to_trim_from_the_start >= 0) {
00329     my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start)
00330     ## Loop through the cigar pieces
00331     while (my $cigar = shift(@reference_cigar)) {
00332       # Parse each cigar piece
00333       my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/);
00334       $num = 1 if ($num eq "");
00335 
00336       # Insertions are not part of the alignment, don't count them
00337       if ($type ne "I") {
00338         $counter_of_trimmed_columns_from_the_start += $num;
00339       }
00340 
00341       # Matches and insertions are actual base pairs in the reference
00342       if ($type eq "M" or $type eq "I") {
00343         $counter_of_trimmed_base_pairs += $num;
00344         # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
00345         # we substitute this cigar piece by a shorter one
00346         if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) {
00347           my $new_cigar_piece = "";
00348           # length of the new cigar piece
00349           my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start;
00350           if ($length > 1) {
00351             $new_cigar_piece = $length.$type;
00352           } elsif ($length == 1) {
00353             $new_cigar_piece = $type;
00354           }
00355           unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
00356 
00357           # There is no need to correct the counter of trimmed columns if we are in an insertion
00358           # when we overshoot
00359           if ($type eq "M") {
00360             $counter_of_trimmed_columns_from_the_start -= $length;
00361           }
00362 
00363           ## We don't want to start with an insertion or a deletion. Trim them!
00364           while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) {
00365             my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/);
00366             $num = 1 if ($num eq "");
00367             # only counts deletions, insertions are not part of the aligment
00368             $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D");
00369             shift(@reference_cigar);
00370           }
00371           last;
00372         }
00373       }
00374     }
00375   }
00376 
00377   #If this is negative, eg when a slice starts in one block and ends in
00378   #another, need to set to 0 to ensure we enter the loop below. This
00379   #fixes a bug when using a 2x species as the reference and fetching using
00380   #an expanded align slice. 
00381   if ($number_of_base_pairs_to_trim_from_the_end < 0) {
00382       $number_of_base_pairs_to_trim_from_the_end = 0;
00383   }
00384 
00385   ## Parse end of cigar_line for the reference GenomicAlign
00386   my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed
00387   if ($number_of_base_pairs_to_trim_from_the_end >= 0) {
00388     my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end)
00389     ## Loop through the cigar pieces
00390     while (my $cigar = pop(@reference_cigar)) {
00391       # Parse each cigar piece
00392       my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/);
00393       $num = 1 if ($num eq "");
00394 
00395       # Insertions are not part of the alignment, don't count them
00396       if ($type ne "I") {
00397         $counter_of_trimmed_columns_from_the_end += $num;
00398       }
00399 
00400       # Matches and insertions are actual base pairs in the reference
00401       if ($type eq "M" or $type eq "I") {
00402         $counter_of_trimmed_base_pairs += $num;
00403         # If this cigar piece is too long and we overshoot the number of base pairs we want to trim,
00404         # we substitute this cigar piece by a shorter one
00405         if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) {
00406           my $new_cigar_piece = "";
00407           # length of the new cigar piece
00408           my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end;
00409           if ($length > 1) {
00410             $new_cigar_piece = $length.$type;
00411           } elsif ($length == 1) {
00412             $new_cigar_piece = $type;
00413           }
00414           push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece);
00415 
00416           # There is no need to correct the counter of trimmed columns if we are in an insertion
00417           # when we overshoot
00418           if ($type eq "M") {
00419             $counter_of_trimmed_columns_from_the_end -= $length;
00420           }
00421 
00422           ## We don't want to end with an insertion or a deletion. Trim them!
00423           while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) {
00424             my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/);
00425             $num = 1 if ($num eq "");
00426             # only counts deletions, insertions are not part of the aligment
00427             $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D");
00428             pop(@reference_cigar);
00429           }
00430           last;
00431         }
00432       }
00433     }
00434   }
00435 
00436   ## Skip if no restriction is needed. Return original object! This may happen when
00437   ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not
00438   ## start or end with gaps in the reference species.
00439   if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) {
00440     return wantarray ? ($self, 1, $self->length) : $self;
00441   }
00442 
00443   my ($aln_start, $aln_end);
00444   if ($negative_strand) {
00445     $aln_start = $counter_of_trimmed_columns_from_the_end + 1;
00446     $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start;
00447   } else {
00448     $aln_start = $counter_of_trimmed_columns_from_the_start + 1;
00449     $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end;
00450   }
00451 
00452   $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns);
00453   $new_reference_genomic_align = $genomic_align_set->reference_genomic_align;
00454 
00455   if (!defined $self->{'restricted_aln_start'}) {
00456       $self->{'restricted_aln_start'} = 0;
00457   }
00458   if (!defined $self->{'restricted_aln_end'}) {
00459       $self->{'restricted_aln_end'} = 0;
00460   }
00461   $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'};
00462   $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'};
00463   #$genomic_align_set->{'original_length'} = $self->length;
00464 
00465   #Need to use original gab length. If original_length is not set, length has
00466   #not changed. Needed when use 2X genome as reference. 
00467   if (defined $self->{'original_length'}) {
00468       $genomic_align_set->{'original_length'} = $self->{'original_length'};
00469   } else {
00470       $genomic_align_set->{'original_length'} = $self->length;
00471   }
00472 
00473   if (defined $self->slice) {
00474     if ($self->strand == 1) {
00475       $genomic_align_set->start($new_reference_genomic_align->dnafrag_start -
00476           $self->slice->start + 1);
00477       $genomic_align_set->end($new_reference_genomic_align->dnafrag_end -
00478           $self->slice->start + 1);
00479       $genomic_align_set->strand(1);
00480     } else {
00481       $genomic_align_set->start($self->{reference_slice}->{end} -
00482           $new_reference_genomic_align->{dnafrag_end} + 1);
00483       $genomic_align_set->end($self->{reference_slice}->{end} -
00484           $new_reference_genomic_align->{dnafrag_start} + 1);
00485       $genomic_align_set->strand(-1);
00486     }
00487   }
00488 
00489   return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set;
00490 }
00491 
00492 1;