Archive Ensembl HomeArchive Ensembl Home
CigarString.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 AUTHOR
00020 
00021 Juguang Xiao <juguang@tll.org.sg>
00022 
00023 =cut
00024 
00025 =head1 NAME
00026 
00027 Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
00028 strings
00029 
00030 =head1 DESCRIPTION
00031 
00032 Sequence alignment hits were previously stored within the core database
00033 as ungapped alignments. This imposed 2 major constraints on alignments:
00034 
00035 a) alignments for a single hit record would require multiple rows in the
00036    database, and
00037 b) it was not possible to accurately retrieve the exact original
00038    alignment.
00039 
00040 Therefore, in the new branch sequence alignments are now stored as
00041 ungapped alignments in the cigar line format (where CIGAR stands for
00042 Concise Idiosyncratic Gapped Alignment Report).
00043 
00044 In the cigar line format alignments are sotred as follows:
00045 
00046   M: Match
00047   D: Deletino
00048   I: Insertion
00049 
00050 An example of an alignment for a hypthetical protein match is shown
00051 below:
00052 
00053 
00054   Query:   42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
00055               PG    P    G     GP   R      PLGP
00056   Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
00057 
00058 protein_align_feature table as the following cigar line:
00059 
00060   7M4D12M2I2MD7M 
00061 
00062 =cut
00063 
00064 package Bio::EnsEMBL::Utils::CigarString;
00065 
00066 use strict;
00067 use vars qw(@ISA);
00068 use Bio::EnsEMBL::Root;
00069 
00070 @ISA = qw(Bio::EnsEMBL::Root);
00071 
00072 =head2 split_hsp
00073 
00074     Name  : split_hsp (this name is derived from the original sub in BlastWorn)
00075     Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object.
00076 my $factory = new Bio::EnsEMBL::Utils::CigarString;
00077 my $cigar_string = $factory->split_hsp($hsp);
00078   
00079     Function: generate cigar string.
00080     Argument: a HSP object.
00081     Returns : a text string.
00082   
00083 =cut
00084 
00085 sub split_hsp {
00086     my ($self, $hsp) = @_;
00087 
00088     $self->throw("a defined object needed") unless($hsp && defined($hsp));
00089     unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
00090         $self->throw("a HSP object needed");
00091     }
00092 
00093     my ($qtype, $htype) = $self->_findTypes($hsp);
00094     my ($qstrand, $hstrand) = $self->_findStrands($hsp);
00095     my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
00096 
00097     my @gaps = ();
00098     my @qchars = split(//, $hsp->query_string);
00099     my @hchars = split(//, $hsp->hit_string);
00100     my $qstart;
00101     if($qstrand == 1){
00102         $qstart = $hsp->query->start;
00103     }elsif($qstart == -1){
00104         $qstart = $hsp->query->end;
00105     }else{
00106         $self->warn("[$qstart], invalid strand value on query");
00107         $qstart = $hsp->query->start; 
00108         # Is this a SearchIO's bug???
00109     }
00110     
00111     my $hstart; 
00112     if($hstrand == 1){
00113         $hstart = $hsp->subject->start;
00114     }elsif($hstrand != -1){
00115         $hstart = $hsp->subject->end;
00116     }else{
00117         $self->throw("[$hstart], invalid strand value on subject");
00118     }
00119 
00120     my $qend = $qstart;
00121     my $hend = $hstart;
00122     my $count = 0;
00123     my $found = 0;
00124 
00125     my @align_coordinates = ();
00126     while($count <= $#qchars){
00127         if($qchars[$count] ne '-' && $hchars[$count] ne '-') {
00128             $qend += $qinc;
00129             $hend += $hinc;
00130             $found = 1;
00131         }else{ # gapped region
00132             push(@align_coordinates, [$qstart, $hstart]) if($found == 1);
00133 
00134             $qstart = $qend;
00135             $qstart += $qinc if($qchars[$count] ne '-');
00136 
00137             $hstart = $hend;
00138             $hstart += $hinc if($hchars[$count] ne '-');
00139 
00140             $qend = $qstart;
00141             $hend = $hstart;
00142             $found = 0;
00143         }
00144         $count++;
00145     }
00146 
00147     if($found){
00148         push(@align_coordinates, [$qstart, $hstart]);
00149     }
00150 
00151     my $cigar_string = "";
00152     my $last = $#align_coordinates;
00153     if($last >= 0){
00154         for(my $i=0; $i<$last; $i++){
00155             my $q_this_start = $align_coordinates[$i]->[0];
00156             my $q_next_start = $align_coordinates[$i+1]->[0];
00157             my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
00158             $q_length = abs($q_length);
00159             my $h_this_start = $align_coordinates[$i]->[1];
00160             my $h_next_start = $align_coordinates[$i+1]->[1];
00161             my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
00162             $h_length = abs($h_length);
00163 
00164             my $diff = $q_length - $h_length;
00165             if($diff > 0){ # Insertion
00166                 $cigar_string .= $diff unless($diff == 1);
00167                 $cigar_string .= 'I';
00168             }elsif($diff < 0){ # Deletion
00169                 $cigar_string .= -$diff unless($diff == -1);
00170                 $cigar_string .= 'D';
00171             }else{ # e.g $diff == 0, Match
00172                 $cigar_string .= $q_length unless($q_length == 1);
00173                 $cigar_string .= 'M';
00174             }
00175                 
00176         } # for
00177     } # if
00178     
00179     return $cigar_string;
00180 }
00181 
00182 
00183 sub _findStrands {
00184     my ($self,$hsp) = @_;
00185     
00186     my $qstrand = $hsp->query->strand;
00187     unless($qstrand == 1 || $qstrand == -1){
00188         $self->warn("query's strand value is neither 1 or -1");
00189         $qstrand = 1;
00190     }
00191     
00192     my $hstrand = $hsp->subject->strand;
00193     unless($hstrand == 1 || $hstrand == -1){
00194         $self->warn("subject's strand value is neither 1 or -1");
00195         $hstrand = 1;
00196     }
00197     
00198     return ( $qstrand, $hstrand);
00199 }
00200 
00201 sub _findTypes {
00202     my ($self,$hsp) = @_;
00203 
00204     my $type1;
00205     my $type2;
00206     my $len1 = $hsp->query->length();
00207     my $len2 = $hsp->subject->length();
00208 
00209     if ($len1/$len2 > 2) {
00210         $type1 = 'dna';
00211         $type2 = 'pep';
00212     } elsif ($len2/$len1 > 2) {
00213         $type1 = 'pep';
00214         $type2 = 'dna';
00215     } else {
00216         $type1 = 'dna';
00217         $type2 = 'dna';
00218     }
00219 
00220     return ($type1,$type2);
00221 }
00222 
00223 sub _findIncrements {
00224     my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
00225 
00226     my $qinc   = 1 * $qstrand;
00227     my $hinc   = 1 * $hstrand;
00228 
00229     if ($qtype eq 'dna' && $htype eq 'pep') {
00230     $qinc = 3 * $qinc;
00231     }
00232     if ($qtype eq 'pep' && $htype eq 'dna') {
00233     $hinc = 3 * $hinc;
00234     }
00235 
00236     return ($qinc,$hinc);
00237 }
00238 
00239 # This is a core logic of cigar string. The finite state machine theory is 
00240 # apply. See the below table, x-axis represents the input, with 3 options: 
00241 # (+/+) -- Both current query and subject bases are non-gap. Match
00242 # (-/+) -- The current query base is gap, but subject not. Deletion
00243 # (+/-) -- The current subject base is gap, but query not. Insertion
00244 # While the y-axis means the current state with letter 'M', 'D', 'I'
00245 #
00246 # The content of this table is the action taken in response of the input and 
00247 # the current state.
00248 # R     remain the state, counter increment.
00249 # G;X   generate the cigar line based on the current state and counter;
00250 #       clear the counter to zero and change to the state X
00251 #       
00252 #       ||  +/+  |  -/+ |  +/-  |
00253 # -------+----------------------+
00254 #    M  ||   R   |  G;D |   G;I |
00255 # ------------------------------+
00256 #    D  ||  G;M  |   R  |   G;I |
00257 # ------------------------------+   
00258 #    I  ||  G;M  |  G;D |    R  |
00259 #
00260 
00261 =head2 generate_cigar_string
00262 
00263   Name : generate_cigar_string
00264   Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
00265   Function: generate the cigar string for a piece of alignment.
00266   Args:     2 array references. The lengths of 2 arrays are the same
00267   Return:   a cigar string
00268 
00269 =cut
00270 
00271 # Developer's Note: The method is originally abstracted from the concept of 
00272 # cigar string. It only asks the essential information of 2 sequence characters
00273 # of the alignment, while the BlastWorn::split_HSP asks more unused information
00274 # for cigar string, which is useful to form align_coordinates. - Juguang
00275 
00276 my ($count, $state); # strictly only used in the following 2 subs
00277 
00278 sub generate_cigar_string {
00279 
00280 #   my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
00281 
00282     my ($self, $qchars_ref, $hchars_ref) = @_;
00283     
00284     my @qchars = @{$qchars_ref};
00285     my @hchars = @{$hchars_ref};
00286 
00287     unless(scalar(@qchars) == scalar(@hchars)){
00288         $self->throw("two sequences are not equal in lengths");
00289     }
00290     
00291     $count = 0;
00292     $state = 'M';    # the current state of gaps, (M, D, I) 
00293    
00294     my $cigar_string = '';
00295     for(my $i=0; $i <= $#qchars; $i++){
00296         my $qchar = $qchars[$i];
00297         my $hchar = $hchars[$i];
00298         if($qchar ne '-' && $hchar ne '-'){ # Match
00299             $cigar_string .= $self->_sub_cigar_string('M');
00300         }elsif($qchar eq '-'){ # Deletion
00301             $cigar_string .= $self->_sub_cigar_string('D');
00302         }elsif($hchar eq '-'){ # Insertion
00303             $cigar_string .= $self->_sub_cigar_string('I');
00304         }else{
00305             $self->throw("Impossible state that 2 gaps on each seq aligned");
00306         }
00307     }
00308     $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
00309     return $cigar_string;
00310 }
00311 
00312 sub _sub_cigar_string {
00313     my ($self, $new_state) = @_;
00314     my $sub_cigar_string = '';
00315     if($state eq $new_state){
00316         $count++; # Remain the state and increase the counter
00317     }else{
00318         $sub_cigar_string .= $count unless $count == 1;
00319         $sub_cigar_string .= $state;
00320         $count = 1;
00321         $state = $new_state;
00322     }
00323     return $sub_cigar_string;
00324 }
00325 
00326 =head2 generate_cigar_string_by_hsp
00327   
00328   Name :    generate_cigar_string_by_hsp
00329   Usage :   my $hsp; # a ready GenericHSP object
00330 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
00331   Function: generate a cigar string by given HSP object.
00332   Args :    a GenericHSP object
00333   Returns:  a text string of cigar string
00334 
00335 =cut
00336 
00337 sub generate_cigar_string_by_hsp {
00338     my ($self, $hsp) = @_;
00339 
00340     unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
00341         $self->throw("A GenericHSP object needed");
00342     }
00343 
00344     my @qchars = split(//, $hsp->query_string);
00345     my @hchars = split(//, $hsp->hit_string);
00346 
00347     return $self->generate_cigar_string(\@qchars, \@hchars);
00348 }
00349 
00350 1;