Archive Ensembl HomeArchive Ensembl Home
NCBITaxon.pm
Go to the documentation of this file.
00001 =head1 NAME
00002 
00003 NCBITaxon - DESCRIPTION of Object
00004 
00005 =head1 DESCRIPTION
00006   
00007   An object that hold a node within a taxonomic tree.  Inherits from NestedSet.
00008 
00009   From Bio::Species
00010    classification
00011    common_name
00012    binomial
00013 
00014   Here are also the additional methods in Bio::Species that "might" be useful, but let us
00015   forget about these for now.
00016    genus
00017    species
00018    sub_species
00019    variant
00020    organelle
00021    division
00022 
00023 =head1 CONTACT
00024 
00025   Contact Jessica Severin on implemetation/design detail: jessica@ebi.ac.uk
00026   Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
00027 
00028 =head1 APPENDIX
00029 
00030 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
00031 
00032 =cut
00033 
00034 package Bio::EnsEMBL::Compara::NCBITaxon;
00035 
00036 use strict;
00037 use Bio::Species;
00038 use Bio::EnsEMBL::Compara::NestedSet;
00039 use Bio::EnsEMBL::Utils::Exception;
00040 use Bio::EnsEMBL::Utils::Argument;
00041 
00042 our @ISA = qw(Bio::EnsEMBL::Compara::NestedSet);
00043 
00044 =head2 copy
00045 
00046   Arg [1]    : int $member_id (optional)
00047   Example    :
00048   Description: returns copy of object, calling superclass copy method
00049   Returntype :
00050   Exceptions :
00051   Caller     :
00052 
00053 =cut
00054 
00055 sub copy {
00056   my $self = shift;
00057   
00058   my $mycopy = $self->SUPER::copy;
00059   bless $mycopy, "Bio::EnsEMBL::Compara::NCBITaxon";
00060   
00061   $mycopy->ncbi_taxid($self->ncbi_taxid);
00062   $mycopy->rank($self->rank);
00063   $mycopy->genbank_hidden_flag($self->genbank_hidden_flag);
00064 
00065   return $mycopy;
00066 }
00067 
00068 
00069 sub ncbi_taxid {
00070   my $self = shift;
00071   my $value = shift;
00072   $self->node_id($value) if($value); 
00073   return $self->node_id;
00074 }
00075 
00076 sub taxon_id {
00077   my $self = shift;
00078   my $value = shift;
00079   $self->node_id($value) if($value); 
00080   return $self->node_id;
00081 }
00082 
00083 sub dbID {
00084   my $self = shift;
00085   my $value = shift;
00086   $self->node_id($value) if($value); 
00087   return $self->node_id;
00088 }
00089 
00090 sub rank {
00091   my $self = shift;
00092   $self->{'_rank'} = shift if(@_);
00093   return $self->{'_rank'};
00094 }
00095 
00096 sub genbank_hidden_flag {
00097   my $self = shift;
00098   $self->{'_genbank_hidden_flag'} = shift if(@_);
00099   return $self->{'_genbank_hidden_flag'};
00100 }
00101 
00102 =head2 classification
00103 
00104   Arg[SEPARATOR]  : String (optional); used to separate the classification by
00105                     when returning as a string. If not specified then a single
00106                     space will be used.
00107   Arg[FULL]       : Boolean (optional); indicates we want all nodes including
00108                     those which Genbank sets as hidden
00109   Arg[AS_ARRAY]   : Boolean (optional); says the return type will be an 
00110                     ArrayRef of all nodes in the classification as instances
00111                     of NCBITaxon.
00112   Example         : my $classification_string = $node->classification();
00113   Description     : Returns the String representation of a taxon node's 
00114                     classification or the objects which constitute it (
00115                     including the current node). The String return when
00116                     split is compatible with BioPerl's Species classification
00117                     code and will return a data structure compatible with
00118                     that found in core species MetaContainers.
00119                     
00120                     This code is a redevelopment of existing code which
00121                     descended down the taxonomy which had disadvanatages 
00122                     when a classification was requested on nodes causing
00123                     the taxonomy to bi/multi-furcate.
00124                     
00125                     Note the String representation does have some disadvantages
00126                     when working with the poorer end of the taxonomy where
00127                     species nodes are not well defined. For these situations
00128                     you are better using the array representation and 
00129                     capturing the required information from the nodes.
00130                     
00131                     Also to maintain the original functionality of the method
00132                     we filter any species, subspecies or subgenus nodes above
00133                     the current node. For the true classification always
00134                     call using the array structure.
00135                     
00136                     Recalling this subroutine with the same parameters for
00137                     separators will return a cached representation. Calling
00138                     for AS_ARRAY will cause the classificaiton to be 
00139                     recalculated each time.
00140   Returntype      : String if not asking for an array otherwise the array
00141   Exceptions      : - 
00142   Caller          : Public
00143 
00144 =cut
00145  
00146 sub classification {
00147   my ($self, @args) = @_;
00148   my ($separator, $full, $as_array) = rearrange([qw( SEPARATOR FULL AS_ARRAY )], @args);
00149 
00150   #setup defaults
00151   $separator = ' ' unless(defined $separator);
00152   $full = 0 unless (defined $full);
00153   
00154   if(!$as_array) {
00155     #Reset the separators & classifications if we already had one & it 
00156     #differed from the input
00157     if(defined $self->{_separator} && $self->{_separator} ne $separator) {
00158       $self->{_separator} = undef;
00159       $self->{_classification} = undef;
00160     }
00161     if(defined $self->{_separator_full} && $self->{_separator_full} ne $separator) {
00162       $self->{_separator_full} = undef;
00163       $self->{_classification_full} = undef;
00164     }
00165     
00166     $self->{_separator} = $separator unless (defined $self->{_separator});
00167     $self->{_separator_full} = $separator unless (defined $self->{_separator_full});
00168     
00169     return $self->{_classification_full} if ($full && defined $self->{_classification_full});
00170     return $self->{_classification} if (!$full && defined $self->{_classification});
00171   }  
00172 
00173   my $node = $self;
00174   my @classification;
00175   while( $node->name() ne 'root' ) {
00176     my $subgenus = $node->rank() eq 'subgenus';
00177     if($full) {
00178       push(@classification, $node);
00179     }
00180     else {
00181       unless($node->genbank_hidden_flag() || $subgenus) {
00182         push(@classification, $node);
00183       }
00184     }
00185     
00186     $node = $node->parent();
00187   }
00188   
00189   if($as_array) {
00190     return \@classification;
00191   }
00192 
00193   #Once we have a normal array we can do top-down as before to replicate 
00194   #the original functionality
00195   my $text_classification = $self->_to_text_classification(\@classification);
00196   
00197   if ($full) {
00198     $self->{_classification_full} = join($separator, @{$text_classification});
00199     $self->{_separator_full} = $separator;
00200     return $self->{_classification_full};
00201   } else {
00202     $self->{_classification} = join($separator, @{$text_classification});
00203     $self->{_separator} = $separator;
00204     return $self->{_classification};
00205   }
00206 }
00207 
00208 =head2 _to_text_classification
00209 
00210   Arg[1]          : ArrayRef of the classification array to be converted to 
00211                     the text classification 
00212   Example         : my $array = $node->_to_text_classification(\@classification);
00213   Description     : Returns the Array representation of a taxon node's 
00214                     classification or the objects which constitute it (
00215                     including the current node) as the species names or split
00216                     according to the rules for working with BioPerl.
00217   Returntype      : ArrayRef of Strings
00218   Exceptions      : - 
00219   Caller          : Private
00220 
00221 =cut
00222 
00223 sub _to_text_classification {
00224   my ($self, $classification) = @_;
00225   my @text_classification;
00226   my $first = 1;
00227   for my $node ( @{$classification}) {
00228     my $subgenus = $node->rank() eq 'subgenus';
00229     my $species = $node->rank() eq 'species';
00230     my $subspecies = $node->rank() eq 'subspecies';
00231     
00232     if($first) {
00233       if($species || $subspecies) {
00234         my ($genus, $species, $subspecies) = split(q{ }, $node->binomial());
00235         unshift @text_classification, $species;
00236         unshift @text_classification, $subspecies if (defined $subspecies);
00237       }
00238       $first = 0;
00239       next;
00240     }
00241     
00242     next if $subgenus || $species || $subspecies;
00243     push(@text_classification, $node->name());
00244   }
00245   return \@text_classification;
00246 }
00247 
00248 =head2 subspecies
00249 
00250   Example    : $ncbi->subspecies;
00251   Description: Returns the subspeceis name for this species
00252   Example    : "verus" for Pan troglodytes verus
00253   Returntype : string
00254   Exceptions :
00255   Caller     : general
00256 
00257 =cut
00258 
00259 sub subspecies {
00260   my $self = shift;
00261 
00262   unless (defined $self->{'_species'}) {
00263     my ($genus, $species, $subspecies) = split(" ", $self->binomial);
00264     $self->{'_species'} = $species;
00265     $self->{'_genus'} = $genus;
00266     $self->{'_subspecies'} = $subspecies;
00267   }
00268 
00269   return $self->{'_species'};
00270 }
00271 
00272 
00273 =head2 species
00274 
00275   Example    : $ncbi->species;
00276   Description: Returns the speceis name for this species
00277   Example    : "sapiens" for Homo sapiens
00278   Returntype : string
00279   Exceptions :
00280   Caller     : general
00281 
00282 =cut
00283 
00284 sub species {
00285   my $self = shift;
00286 
00287   unless (defined $self->{'_species'}) {
00288     my ($genus, $species, $subspecies) = split(" ", $self->binomial);
00289     $self->{'_species'} = $species;
00290     $self->{'_genus'} = $genus;
00291     $self->{'_subspecies'} = $subspecies;
00292   }
00293 
00294   return $self->{'_species'};
00295 }
00296 
00297 
00298 =head2 genus
00299 
00300   Example    : $ncbi->genus;
00301   Description: Returns the genus name for this species
00302   Returntype : string
00303   Example    : "Homo" for Homo sapiens
00304   Exceptions :
00305   Caller     : general
00306 
00307 =cut
00308 
00309 sub genus {
00310   my $self = shift;
00311 
00312   unless (defined $self->{'_genus'}) {
00313     my ($genus, $species, $subspecies) = split(" ", $self->binomial);
00314     $self->{'_species'} = $species;
00315     $self->{'_genus'} = $genus;
00316     $self->{'_subspecies'} = $subspecies;
00317   }
00318 
00319   return $self->{'_genus'};
00320 }
00321 
00322 =head2 common_name
00323 
00324   Example    : $ncbi->common_name;
00325   Description: The comon name as defined by Genbank
00326   Returntype : string
00327   Exceptions : returns undef if no genbank common name exists.
00328   Caller     : general
00329 
00330 =cut
00331 
00332 sub common_name {
00333   my $self = shift;
00334   if ($self->has_tag('genbank common name') && $self->rank eq 'species') {
00335     return $self->get_tagvalue('genbank common name');
00336   } else {
00337     return undef;
00338   }
00339 }
00340 
00341 =head2 ensembl_alias_name
00342 
00343   Example    : $ncbi->ensembl_alias_name;
00344   Description: The comon name as defined by ensembl alias
00345   Returntype : string
00346   Exceptions : returns undef if no ensembl alias name exists.
00347   Caller     : general
00348 
00349 =cut
00350 
00351 sub ensembl_alias_name {
00352   my $self = shift;
00353 
00354   #Not checking for rank as we do above, because we do not get dog since the
00355   #rank for dog is subspecies (ensembl-51).
00356   if ($self->has_tag('ensembl alias name')) {
00357     return $self->get_tagvalue('ensembl alias name');
00358   } else {
00359     return undef;
00360   }
00361 }
00362 
00363 
00364 =head2 binomial
00365 
00366   Example    : $ncbi->binomial;
00367   Description: The binomial name (AKA the scientific name) of this genome
00368   Returntype : string
00369   Exceptions : warns when node is not a species or has no scientific name
00370   Caller     : general
00371 
00372 =cut
00373 
00374 sub binomial {
00375   my $self = shift;
00376   if ($self->has_tag('scientific name') && ($self->rank eq 'species' || $self->rank eq 'subspecies')) {
00377     return $self->get_tagvalue('scientific name');
00378   } else {
00379     warning("taxon_id=",$self->node_id," is not a species or subspecies. So binomial is undef\n");
00380     return undef;
00381   }
00382 }
00383 
00384 =head2 ensembl_alias
00385 
00386   Example    : $ncbi->ensembl_alias;
00387   Description: The ensembl_alias name (AKA the name in the ensembl website) of this genome
00388   Returntype : string
00389   Exceptions : warns when node is not a species or has no ensembl_alias
00390   Caller     : general
00391 
00392 =cut
00393 
00394 sub ensembl_alias {
00395   my $self = shift;
00396   if ($self->has_tag('ensembl alias name')) {
00397     return $self->get_tagvalue('ensembl alias name');
00398   } else {
00399     warning("taxon_id=",$self->node_id," is not a species or subspecies. So ensembl_alias is undef\n");
00400     return undef;
00401   }
00402 }
00403 
00404 
00405 =head2 short_name
00406 
00407   Example    : $ncbi->short_name;
00408   Description: The name of this genome in the Gspe ('G'enera
00409                'spe'cies) format.
00410   Returntype : string
00411   Exceptions : none
00412   Caller     : general
00413 
00414 =cut
00415 
00416 sub short_name {
00417   my $self = shift;
00418   my $name = $self->name;
00419   $name =~  s/(\S)\S+\s(\S{3})\S+/$1$2/;
00420   $name =~ s/\ //g;
00421   return $name;
00422 }
00423 
00424 sub get_short_name {
00425   my $self = shift;
00426   return $self->short_name;
00427 }
00428 
00429 
00430 sub RAP_species_format {
00431   my $self = shift;
00432   my $newick = "";
00433   
00434   if($self->get_child_count() > 0) {
00435     $newick .= "(";
00436     my $first_child=1;
00437     foreach my $child (@{$self->sorted_children}) {  
00438       $newick .= "," unless($first_child);
00439       $newick .= $child->newick_format;
00440       $first_child = 0;
00441     }
00442     $newick .= ")";
00443   }
00444   
00445   $newick .= sprintf("\"%s\"", $self->name,);
00446   $newick .= sprintf(":%1.4f", $self->distance_to_parent) if($self->distance_to_parent > 0);
00447 
00448   if(!($self->has_parent)) {
00449     $newick .= ";";
00450   }
00451   return $newick;
00452 }
00453 
00454 
00455 sub print_node {
00456   my $self  = shift;
00457   printf("(%s", $self->node_id);
00458   printf(" %s", $self->rank) if($self->rank);
00459   print(")");
00460   printf("%s", $self->name) if($self->name);
00461   print("\n");
00462 }
00463 
00464 1;