Archive Ensembl HomeArchive Ensembl Home
FormatTree.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009     http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =cut
00020 
00021 package Bio::EnsEMBL::Compara::FormatTree;
00022 
00023 use strict;
00024 use warnings;
00025 use Data::Dumper;
00026 use Carp;
00027 
00028 # Grammar to parse $fmt
00029 sub _tokenize {
00030   my ($self) = @_;
00031   eval { require Parse::RecDescent };
00032   if ($@) {
00033     die 'You need Parse::RecDescent installed to role-your-own tree format\n'
00034   }
00035   my $parser = Parse::RecDescent->new(q(
00036                                    {my @tokens; push @tokens,{}}
00037                                     hyphen      : "-"
00038                                     has_parent  : "^"
00039                                     { $tokens[-1]->{has_parent} = 1 }
00040                                     or          : "|"
00041                                     character   : /[\w\.\s:\|]+/i
00042                                     literal     : character
00043                                         { $tokens[-1]->{literal} = $item{character} }
00044                                     Letter_code : "n" | "c" | "d" | "t" | "l" | "h" | "s" | "p" | "m" | "g" | "i" | "e" | "o" | "x" | "S" | "N" | "P"
00045                                     preliteral  : character
00046                                         { $tokens[-1]->{ preliteral } = $item{character} }
00047                                     postliteral : character
00048                                        { $tokens[-1]->{ postliteral } = $item{character} }
00049                                     In_leaf     : hyphen Letter_code
00050                                       {
00051                                          $tokens[-1]->{main} = $item{Letter_code}
00052                                       }
00053                                     In_internal : Letter_code hyphen
00054                                       {
00055                                          $tokens[-1]->{main} = $item{Letter_code}
00056                                       }
00057                                     Format      : /^/ Entry(s) /$/ {\@tokens}
00058                                     Entry       : (Token_or_literal)(s)
00059                                     Token_or_literal : Token
00060                                     Token_or_literal : literal
00061                                       {
00062                                          push @tokens, {}
00063                                       }
00064 
00065                                     Token       : "%{" (has_parent)(?) ('"' <skip: ''> preliteral '"')(?) Condition ('"' <skip: ''> postliteral '"')(?) "}"
00066                                       {
00067                                          push @tokens, {}
00068                                       }
00069 
00070                                     Condition   : Code ( or Letter_code )(s?)
00071                                       {
00072                                         if (scalar @{$item[2]}) {
00073                                             $tokens[-1]->{alternatives} = join "", @{$item[2]}
00074                                         }
00075                                       }
00076                                     Code        : In_leaf
00077                                       {
00078                                          $tokens[-1]->{place} = "Leaf";
00079                                       }
00080                                     Code        : In_internal
00081                                       {
00082                                          $tokens[-1]->{place} = "Internal";
00083                                       }
00084                                     Code        : Letter_code
00085                                       {
00086                                          $tokens[-1]->{place} = "Both";
00087                                          $tokens[-1]->{main} = $item{Letter_code}
00088                                       }
00089 ));
00090   my $fmt = $self->{fmt};
00091   my $tokens = $parser->Format($fmt);
00092   croak "Format $fmt is not valid\n" unless (defined $tokens);
00093   my @tokens = grep {scalar keys %{$_} > 0} @$tokens;    ## Hacky... but shouldn't be needed anymore (just a pop)
00094   $self->{tokens} = [@tokens];
00095 }
00096 
00097 ## Callbacks
00098 
00099 my %callbacks = ();
00100 
00101 ## maybe we can use AUTOLOAD to populate most of these?
00102 
00103 my $name_cb = sub {
00104   my ($self) = @_;
00105   return sprintf ("%s",$self->{tree}->name || '');
00106 };
00107 
00108 my $distance_to_parent_cb = sub {
00109   my ($self) = @_;
00110   my $dtp = $self->{tree}->distance_to_parent();
00111   if ($dtp =~ /^\d+\.\d+$/) {
00112     return sprintf ("%1.4f", $self->{tree}->distance_to_parent);
00113   } else {
00114     return sprintf ("%d", $self->{tree}->distance_to_parent);
00115   }
00116 };
00117 
00118 my $genbank_common_name = sub {
00119   my ($self) = @_;
00120   my $common = uc($self->{tree}->get_tagvalue('genbank common name'));
00121   $common =~ s/\,//g;
00122   $common =~ s/\ /\./g;
00123   $common =~ s/\'//g;
00124   return $common || undef;
00125 };
00126 
00127 my $ensembl_common_name = sub {
00128   my ($self) = @_;
00129   my $common = uc($self->{tree}->get_tagvalue('ensembl common name'));
00130   $common =~ s/\,//g;
00131   $common =~ s/\ /\./g;
00132   $common =~ s/\'//g;
00133   return $common;
00134 };
00135 
00136 my $ensembl_timetree_mya_cb = sub {
00137   my ($self) = @_;
00138   my $str = sprintf ("%s", $self->{tree}->get_tagvalue('ensembl timetree mya'));
00139   return $str eq "" ? undef : $str;
00140 };
00141 
00142 my $gdb_id_cb = sub {
00143   my ($self) = @_;
00144   my $gdb_id;
00145   eval {
00146     $gdb_id = $self->{tree}->adaptor->db->get_GenomeDBAdaptor->fetch_by_taxon_id($self->{tree}->taxon_id)->dbID;
00147   };
00148   return $gdb_id;
00149 };
00150 
00151 my $node_id_cb = sub {  ## only if we are in a leaf? ... if ($self->{tree}->is_leaf);
00152   my ($self) = @_;
00153   return sprintf("%s", $self->{tree}->node_id);
00154 };
00155 
00156 my $label_cb = sub { ## only if we are in a leaf? ... if ($self->{tree}->is_leaf);
00157   my ($self) = @_;
00158   my $display_label = $self->{tree}->gene_member->display_label;
00159   return $display_label;
00160 };
00161 
00162 my $sp_short_name_cb = sub {
00163   my ($self) = @_;
00164   my $sp;
00165   eval {
00166     $sp = $self->{tree}->genome_db->short_name
00167   };
00168   return $sp;
00169 };
00170 
00171 my $stable_id_cb = sub {  ## only if we are in a leaf?
00172   my ($self) = @_;
00173   return $self->{tree}->gene_member->stable_id;
00174 };
00175 
00176 my $prot_id_cb = sub {
00177   my ($self) = @_;
00178   my $prot_member;
00179   eval {$prot_member = $self->{tree}->get_canonical_peptide_Member->stable_id};
00180   return $prot_member;
00181 };
00182 
00183 my $member_id_cb = sub {
00184   my ($self) = @_;
00185   return sprintf ("%s",$self->{tree}->member_id);
00186 };
00187 
00188 my $taxon_id_cb = sub {
00189   my ($self) = @_;
00190   my $taxon_id;
00191   eval { $taxon_id = $self->{tree}->taxon_id };
00192   return $taxon_id;
00193 #  return sprintf ("%s", $self->{tree}->taxon_id);
00194 };
00195 
00196 my $sp_name_cb = sub {
00197   my ($self) = @_;
00198   my $species_name;
00199   if ($self->{tree}->isa('Bio::EnsEMBL::Compara::GeneTreeMember')) {
00200       $species_name = $self->{tree}->genome_db->name;
00201       $species_name =~ s/\ /\_/g;
00202       return $species_name;
00203   } elsif ($self->{tree}->isa('Bio::EnsEMBL::Compara::CAFETreeNode')){
00204       my $taxon_id = $self->{tree}->taxon_id();
00205       my $genome_db_adaptor = $self->{tree}->adaptor->db->get_GenomeDBAdaptor;
00206       my $genome_db;
00207       eval {
00208           $genome_db = $genome_db_adaptor->fetch_by_taxon_id($taxon_id);
00209       };
00210       if ($@) {
00211           return $taxon_id;
00212       }
00213       return $genome_db->name();
00214   }
00215   return undef;
00216 };
00217 
00218 my $n_members_cb = sub {
00219     my ($self) = @_;
00220     my $n_members;
00221     if ($self->{tree}->isa('Bio::EnsEMBL::Compara::CAFETreeNode')) {
00222         return $self->{tree}->n_members();
00223     }
00224     return undef;
00225 };
00226 
00227 my $pvalue_cb = sub {
00228     my ($self) = @_;
00229     my $pval;
00230     if ($self->{tree}->isa('Bio::EnsEMBL::Compara::CAFETreeNode')) {
00231         return $self->{tree}->p_value();
00232     }
00233     return undef;
00234 };
00235 
00236 %callbacks = (
00237           'n' => $name_cb,
00238           'c' => $genbank_common_name,
00239           'e' => $ensembl_common_name,
00240           'd' => $distance_to_parent_cb,
00241           't' => $ensembl_timetree_mya_cb,
00242           'g' => $gdb_id_cb,
00243           'o' => $node_id_cb,
00244           'l' => $label_cb,
00245           's' => $sp_short_name_cb,
00246           'i' => $stable_id_cb,
00247           'p' => $prot_id_cb,
00248           'm' => $member_id_cb,
00249           'x' => $taxon_id_cb,
00250           'S' => $sp_name_cb,
00251           'N' => $n_members_cb, # Used in cafe trees (number of members)
00252           'P' => $pvalue_cb, # Used in cafe trees (pvalue)
00253 #         'E' =>  ## Implement the "Empty" option
00254          );
00255 
00256 
00257 # Maybe leaves and internal nodes should be formatted different?
00258 my %cache;
00259 sub new {
00260   my ($class,$fmt) = @_;
00261   $fmt = "%{n}" unless (defined $fmt); # "full" by default
00262   if (defined $cache{$fmt}) {
00263     return $cache{$fmt};
00264   }
00265   my $obj = bless ({
00266             'fmt' => $fmt,
00267             'tokens' => [],
00268             'callbacks' => {%callbacks},
00269            }, $class);
00270   eval {
00271     $obj->_tokenize();
00272   };
00273   if ($@) {
00274     die $@ if ($@ =~ /Parse::RecDescent/);
00275     die "Bad format : $fmt\n";
00276   }
00277   $cache{$fmt} = $obj;
00278   return $obj;
00279 }
00280 
00281 sub format_newick {
00282   my ($self, $tree) = @_;
00283   return $self->_internal_format_newick($tree);
00284 }
00285 
00286 sub _internal_format_newick {
00287   my ($self, $tree) = @_;
00288 
00289   my $newick = "";
00290   if ($tree->get_child_count()>0) {
00291     $newick .= "(";
00292     my $first_child = 1;
00293     for my $child (@{$tree->sorted_children}) {
00294       $newick .= "," unless ($first_child);
00295       $newick .= $self->_internal_format_newick($child);
00296       $first_child = 0;
00297     }
00298     $newick .= ")";
00299   }
00300 
00301   my $header = "";
00302   $self->{tree} = $tree;
00303   for my $token (@{$self->{tokens}}) {
00304     if (defined $token->{literal}) {
00305       $header .= $token->{literal}
00306     } elsif (($token->{place} eq "Leaf") && ($tree->is_leaf) ||
00307          ($token->{place} eq "Internal") && (! $tree->is_leaf) ||
00308          ($token->{place} eq "Both")) {
00309       next if (defined $token->{has_parent} && $token->{has_parent} == 1 && !$tree->parent);
00310       for my $item (split //,$token->{main}.$token->{alternatives}x!!$token->{alternatives}) {  ## For "main" and "alternatives"
00311     my $itemstr = $self->{callbacks}{$item}->($self);
00312 #   print STDERR "ITEMSTR:$itemstr\n";exit;
00313     if (defined $itemstr) {
00314       $header .= $token->{preliteral}x!!$token->{preliteral}.$itemstr.$token->{postliteral}x!!$token->{postliteral};
00315       last;
00316     }
00317       }
00318     }
00319   }
00320 #  $header .= ":".$self->{callbacks}{d}->($self);
00321   return $newick.$header;
00322 }
00323 
00324 
00325 1;
00326 
00327 ### NEED TO BE UPDATED
00328 
00329 # ++ A "format" is a regular string containing string literals and "tokens". Tokens are:
00330 # %{n} --> then "name" of the node ($self->name)
00331 # %{c} --> the common name ($self->get_tagvalue('genbank common name'))
00332 # %{d} --> gdb_id ($self->adaptor->db->get_GenomeDBAdaptor->fetch_by_taxon_id($self->taxon_id)->dbID)
00333 # %{t} --> timetree ($self->get_tagvalue('ensembl timetree mya')
00334 # %{l} --> display_label ($self->gene_member->display_label)
00335 # %{h} --> genome short name ($self->genome_db->short_name)
00336 # %{s} --> stable_id ($self->gene_member->stable_id)
00337 # %{p} --> peptide Member ($self->get_canonical_peptide_Member->stable_id)
00338 # %{t} --> taxon_id ($self->taxon_id)
00339 # %{m} --> member_id ($self->member_id)
00340 # %{g} --> genome_db name ($self->genome_db->name)
00341 # %{i} --> node_id ($self->node_id)
00342 # %{e} --> nothing (useful to include only regular characters, see below)
00343 
00344 # ++ These "tokens" can be modified using the following rules:
00345 
00346 # + Apply tokens only to leaves or internal nodes:
00347 # %{n}  --> The token applies to nodes and leaves
00348 # %{-n} --> The token applies only to leaves
00349 # %{n-} --> The token applies only to internal nodes
00350 
00351 # + Tokens can be applied conditionally:
00352 # %{p:n} --> Give the "peptide_member_stable_id" or (if it is undefined), the name
00353 # %{-p:n} --> Same as below, but only for leaves
00354 # %{n:-p} --> Give the name, but for leaves give the peptide_id.
00355 
00356 # + string literals can be inserted outside or inside tokens (the meaning is slightly different):
00357 # _%{n} --> Put an underscore and the name.
00358 # %{_n} --> Put an underscore and the name only if name is defined.
00359 # %{_-e} --> Put an underscore only if you are in a leaf.
00360 
00361 # + hyphens and closing brackets are not allowed (we can define a way to scape them).
00362 
00363 # ++ Equivalences with existing formats:
00364 
00365 # "full" --> ‘%{n}’
00366 
00367 # "full_common" --> '%{n}%{ -c}%{.-d}{_t-}' -- Reads: Print the name of the node, if you are in a leaf and its "common_name" is defined print a space and the "common_name". Then, if you are in a leaf and the "gdb_id" is defined, print a dot and the "gdb_id", print an underscore and the "ensembl timetree mya".
00368 
00369 # "int_node_id" --> '%{-n}%{-I}'
00370 
00371 # display_label_composite --> '%{-l_}%{n}%{_-e}%{-h}' -- Reads: If you are in a leaf and the "display label" is defined, print it followed by an underscore. Print the "name". If you are in a leaf node, print an underscore and the "genome short name".
00372 
00373 # "gene_stable_id_composite" --> '%{-s_}%{n}{_e}%{-h}'
00374 
00375 # "full_web" --> '%{n:-p}%{_-e}%{-h}%{_-e}%{-l} -- Reads: Print the name or the stable id of the peptide member if it you are in a leaf and it is defined. Then print an underscore if you are in a leaf. Print the genome short name if you are in a leaf. Print an underscore if you are in a leaf. Print the display label if you are in a leaf.
00376 
00377 # Etc...
00378 
00379 
00380 # M;
00381 
00382 # PS: The specification of the format in EBNF could be something like this:
00383 
00384 # Format = “'” { string_literal | Token } “'” .
00385 # Token = “%{“ [string_literal] Code [ “:” Code ] [string_literal] “}” .
00386 # Code = [ “-”  ] Letter_code [ “-” ]
00387 # Letter_code = [ “n“ | “c” | “d” | “t” | “l” | “h” | “s” | “p” | “t” | “m” | “g” | “i” | “e” ] .