Archive Ensembl HomeArchive Ensembl Home
OrthoXMLWriter.pm
Go to the documentation of this file.
00001 package Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter;
00002 
00003 =pod
00004 
00005 =head1 NAME
00006 
00007 Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter
00008 
00009 =head1 SYNOPSIS
00010 
00011   use Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter;
00012   
00013   my $string_handle = IO::String->new();
00014   my $w = Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter->new(
00015     -SOURCE => 'Ensembl', -SOURCE_VERSION => 63, -HANDLE => $string_handle
00016   );
00017   
00018   my $pt = $dba->get_ProteinTreeAdaptor()->fetch_node_by_node_id(2);
00019   
00020   $w->write_trees($pt);
00021   $w->finish(); #YOU MUST CALL THIS TO WRITE THE FINAL TAG
00022   
00023   my $xml_scalar_ref = $string_handle->string_ref();
00024   
00025   #Or to write to a file via IO::File
00026   my $file_handle = IO::File->new('output.xml', 'w');
00027   $w = Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter->new(
00028     -SOURCE => 'Ensembl', -SOURCE_VERSION => 63, -HANDLE => $file_handle
00029   );
00030   $w->write_trees($pt);
00031   $w->finish(); #YOU MUST CALL THIS TO WRITE THE FINAL TAG
00032   $file_handle->close();
00033   
00034   #Or letting this deal with it
00035   $w = Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter->new(
00036     -SOURCE => 'Ensembl', -SOURCE_VERSION => 63, -FILE => 'loc.xml'
00037   );
00038   $w->write_trees($pt);
00039   $w->finish(); #YOU MUST CALL THIS TO WRITE THE FINAL TAG
00040   $w->handle()->close();
00041 
00042 =head1 DESCRIPTION
00043 
00044 Used as a way of emitting Compara GeneTrees in a format which conforms
00045 to L<OrthoXML|http://www.orthoxml.org/>. The code is built to work with
00046 instances of L<Bio::EnsEMBL::Compara::GeneTreeNode>.
00047 
00048 The same document is persistent between write_trees() calls so to create
00049 a new XML document create a new instance of this object.
00050 
00051 =head1 SUBROUTINES/METHODS
00052 
00053 See inline
00054 
00055 =head1 MAINTAINER
00056 
00057 $Author: mm14 $
00058 
00059 =head VERSION
00060 
00061 $Revision: 1.10 $
00062 
00063 =head1 LICENSE
00064 
00065  Copyright (c) 1999-2012 The European Bioinformatics Institute and
00066  Genome Research Limited.  All rights reserved.
00067 
00068  This software is distributed under a modified Apache license.
00069  For license details, please see
00070 
00071    http://www.ensembl.org/info/about/code_licence.html
00072 
00073 =head1 CONTACT
00074 
00075  Please email comments or questions to the public Ensembl
00076  developers list at <dev@ensembl.org>.
00077 
00078  Questions may also be sent to the Ensembl help desk at
00079  <helpdesk@ensembl.org>.
00080 
00081 =cut
00082 
00083 use strict;
00084 use warnings;
00085 
00086 use base qw(Bio::EnsEMBL::Compara::Graph::BaseXMLWriter);
00087 
00088 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
00089 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
00090 use Bio::EnsEMBL::Utils::Scalar qw(check_ref wrap_array);
00091 
00092 my $ortho_uri = 'http://orthoXML.org';
00093 
00094 =pod
00095 
00096 =head2 new()
00097 
00098   Arg[SOURCE]           : String; the source of the dumped data.
00099                           Defaults to B<Unknown>.
00100   Arg[SOURCE_VERSION]   : String; the version source of the dumped data.
00101                           Defaults to B<Unknown>.
00102   Arg[HANDLE]           : IO::Handle; pass in an instance of IO::File or
00103                           an instance of IO::String so long as it behaves
00104                           the same as IO::Handle. Can be left blank in 
00105                           favour of the -FILE parameter
00106   Arg[FILE]             : Scalar; file to write to              
00107   Arg[NO_RELEASE_TREES] : Boolean; if set to true this will force the writer
00108                           to avoid calling C<release_tree()> on every tree
00109                           given. Defaults to false
00110   Arg[POSSIBLE_ORTHOLOGS] : Boolean: if set to true, duplication nodes with a
00111                             score < 0.25 are considered as speciations
00112   Description : Creates a new tree writer object. 
00113   Returntype  : Instance of the writer
00114   Exceptions  : None
00115   Example     : my $w = Bio::EnsEMBL::Compara::Graph::OrthoXMLWriter->new(
00116                   -SOURCE => 'Ensembl',  -HANDLE => $handle
00117                 );
00118   Status      : Stable  
00119   
00120 =cut
00121 
00122 sub new {
00123   my ($class, @args) = @_;
00124   $class = ref($class) || $class;
00125   my $self = $class->SUPER::new(@args);
00126   
00127   my ($source, $source_version, $no_release_trees, $possible_orthologs) = 
00128     rearrange([qw(source source_version no_release_trees possible_orthologs)], @args);
00129 
00130   $source ||= 'Unknown';
00131   $source_version ||= 'Unknown';
00132   $possible_orthologs = 0 unless defined $possible_orthologs;
00133 
00134   $self->source($source);
00135   $self->source_version($source_version);
00136   $self->no_release_trees($no_release_trees);
00137   $self->possible_orthologs($possible_orthologs);
00138 
00139   return $self;
00140 }
00141 
00142 =pod
00143 
00144 =head2 namespaces()
00145 
00146 Provides the namespaces used in this writer (the OrthoXML namespace)
00147 
00148 =cut
00149 
00150 sub namespaces {
00151   my ($self) = @_;
00152   return {
00153     "$ortho_uri/2011/" => ''
00154   };
00155 }
00156 
00157 =pod
00158 
00159 =head2 no_release_trees()
00160 
00161   Arg [0] : Boolean; indicates if we need to avoid releasing trees
00162   Returntype : Boolean
00163   Exceptions : None
00164   Status     : Stable
00165  
00166 =cut
00167 
00168 sub no_release_trees {
00169   my ($self, $no_release_trees) = @_;
00170   $self->{no_release_trees} = $no_release_trees if defined $no_release_trees;
00171   return $self->{no_release_trees};
00172 }
00173 
00174 =pod
00175 
00176 =head2 source()
00177 
00178   Arg[0] : The value to set this to
00179   Description : Indicates the source of the stable identifiers for the 
00180                 peptides.
00181   Returntype : String
00182   Exceptions : None
00183   Status     : Stable
00184   
00185 =cut
00186 
00187 sub source {
00188   my ($self, $source) = @_;
00189   $self->{source} = $source if defined $source;
00190   return $self->{source};
00191 }
00192 
00193 =pod
00194 
00195 =head2 source_version()
00196 
00197   Arg[0] : The value to set this to
00198   Description : Indicates the version of the data
00199   Returntype : String
00200   Exceptions : None
00201   Status     : Stable
00202   
00203 =cut
00204 
00205 sub source_version {
00206   my ($self, $source_version) = @_;
00207   $self->{source_version} = $source_version if defined $source_version;
00208   return $self->{source_version};
00209 }
00210 
00211 =pod
00212 
00213 =head2 possible_orthologs()
00214 
00215   Arg [0] : Boolean; indicates if we want to treat not-supported duplications as speciations
00216   Returntype : Boolean
00217   Exceptions : None
00218   Status     : Stable
00219  
00220 =cut
00221 
00222 sub possible_orthologs {
00223   my ($self, $possible_orthologs) = @_;
00224   $self->{possible_orthologs} = $possible_orthologs if defined $possible_orthologs;
00225   return $self->{possible_orthologs};
00226 }
00227 
00228 
00229 =head2 write_trees()
00230 
00231   Arg[0]      : The tree to write. Can be a single Tree or an ArrayRef
00232   Description : Writes a tree into the backing document representation
00233   Returntype  : None
00234   Exceptions  : Possible if there is an issue with retrieving data from the tree
00235   instance
00236   Example     : $writer->write_trees($tree);
00237                 $writer->write_trees([$tree_one, $tree_two]);
00238   Status      : Stable  
00239   
00240 =cut
00241 
00242 sub write_trees {
00243   my ($self, $trees) = @_;
00244 
00245   $trees = wrap_array($trees);
00246 
00247   # Create a list of all members, grouped by species
00248   my $hash_members = {};
00249   my $list_species = [];
00250   foreach my $tree (@{$trees}) {
00251     $self->_get_members_list($tree, $list_species, $hash_members);
00252   }
00253 
00254   return $self->write_data(
00255     $list_species,
00256     sub {
00257       my ($species) = @_;
00258     return ${$hash_members}{$species->dbID};
00259     },
00260     $trees
00261   );
00262 }
00263 
00264 =pod
00265 
00266 =head2 write_data()
00267 
00268   Arg[0]      : List reference of all the species (must contain GenomeDB objects)
00269   Arg[1]      : A function that, given a GenomeDB, returns a list of all the
00270                 members used in the trees for this species
00271   Arg[2]      : List reference of all the trees
00272   Description : Generic method to write the content
00273   Returntype  : None
00274   Exceptions  : Possible if there is an issue with retrieving data from the tree
00275   instance
00276   Status      : Stable  
00277   
00278 =cut
00279 
00280 sub write_data {
00281   my ($self, $list_species, $callback_list_members, $list_trees) = @_;
00282   my $w = $self->_writer();
00283 
00284   # Prints each database
00285   foreach my $species (@$list_species) {
00286     # species should be a GenomeDB instance
00287 
00288     $w->startTag("species", "NCBITaxId" => $species->taxon_id, "name" => $species->name);
00289     $w->startTag("database", "name" => $self->source_version, "version" => sprintf("%s/%s", $species->assembly, $species->genebuild));
00290     $w->startTag("genes");
00291 
00292     foreach my $member (@{$callback_list_members->($species)}) {
00293     $w->emptyTag("gene", "id" => $member->member_id, "geneId" => $member->gene_member->stable_id, ($member->source_name eq "ENSEMBLPEP" ? "protId" : "transcriptId") => $member->stable_id);
00294     }
00295 
00296     $w->endTag("genes");
00297     $w->endTag("database");
00298     $w->endTag("species");
00299   }
00300 
00301   # Prints the score definition
00302   $w->startTag("scores");
00303   $w->emptyTag("scoreDef", "id" => "bootstrap", "desc" => "Reliability of the branch");
00304   $w->emptyTag("scoreDef", "id" => "duplication_confidence_score", "desc" => "Reliability of the duplication");
00305   $w->endTag("scores");
00306 
00307   # Prints each tree
00308   $w->startTag("groups");
00309   foreach my $tree (@{$list_trees}) {
00310     $self->_write_tree($tree);
00311     $tree->release_tree() if ! $self->no_release_trees;
00312   }
00313   $w->endTag("groups");
00314 
00315   return;
00316 }
00317 
00318 ########### PRIVATE
00319 
00320 sub _write_opening {
00321   my ($self, $w) = @_;
00322   my $xsi_uri = $self->xml_schema_namespace();
00323   $w->xmlDecl("UTF-8");
00324   $w->forceNSDecl("${ortho_uri}/2011/");
00325   $w->forceNSDecl($xsi_uri);
00326   $w->startTag("orthoXML", [$xsi_uri, 'schemaLocation'] => 
00327     "${ortho_uri} ${ortho_uri}/0.3/orthoxml.xsd",
00328     'version'=>'0.3',
00329     "origin" => $self->source,
00330     "originVersion" => $self->source_version,
00331   );
00332 
00333   return;
00334 }
00335 
00336 sub _write_closing {
00337   my ($self) = @_;
00338   $self->_writer()->endTag("orthoXML");
00339 }
00340 
00341 sub _get_members_list {
00342   my ($self, $tree, $list_species, $hash_members) = @_;
00343 
00344   foreach my $leaf (@{$tree->get_all_leaves}) {
00345     if (not defined ${$hash_members}{$leaf->genome_db_id}) {
00346       push @{$list_species}, $leaf->genome_db;
00347     ${$hash_members}{$leaf->genome_db_id} = [];
00348     }
00349     push @{${$hash_members}{$leaf->genome_db_id}}, $leaf;
00350   }
00351 }
00352 
00353 sub _write_tree {
00354   my ($self, $tree) = @_;
00355   no warnings 'recursion';
00356   
00357   # an OrthoXML file must begin with a orthologGroup
00358   if (_is_reliable_duplication($self, $tree)) {
00359     # Goes recursively until the next speciation node
00360     foreach my $child (@{$tree->children()}) {
00361       $self->_write_tree($child);
00362     }
00363   } elsif (not $tree->is_leaf) {
00364     # Can now write the tree
00365     $self->_process($tree);
00366   }
00367   
00368   return;
00369 }
00370 
00371 sub _is_reliable_duplication {
00372   my $self = shift;
00373   my $node = shift;
00374   my $node_type = $node->get_tagvalue('node_type');
00375   return 0 unless defined $node_type;
00376   return 0 if $node_type eq 'speciation';
00377   return 1 if $node_type eq 'gene_split';
00378   return 1 if not $self->possible_orthologs();
00379   my $sis = $node->get_tagvalue('duplication_confidence_score');
00380   return ((defined $sis) and ($sis >= 0.25));
00381 }
00382 
00383 sub _process {
00384   my ($self, $node) = @_;
00385   no warnings 'recursion';
00386 
00387   if(check_ref($node, 'Bio::EnsEMBL::Compara::GeneTreeMember')) {
00388     return $self->_writer->emptyTag("geneRef", "id" => $node->member_id);
00389   }
00390   elsif(check_ref($node, 'Bio::EnsEMBL::Compara::GeneTreeNode')) {
00391     my $tagname = _is_reliable_duplication($self, $node) ? "paralogGroup" : "orthologGroup";
00392 
00393     my $w = $self->_writer();
00394     $w->startTag(
00395       $tagname,
00396       $node->can("stable_id") ? ("id" => $node->stable_id()) : ("id" => $node->node_id()),
00397     );
00398  
00399     $self->_genetreenode_body($node);
00400  
00401     $w->endTag($tagname);
00402     return;
00403   }
00404   my $ref = ref($node);
00405   throw("Cannot process type $ref");
00406 }
00407 
00408 ###### PROCESSORS
00409 
00410 #body writes data
00411 sub _genetreenode_body {
00412   my ($self, $node) = @_;
00413   no warnings 'recursion';
00414   
00415   my $w = $self->_writer();
00416   
00417    # Scores
00418   foreach my $tag (qw(duplication_confidence_score bootstrap)) {
00419     my $value = $node->get_tagvalue($tag);
00420     if (defined $value and $value ne '') {
00421       $w->emptyTag('score', 'id' => $tag, 'value' => $value);
00422     }
00423   }
00424   
00425   # Properties
00426   foreach my $tag (qw(taxon_id taxon_name)) {
00427     my $value = $node->get_tagvalue($tag);
00428     if (defined $value and $value ne '') {
00429       $w->emptyTag('property', 'name' => $tag, 'value' => $value);
00430     }
00431   }
00432 
00433   # dubious_duplication is in another field
00434   if ($node->get_tagvalue('node_type', '') eq 'dubious') {
00435      $w->emptyTag('property', 'name' => 'dubious_duplication', 'value' => 1);
00436   }
00437   
00438   if($node->get_child_count()) {
00439     foreach my $child (@{$node->children()}) {
00440       $self->_process($child);
00441     }
00442     }
00443   
00444   return;
00445 }
00446 
00447 
00448 1;