Archive Ensembl HomeArchive Ensembl Home
DumpAllHomologiesOrthoXML.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009    http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::DumpAllHomologiesOrthoXML
00022 
00023 =head1 DESCRIPTION
00024 
00025 This Analysis/RunnableDB is designed to dump all the homologies of a database
00026 in a single file, in the OrthoXML format.
00027 
00028 It requires one parameter:
00029  - compara_db: connection parameters to the Compara database
00030 
00031 The following parameters are optional:
00032  - tree_type: [string] restriction on which trees should be dumped (see the
00033               corresponding field in the gene_tree_root table)
00034  - possible_ortho: [boolean] (default 0) whether or not low confidence
00035                    duplications should be treated as speciations
00036  - file: [string] output file to dump (otherwise: standard output)
00037 
00038 =head1 SYNOPSIS
00039 
00040 standaloneJob.pl Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::DumpAllHomologiesOrthoXML
00041   -compara_db 'mysql://ensro:@compara4:3306/mp12_compara_nctrees_66c'
00042 
00043 =head1 AUTHORSHIP
00044 
00045 Ensembl Team. Individual contributions can be found in the CVS log.
00046 
00047 =head1 MAINTAINER
00048 
00049 $Author: mm14 $
00050 
00051 =head1 VERSION
00052 
00053 $Revision: 1.1 $
00054 
00055 =head1 APPENDIX
00056 
00057 The rest of the documentation details each of the object methods.
00058 Internal methods are usually preceded with an underscore (_)
00059 
00060 =cut
00061 
00062 
00063 package Bio::EnsEMBL::Compara::RunnableDB::GeneTrees::DumpAllHomologiesOrthoXML; 
00064 
00065 use strict;
00066 
00067 use IO::File;
00068 
00069 use Bio::EnsEMBL::ApiVersion;
00070 
00071 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00072 
# Default parameter values for this runnable.
# Returns a hash reference with a single entry: 'ortholog_method_link_id',
# the method_link_id used by run() to select the homologies to dump.
sub param_defaults {
    my %defaults = (
        'ortholog_method_link_id' => 201,
    );
    return \%defaults;
}
00078 
00079 
# Chooses the output handle and stores it in the 'file_handle' parameter.
#
# If the 'file' parameter is defined, it is passed through param_substitute()
# and the resulting path is opened for writing; otherwise standard output is
# used.
#
# Dies if the output file cannot be opened, so that the failure is reported
# here with the offending path instead of surfacing later as a print on an
# undefined handle.
sub fetch_input {
    my ($self) = @_;

    # Defines the file handle
    my $file_handle = *STDOUT;
    if (defined $self->param('file')) {
        my $path = $self->param_substitute($self->param('file'));
        # IO::File->new returns undef on failure and leaves the reason in $!
        $file_handle = IO::File->new($path, 'w')
            or die "Could not open '$path' for writing: $!\n";
    }
    $self->param('file_handle', $file_handle);
}
00090 
00091 
# Escapes the XML special characters (& < > " ') of a value so that it can be
# embedded in a quoted XML attribute. An undefined value yields the empty
# string, which is what printing it directly would have produced.
sub _xml_attr_escape {
    my ($value) = @_;
    return '' unless defined $value;
    $value =~ s/&/&amp;/g;      # must come first, or it would re-escape the entities below
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/"/&quot;/g;
    $value =~ s/'/&apos;/g;
    return $value;
}

# Writes the whole OrthoXML document to the handle stored in 'file_handle'.
#
# Parameters read:
#  - file_handle:             output handle (set by fetch_input)
#  - ortholog_method_link_id: method_link_id of the homologies to dump
#  - id_range:                optional 'MIN-MAX' restriction on homology_id
#                             (passed through param_substitute first)
#
# The document has two parts:
#  1. one <species> element per taxon_id, listing every member found in
#     gene_tree_member as a <gene>;
#  2. a <groups> element with one <orthologGroup> per homology, emitted as
#     soon as two rows sharing a homology_id have been read.
#
# All database values are XML-escaped before being written, and the homology
# query uses bind values instead of string interpolation.
#
# Dies if 'id_range' is defined but is not of the form 'MIN-MAX'.
sub run {
    my ($self) = @_;
    my $HANDLE = $self->param('file_handle');
    my $dbc    = $self->compara_dba->dbc;

    my $version = _xml_attr_escape(software_version());
    print $HANDLE "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n";
    print $HANDLE "<orthoXML xmlns=\"http://orthoXML.org/2011/\" origin=\"Ensembl Compara\" version=\"0.3\" originVersion=\"$version\">\n";

    # Part 1: species and genes. The rows are sorted by taxon_id so that a
    # change of taxon_id marks the start of a new <species> element.
    # NOTE(review): mysql_use_result presumably streams the rows instead of
    # buffering them client-side - see the DBD::mysql documentation.
    my $sql = 'SELECT member.taxon_id, name, member_id, stable_id, assembly, genebuild,source_name FROM gene_tree_member JOIN member USING (member_id) JOIN genome_db USING (genome_db_id) ORDER BY taxon_id, member_id';
    my $sth = $dbc->prepare($sql, {mysql_use_result=>1});
    $sth->execute;
    my $last;
    while (my $row = $sth->fetchrow_hashref) {
        if (not defined $last or $last ne $row->{taxon_id}) {
            # Close the previous species before opening the next one
            print $HANDLE "</genes></database></species>\n" if defined $last;
            $last = $row->{taxon_id};
            print $HANDLE "<species name=\"", _xml_attr_escape($row->{name}),
                "\" NCBITaxId=\"", _xml_attr_escape($last),
                "\"><database name=\"Unknown\" version=\"", _xml_attr_escape($row->{assembly}),
                "/", _xml_attr_escape($row->{genebuild}), "\"><genes>\n";
        }
        my $id_attribute = $row->{source_name} eq 'ENSEMBLPEP' ? "protId" : "transcriptId";
        print $HANDLE "\t<gene id=\"", _xml_attr_escape($row->{member_id}),
            "\" ", $id_attribute, "=\"", _xml_attr_escape($row->{stable_id}), "\"/>\n";
    }
    print $HANDLE "</genes></database></species>\n" if defined $last;
    print $HANDLE "<groups>\n";

    # Part 2: homologies. Values are bound rather than interpolated so that
    # parameter contents can never alter the structure of the query.
    $sql = 'SELECT homology_id, peptide_member_id, homology.description FROM homology_member JOIN homology USING (homology_id) JOIN method_link_species_set USING (method_link_species_set_id) WHERE method_link_id = ?';
    my @bind_values = ($self->param('ortholog_method_link_id'));
    if (defined $self->param('id_range')) {
        my $range = $self->param_substitute($self->param('id_range'));
        $range = '' unless defined $range;
        my ($min_id, $max_id) = $range =~ /\A\s*(\d+)\s*-\s*(\d+)\s*\z/
            or die "Invalid id_range '$range': expected 'MIN-MAX' with non-negative integers\n";
        $sql .= ' AND homology_id BETWEEN ? AND ?';
        push @bind_values, $min_id, $max_id;
    }
    $sth = $dbc->prepare($sql, {mysql_use_result=>1});

    $sth->execute(@bind_values);

    # The query has no ORDER BY, so the two rows of a homology are not
    # necessarily adjacent: the first member is parked in %seen until its
    # partner shows up. A homology for which only one row is returned is
    # never written.
    my %seen;
    while (my $row = $sth->fetchrow_hashref) {
        my $homology_id = $row->{homology_id};
        if (exists $seen{$homology_id}) {
            print $HANDLE "<orthologGroup id=\"", _xml_attr_escape($homology_id),
                "\"><property name=\"homology_description\" value=\"", _xml_attr_escape($row->{description}),
                "\" /><geneRef id=\"", _xml_attr_escape($row->{peptide_member_id}),
                "\" /><geneRef id=\"", _xml_attr_escape($seen{$homology_id}),
                "\" /></orthologGroup>\n";
            delete $seen{$homology_id};
        } else {
            $seen{$homology_id} = $row->{peptide_member_id};
        }
    }

    print $HANDLE "</groups>\n";
    print $HANDLE "</orthoXML>";

}
00138 
# Finalises the output handle stored in the 'file_handle' parameter.
#
# A handle opened on a file is closed. If the handle is standard output, it
# is only flushed: STDOUT belongs to the whole process and closing it would
# break anything that prints after this job.
#
# Dies if the flush or the close fails, because buffered write errors
# (e.g. a full disk) are only reported at that point.
# Returns 1 on success.
sub write_output {
    my ($self) = @_;
    my $file_handle = $self->param('file_handle');

    my $stdout_fileno = fileno(STDOUT);
    my $handle_fileno = fileno($file_handle);
    if (defined $stdout_fileno and defined $handle_fileno and $handle_fileno == $stdout_fileno) {
        $file_handle->flush() or die "Could not flush the standard output: $!\n";
        return 1;
    }

    $file_handle->close() or die "Could not close the output file: $!\n";
    return 1;
}
00143 
00144 
00145 1;