Archive Ensembl HomeArchive Ensembl Home
HclusterParseOutput.pm
Go to the documentation of this file.
00001 =head1 LICENSE
00002 
00003   Copyright (c) 1999-2012 The European Bioinformatics Institute and
00004   Genome Research Limited.  All rights reserved.
00005 
00006   This software is distributed under a modified Apache license.
00007   For license details, please see
00008 
00009    http://www.ensembl.org/info/about/code_licence.html
00010 
00011 =head1 CONTACT
00012 
00013   Please email comments or questions to the public Ensembl
00014   developers list at <dev@ensembl.org>.
00015 
00016   Questions may also be sent to the Ensembl help desk at
00017   <helpdesk@ensembl.org>.
00018 
00019 =head1 NAME
00020 
00021 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterParseOutput
00022 
00023 =head1 DESCRIPTION
00024 
00025 This is the RunnableDB that parses the output of Hcluster, stores the clusters as trees without internal structure
00026 (each tree will have one root and several leaves) and dataflows the cluster_ids down branch #2.
00027 
00028 =head1 SYNOPSIS
00029 
00030 my $aa = $sdba->get_AnalysisAdaptor;
00031 my $analysis = $aa->fetch_by_logic_name('HclusterParseOutput');
00032 my $rdb = new Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterParseOutput(
00033                          -input_id   => "{'mlss_id'=>40069}",
00034                          -analysis   => $analysis);
00035 
00036 $rdb->fetch_input
00037 $rdb->run;
00038 
00039 =head1 AUTHORSHIP
00040 
00041 Ensembl Team. Individual contributions can be found in the CVS log.
00042 
00043 =head1 MAINTAINER
00044 
00045 $Author: mm14 $
00046 
00047 =head VERSION
00048 
00049 $Revision: 1.9 $
00050 
00051 =head1 APPENDIX
00052 
00053 The rest of the documentation details each of the object methods.
00054 Internal methods are usually preceded with an underscore (_)
00055 
00056 =cut
00057 
00058 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::HclusterParseOutput;
00059 
00060 use strict;
00061 
00062 use Bio::EnsEMBL::Compara::GeneTree;
00063 use Bio::EnsEMBL::Compara::GeneTreeNode;
00064 use Bio::EnsEMBL::Compara::Graph::ConnectedComponents;
00065 use Bio::EnsEMBL::Compara::MethodLinkSpeciesSet;
00066 
00067 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00068 
00069 
00070 
00071 sub run {
00072     my $self = shift @_;
00073 
00074     $self->parse_hclusteroutput;
00075 }
00076 
00077 
00078 sub write_output {
00079     my $self = shift @_;
00080 
00081     $self->dataflow_clusters;
00082 }
00083 
00084 
00085 ##########################################
00086 #
00087 # internal methods
00088 #
00089 ##########################################
00090 
00091 sub parse_hclusteroutput {
00092     my $self = shift;
00093 
00094     my $mlss_id = $self->param('mlss_id') or die "'mlss_id' is an obligatory parameter";
00095 
00096     my $protein_tree_adaptor = $self->compara_dba->get_ProteinTreeAdaptor;
00097 
00098     my $cluster_dir   = $self->param('cluster_dir');
00099     my $filename      = $cluster_dir . '/hcluster.out';
00100 
00101     # Create the clusterset and associate mlss
00102     my $clusterset = new Bio::EnsEMBL::Compara::GeneTree;
00103     $clusterset->tree_type('proteinclusterset');
00104     $clusterset->method_link_species_set_id($mlss_id);
00105     $self->param('clusterset', $clusterset);
00106 
00107     my $clusterset_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00108     $clusterset->root($clusterset_root);
00109     $protein_tree_adaptor->store($clusterset);
00110 
00111     my @allclusters;
00112     $self->param('allclusters', \@allclusters);
00113 
00114     # FIXME: load the entire file in a hash and store in decreasing
00115     # order by cluster size this will make big clusters go first in the
00116     # alignment process, which makes sense since they are going to take
00117     # longer to process anyway
00118     open(FILE, $filename) or die "Could not open '$filename' for reading : $!";
00119     while (<FILE>) {
00120         # 0       0       0       1.000   2       1       697136_68,
00121         # 1       0       39      1.000   3       5       1213317_31,1135561_22,288182_42,426893_62,941130_38,
00122         chomp $_;
00123 
00124         my ($cluster_id, $dummy1, $dummy2, $dummy3, $dummy4, $dummy5, $cluster_list) = split("\t",$_);
00125 
00126         next if ($dummy5 < 2);
00127         $cluster_list =~ s/\,^//;
00128         my @cluster_list = split(",",$cluster_list);
00129 
00130         # If it's a singleton, we don't store it as a protein tree
00131         next if (2 > scalar(@cluster_list));
00132 
00133         my $clusterset_leaf = new Bio::EnsEMBL::Compara::GeneTreeNode;
00134         $clusterset_leaf->no_autoload_children();
00135         $clusterset_root->add_child($clusterset_leaf);
00136 
00137         my $cluster = new Bio::EnsEMBL::Compara::GeneTree;
00138         $cluster->tree_type('proteintree');
00139         $cluster->method_link_species_set_id($mlss_id);
00140 
00141         my $cluster_root = new Bio::EnsEMBL::Compara::GeneTreeNode;
00142         $cluster->root($cluster_root);
00143         $cluster->clusterset_id($cluster_root->node_id);
00144         $cluster_root->tree($cluster);
00145         $clusterset_leaf->add_child($cluster_root);
00146 
00147         foreach my $member_hcluster_id (@cluster_list) {
00148             my ($pmember_id, $genome_db_id) = split("_", $member_hcluster_id);
00149             my $node = new Bio::EnsEMBL::Compara::GeneTreeNode;
00150             $cluster_root->add_child($node);
00151             #leaves are GeneTreeNode objects, bless to make into GeneTreeMember objects
00152             bless $node, "Bio::EnsEMBL::Compara::GeneTreeMember";
00153 
00154             #the building method uses member_id's to reference unique nodes
00155             #which are stored in the node_id value, copy to member_id
00156             $node->member_id($pmember_id);
00157         }
00158 
00159         # Store the cluster:
00160         $protein_tree_adaptor->store($clusterset_leaf);
00161         push @allclusters, $cluster->root_id;
00162 
00163         my $leafcount = scalar(@{$cluster->root->get_all_leaves});
00164         print STDERR "cluster $cluster with $leafcount leaves\n" if $self->debug;
00165         $cluster->store_tag('gene_count', $leafcount);
00166         $cluster_root->disavow_parent();
00167 
00168     }
00169     close FILE;
00170     $clusterset_root->build_leftright_indexing(1);
00171     $protein_tree_adaptor->store($clusterset);
00172     $self->param('clusterset_id', $clusterset_root->node_id);
00173     my $leafcount = scalar(@{$clusterset->root->get_all_leaves});
00174     print STDERR "clusterset $clusterset with $leafcount leaves\n" if $self->debug;
00175 }
00176 
00177 
00178 sub dataflow_clusters {
00179     my $self = shift;
00180 
00181     foreach my $tree_id (@{$self->param('allclusters')}) {
00182         $self->dataflow_output_id({ 'protein_tree_id' => $tree_id, }, 2);
00183     }
00184     $self->input_job->autoflow(0);
00185     $self->dataflow_output_id({ 'clusterset_id' => $self->param('clusterset_id') }, 1);
00186 }
00187 
00188 1;