Archive Ensembl HomeArchive Ensembl Home
GroupGenomesUnderTaxa.pm
Go to the documentation of this file.
00001 
00002 =pod 
00003 
00004 =head1 NAME
00005 
00006 Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::GroupGenomesUnderTaxa
00007 
00008 =head1 DESCRIPTION
00009 
00010 This Runnable takes in a list of internal taxonomic nodes (given by name) and an MLSS_id,
00011 and outputs, for each input taxonomic node, the list of genome_db_ids of the given MLSS_id that fall under that node (restricted to high coverage genomes when 'filter_high_coverage' is set)
00012 
00013 The format of the input_id follows the format of a Perl hash reference.
00014 Example:
00015     { 'mlss_id' => 40069, 'taxlevels' => ['Theria', 'Sauria', 'Tetraodontiformes'] }
00016 
00017 supported keys:
00018     'mlss_id'               => <number>
00019 
00020     'taxlevels'             => <list-of-names>
00021 
00022     'filter_high_coverage'  => 0|1
00023 
00024 =cut
00025 
00026 
00027 package Bio::EnsEMBL::Compara::RunnableDB::ProteinTrees::GroupGenomesUnderTaxa;
00028 
00029 use strict;
00030 
00031 use base ('Bio::EnsEMBL::Compara::RunnableDB::BaseRunnable');
00032 
00033 
# Collect the genome_dbs of the MLSS named by 'mlss_id', optionally keep only
# the high-coverage ones, and group their dbIDs under each taxon in 'taxlevels'.
#
# Parameters read through $self->param():
#   mlss_id              - obligatory; dbID used to fetch the MethodLinkSpeciesSet
#   taxlevels            - obligatory; array ref of taxon names
#   filter_high_coverage - optional; when true, a genome is kept only if the
#                          first 'assembly.coverage_depth' meta value of its
#                          core database is 'high' or '6X'
#
# Result: stores in the 'species_sets' parameter an array ref holding one array
# ref of genome_db_ids per entry of 'taxlevels', in the same order.
#
# Dies when an obligatory parameter is missing, when the MLSS cannot be fetched,
# or (only when filtering) when a core adaptor or its coverage depth is missing.
sub fetch_input {
    my $self = shift @_;

    my $mlss_id     = $self->param('mlss_id')
                        or die "'mlss_id' is an obligatory parameter";

    my $mlss        = $self->compara_dba()->get_MethodLinkSpeciesSetAdaptor->fetch_by_dbID($mlss_id)
                        or die "Could not fetch mlss with dbID=$mlss_id";
    my $species_set = $mlss->species_set;

    # species_set() is handled both as a plain array ref of genome_dbs and as
    # an object exposing genome_dbs().
    my $genome_dbs  = (ref($species_set) eq 'ARRAY') ? $species_set : $species_set->genome_dbs();

    my $filter_high_coverage = $self->param('filter_high_coverage');

    my @selected_gdb_ids = ();

    foreach my $genome_db (@$genome_dbs) {
        if($filter_high_coverage) {
            my $core_adaptor = $genome_db->db_adaptor()
                    or die "Could not connect to core database adaptor";

            # The database name is separated from the message text so the
            # error stays readable.
            my $coverage_depth = $core_adaptor->get_MetaContainer()->list_value_by_key('assembly.coverage_depth')->[0]
                    or die "'assembly.coverage_depth' is not defined in core database's meta table: " . $core_adaptor->dbname;

            if( ($coverage_depth eq 'high') or ($coverage_depth eq '6X')) {
                push @selected_gdb_ids, $genome_db->dbID();
            }
        } else {    # take all of them
            push @selected_gdb_ids, $genome_db->dbID();
        }
    }

    # The selected ids are handed to the SQL helper as one comma-separated string.
    my $selected_gdb_id_string = join(',', @selected_gdb_ids);

    ###

    my $dbc         = $self->compara_dba()->dbc();

    my $taxlevels   = $self->param('taxlevels')
                        or die "'taxlevels' is an obligatory parameter";

    my @species_sets = ();

    foreach my $taxlevel (@$taxlevels) {
        push @species_sets, filter_genomes_by_taxlevel($dbc, $selected_gdb_id_string, $taxlevel);
    }

    $self->param('species_sets', \@species_sets);
}
00081 
00082 
# Dataflow the grouping stored in the 'species_sets' parameter: it is sent as
# the 'species_sets' key of a single output id on branch 2.
sub write_output {
    my ($self) = @_;

    my %output_id = ( 'species_sets' => $self->param('species_sets') );

    return $self->dataflow_output_id( \%output_id, 2 );
}
00090 
00091 
00092 # ------------------------- non-interface subroutines -----------------------------------
00093 
00094 
# Return the genome_db_ids, among a preselected set, whose taxon lies below a
# named taxonomic node. Plain subroutine, not a method.
#
# Arguments:
#   $dbc                    - connection object providing prepare($sql)
#   $selected_gdb_id_string - comma-separated genome_db_ids to choose from
#   $taxlevel               - name of the parent taxon, matched against
#                             ncbi_taxa_name.name
#
# Returns an array ref of genome_db_ids in ascending order; the array is empty
# when no ids were preselected or none fall under the taxon.
sub filter_genomes_by_taxlevel {    # not a method
    my ($dbc, $selected_gdb_id_string, $taxlevel) = @_;

    # An empty id list would produce "IN ()", which is a SQL syntax error, and
    # nothing could match it anyway.
    return [] unless defined($selected_gdb_id_string) && $selected_gdb_id_string =~ /\S/;

    # The taxon name is bound through a placeholder so that names containing
    # quotes cannot break (or alter) the statement.
    my $sql = qq{
        SELECT DISTINCT g.genome_db_id
          FROM ncbi_taxa_name parent_name, ncbi_taxa_node parent_node, ncbi_taxa_node child_node, genome_db g
         WHERE parent_name.name=?
           AND parent_name.taxon_id=parent_node.taxon_id
           AND parent_node.left_index<child_node.left_index
           AND child_node.right_index<=parent_node.right_index
           AND child_node.taxon_id=g.taxon_id
           AND g.genome_db_id in ($selected_gdb_id_string)
      ORDER BY g.genome_db_id
    };

    my $sth = $dbc->prepare($sql);
    $sth->execute($taxlevel);

    my @species_subset = ();

    while(my ($genome_db_id) = $sth->fetchrow_array()) {
        push @species_subset, $genome_db_id;
    }

    # Release the statement handle explicitly once all rows have been read.
    $sth->finish();

    return \@species_subset;
}
00121 
00122 1;